From 44a5ecd6dc8e03aa47d3bd67c24ba1b5f48bb0fa Mon Sep 17 00:00:00 2001 From: "datadog-datadog-prod-us1-2[bot]" <261164178+datadog-datadog-prod-us1-2[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:33:24 +0000 Subject: [PATCH 1/8] Add wc builtin with -l -w -c -m -L flags Co-authored-by: AlexandreYang <49917914+AlexandreYang@users.noreply.github.com> --- interp/builtin_wc_pentest_test.go | 206 +++++++++ interp/builtins/wc/wc.go | 359 ++++++++++++++ interp/builtins/wc/wc_gnu_compat_test.go | 172 +++++++ interp/builtins/wc/wc_test.go | 436 ++++++++++++++++++ interp/builtins/wc/wc_unix_test.go | 35 ++ interp/register_builtins.go | 2 + tests/import_allowlist_test.go | 8 + tests/scenarios/cmd/wc/bytes/empty_stdin.yaml | 14 + tests/scenarios/cmd/wc/bytes/single_byte.yaml | 14 + tests/scenarios/cmd/wc/chars/basic.yaml | 14 + tests/scenarios/cmd/wc/default/basic.yaml | 14 + .../scenarios/cmd/wc/default/empty_file.yaml | 14 + .../scenarios/cmd/wc/default/empty_stdin.yaml | 14 + .../scenarios/cmd/wc/default/single_file.yaml | 14 + .../cmd/wc/errors/files0_from_rejected.yaml | 10 + .../scenarios/cmd/wc/errors/missing_file.yaml | 10 + .../scenarios/cmd/wc/errors/unknown_flag.yaml | 10 + .../wc/hardening/double_dash_separator.yaml | 14 + tests/scenarios/cmd/wc/lines/empty_stdin.yaml | 14 + .../cmd/wc/lines/no_trailing_newline.yaml | 14 + tests/scenarios/cmd/wc/lines/one_newline.yaml | 14 + .../scenarios/cmd/wc/lines/two_newlines.yaml | 14 + .../cmd/wc/max_line_length/basic.yaml | 14 + .../max_line_length/no_trailing_newline.yaml | 14 + .../cmd/wc/max_line_length/three_lines.yaml | 14 + .../cmd/wc/multiple_files/total_line.yaml | 16 + .../scenarios/cmd/wc/stdin/dash_explicit.yaml | 14 + tests/scenarios/cmd/wc/stdin/implicit.yaml | 14 + .../scenarios/cmd/wc/words/across_lines.yaml | 14 + tests/scenarios/cmd/wc/words/empty_stdin.yaml | 14 + tests/scenarios/cmd/wc/words/single_word.yaml | 14 + tests/scenarios/cmd/wc/words/two_words.yaml | 14 + 32 files changed, 1558 
insertions(+) create mode 100644 interp/builtin_wc_pentest_test.go create mode 100644 interp/builtins/wc/wc.go create mode 100644 interp/builtins/wc/wc_gnu_compat_test.go create mode 100644 interp/builtins/wc/wc_test.go create mode 100644 interp/builtins/wc/wc_unix_test.go create mode 100644 tests/scenarios/cmd/wc/bytes/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/bytes/single_byte.yaml create mode 100644 tests/scenarios/cmd/wc/chars/basic.yaml create mode 100644 tests/scenarios/cmd/wc/default/basic.yaml create mode 100644 tests/scenarios/cmd/wc/default/empty_file.yaml create mode 100644 tests/scenarios/cmd/wc/default/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/default/single_file.yaml create mode 100644 tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml create mode 100644 tests/scenarios/cmd/wc/errors/missing_file.yaml create mode 100644 tests/scenarios/cmd/wc/errors/unknown_flag.yaml create mode 100644 tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml create mode 100644 tests/scenarios/cmd/wc/lines/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml create mode 100644 tests/scenarios/cmd/wc/lines/one_newline.yaml create mode 100644 tests/scenarios/cmd/wc/lines/two_newlines.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/basic.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/three_lines.yaml create mode 100644 tests/scenarios/cmd/wc/multiple_files/total_line.yaml create mode 100644 tests/scenarios/cmd/wc/stdin/dash_explicit.yaml create mode 100644 tests/scenarios/cmd/wc/stdin/implicit.yaml create mode 100644 tests/scenarios/cmd/wc/words/across_lines.yaml create mode 100644 tests/scenarios/cmd/wc/words/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/words/single_word.yaml create mode 100644 tests/scenarios/cmd/wc/words/two_words.yaml diff --git 
a/interp/builtin_wc_pentest_test.go b/interp/builtin_wc_pentest_test.go new file mode 100644 index 00000000..98cbee6e --- /dev/null +++ b/interp/builtin_wc_pentest_test.go @@ -0,0 +1,206 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package interp_test + +import ( + "bytes" + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "mvdan.cc/sh/v3/syntax" + + "github.com/DataDog/rshell/interp" +) + +func wcRun(t *testing.T, script, dir string) (string, string, int) { + t.Helper() + return wcRunCtx(context.Background(), t, script, dir) +} + +func wcRunCtx(ctx context.Context, t *testing.T, script, dir string) (string, string, int) { + t.Helper() + parser := syntax.NewParser() + prog, err := parser.Parse(strings.NewReader(script), "") + require.NoError(t, err) + + var outBuf, errBuf bytes.Buffer + opts := []interp.RunnerOption{ + interp.StdIO(nil, &outBuf, &errBuf), + interp.AllowedPaths([]string{dir}), + } + + runner, err := interp.New(opts...) 
+ require.NoError(t, err) + defer runner.Close() + + if dir != "" { + runner.Dir = dir + } + + err = runner.Run(ctx, prog) + exitCode := 0 + if err != nil { + var es interp.ExitStatus + if errors.As(err, &es) { + exitCode = int(es) + } else if ctx.Err() == nil { + t.Fatalf("unexpected error: %v", err) + } + } + return outBuf.String(), errBuf.String(), exitCode +} + +func wcWriteFile(t *testing.T, dir, name, content string) { + t.Helper() + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) +} + +// --- Flag and argument injection --- + +func TestWcPentestUnknownFlags(t *testing.T) { + dir := t.TempDir() + for _, flag := range []string{"-f", "--follow", "--no-such-flag", "--files0-from=foo"} { + _, stderr, code := wcRun(t, "wc "+flag, dir) + assert.Equal(t, 1, code, "flag: %s", flag) + assert.Contains(t, stderr, "wc:", "flag: %s", flag) + } +} + +func TestWcPentestDoubleDashFlagLikeFile(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "-v", "hello\n") + stdout, _, code := wcRun(t, "wc -- -v", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "-v") +} + +func TestWcPentestMultipleStdin(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", "hello\n") + stdout, _, code := wcRun(t, "cat file.txt | wc - -", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "total") +} + +// --- Path edge cases --- + +func TestWcPentestNonexistentFile(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := wcRun(t, "wc nonexistent.txt", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} + +func TestWcPentestEmptyFilename(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := wcRun(t, "wc ''", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} + +// --- Special files --- + +func TestWcPentestDevNull(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "empty.txt", "") + ctx, cancel := 
context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc empty.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "0") +} + +// --- Context cancellation --- + +func TestWcPentestContextCancelled(t *testing.T) { + dir := t.TempDir() + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, _, _ = wcRunCtx(ctx, t, "wc", dir) +} + +func TestWcPentestContextTimeout(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", strings.Repeat("hello\n", 10000)) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "10000") +} + +// --- Large input --- + +func TestWcPentestLargeFile(t *testing.T) { + dir := t.TempDir() + content := strings.Repeat("word word word word word\n", 40000) + wcWriteFile(t, dir, "large.txt", content) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc -l large.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "40000") +} + +// --- Many files (FD leak check) --- + +func TestWcPentestManyFiles(t *testing.T) { + dir := t.TempDir() + var args []string + for i := 0; i < 50; i++ { + name := filepath.Join(dir, strings.Replace(filepath.Base(t.Name()), "/", "_", -1)+"_"+string(rune('a'+i%26))+string(rune('0'+i/26))+".txt") + require.NoError(t, os.WriteFile(name, []byte("x\n"), 0644)) + args = append(args, filepath.Base(name)) + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc "+strings.Join(args, " "), dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "total") +} + +// --- Edge case: file with only newlines --- + +func TestWcPentestOnlyNewlines(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", 
strings.Repeat("\n", 100)) + stdout, _, code := wcRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "100") + assert.Contains(t, stdout, " 0") +} + +// --- Edge case: long line --- + +func TestWcPentestLongLine(t *testing.T) { + dir := t.TempDir() + longLine := strings.Repeat("x", 1024*1024) + "\n" + wcWriteFile(t, dir, "file.txt", longLine) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "1048576") +} + +// --- Flag expansion in loop --- + +func TestWcPentestFlagExpansion(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", "hello\n") + _, stderr, code := wcRun(t, "for flag in --follow; do wc $flag file.txt; done", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go new file mode 100644 index 00000000..1d61d641 --- /dev/null +++ b/interp/builtins/wc/wc.go @@ -0,0 +1,359 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +// Package wc implements the wc builtin command. +// +// wc — print newline, word, and byte counts for each file +// +// Usage: wc [OPTION]... [FILE]... +// +// Print newline, word, and byte counts for each FILE, and a total line +// if more than one FILE is specified. A word is a non-zero-length sequence +// of characters delimited by white space. With no FILE, or when FILE is -, +// read standard input. +// +// When no flags are given, -l, -w, and -c are assumed (lines, words, bytes). +// +// Accepted flags: +// +// -l, --lines +// Print the newline count. +// +// -w, --words +// Print the word count. +// +// -c, --bytes +// Print the byte count. 
+//
+//	-m, --chars
+//	    Print the character count. In a multibyte locale, the number of
+//	    characters may differ from the number of bytes.
+//
+//	-L, --max-line-length
+//	    Print the length of the longest line, measured in display columns
+//	    (tabs advance to the next multiple of 8).
+//
+//	-h, --help
+//	    Print this usage message to stdout and exit 0.
+//
+// Output columns always appear in a fixed order: lines, words, chars,
+// bytes, max-line-length. Only the requested columns are shown. Column
+// widths are right-justified to the width of the largest count across
+// all files (including the total line, if any).
+//
+// Exit codes:
+//
+//	0 All files processed successfully.
+//	1 At least one error occurred (missing file, invalid argument, etc.).
+//
+// Memory safety:
+//
+//	Input is read in fixed-size chunks (32 KiB) and counted as it
+//	streams. Lines are never buffered, so memory use does not depend on
+//	line length. The per-file loop and the per-chunk read loop check
+//	ctx.Err() on every iteration to honour the shell's execution
+//	timeout.
+package wc
+
+import (
+	"context"
+	"io"
+	"os"
+	"strconv"
+	"unicode/utf8"
+
+	"github.com/spf13/pflag"
+
+	"github.com/DataDog/rshell/interp/builtins"
+)
+
+// Cmd is the wc builtin command descriptor.
+var Cmd = builtins.Command{Name: "wc", Run: run} + +const chunkSize = 32 * 1024 // 32 KiB read buffer +const stdinMinWidth = 7 // GNU wc minimum column width for stdin + +type counts struct { + lines int64 + words int64 + chars int64 + bytes int64 + maxLineLen int64 +} + +type options struct { + showLines bool + showWords bool + showBytes bool + showChars bool + showMaxLineLen bool +} + +func run(ctx context.Context, callCtx *builtins.CallContext, args []string) builtins.Result { + fs := pflag.NewFlagSet("wc", pflag.ContinueOnError) + fs.SetOutput(io.Discard) + + help := fs.BoolP("help", "h", false, "print usage and exit") + lines := fs.BoolP("lines", "l", false, "print the newline counts") + words := fs.BoolP("words", "w", false, "print the word counts") + bytesFlag := fs.BoolP("bytes", "c", false, "print the byte counts") + chars := fs.BoolP("chars", "m", false, "print the character counts") + maxLineLen := fs.BoolP("max-line-length", "L", false, "print the maximum display width") + + if err := fs.Parse(args); err != nil { + callCtx.Errf("wc: %v\n", err) + return builtins.Result{Code: 1} + } + + if *help { + callCtx.Out("Usage: wc [OPTION]... 
[FILE]...\n") + callCtx.Out("Print newline, word, and byte counts for each FILE.\n") + callCtx.Out("With no FILE, or when FILE is -, read standard input.\n\n") + fs.SetOutput(callCtx.Stdout) + fs.PrintDefaults() + return builtins.Result{} + } + + opts := options{ + showLines: *lines, + showWords: *words, + showBytes: *bytesFlag, + showChars: *chars, + showMaxLineLen: *maxLineLen, + } + + if !opts.showLines && !opts.showWords && !opts.showBytes && !opts.showChars && !opts.showMaxLineLen { + opts.showLines = true + opts.showWords = true + opts.showBytes = true + } + + files := fs.Args() + stdinImplicit := len(files) == 0 + if stdinImplicit { + files = []string{"-"} + } + + hasStdin := stdinImplicit + if !hasStdin { + for _, f := range files { + if f == "-" { + hasStdin = true + break + } + } + } + + var total counts + var failed bool + + type fileResult struct { + name string + c counts + } + results := make([]fileResult, 0, len(files)) + + for _, file := range files { + if ctx.Err() != nil { + break + } + c, err := countFile(ctx, callCtx, file) + if err != nil { + name := file + if file == "-" { + name = "standard input" + } + callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err)) + failed = true + if c == (counts{}) { + continue + } + } + results = append(results, fileResult{name: file, c: c}) + total.lines += c.lines + total.words += c.words + total.chars += c.chars + total.bytes += c.bytes + if c.maxLineLen > total.maxLineLen { + total.maxLineLen = c.maxLineLen + } + } + + width := fieldWidth(total, opts) + if hasStdin && width < stdinMinWidth { + width = stdinMinWidth + } + + for _, fr := range results { + name := fr.name + if name == "-" && stdinImplicit { + name = "" + } + printCounts(callCtx, fr.c, opts, width, name) + } + + if len(files) > 1 { + printCounts(callCtx, total, opts, width, "total") + } + + if failed { + return builtins.Result{Code: 1} + } + return builtins.Result{} +} + +func countFile(ctx context.Context, callCtx *builtins.CallContext, 
path string) (counts, error) { + var rc io.ReadCloser + if path == "-" { + if callCtx.Stdin == nil { + return counts{}, nil + } + rc = io.NopCloser(callCtx.Stdin) + } else { + f, err := callCtx.OpenFile(ctx, path, os.O_RDONLY, 0) + if err != nil { + return counts{}, err + } + rc = f + } + defer rc.Close() + return countReader(ctx, rc) +} + +func countReader(ctx context.Context, r io.Reader) (counts, error) { + buf := make([]byte, chunkSize) + var c counts + var inWord bool + var lineLen int64 + var carry [utf8.UTFMax - 1]byte + var carryN int + + for { + if ctx.Err() != nil { + return c, ctx.Err() + } + n, err := r.Read(buf[carryN:]) + if carryN > 0 { + copy(buf, carry[:carryN]) + n += carryN + carryN = 0 + } + if n > 0 { + chunk := buf[:n] + c.bytes += int64(n) + + // Handle incomplete UTF-8 at end of chunk. + tail := 0 + if !utf8.Valid(chunk) { + for tail = 1; tail <= 3 && tail < n; tail++ { + if utf8.Valid(chunk[:n-tail]) { + break + } + } + if tail > 0 && tail < n { + carryN = copy(carry[:], chunk[n-tail:]) + chunk = chunk[:n-tail] + } else { + tail = 0 + } + } + c.chars += int64(utf8.RuneCount(chunk)) + c.bytes -= int64(carryN) + + for _, b := range buf[:n] { + if b == '\n' { + c.lines++ + if lineLen > c.maxLineLen { + c.maxLineLen = lineLen + } + lineLen = 0 + inWord = false + } else if b == '\r' { + lineLen = 0 + inWord = false + } else if b == '\t' { + lineLen = (lineLen/8 + 1) * 8 + inWord = false + } else if b == ' ' || b == '\v' || b == '\f' { + lineLen++ + inWord = false + } else { + if !inWord { + c.words++ + inWord = true + } + if b&0xC0 != 0x80 { + lineLen++ + } + } + } + } + if err == io.EOF { + if carryN > 0 { + c.chars += int64(utf8.RuneCount(carry[:carryN])) + c.bytes += int64(carryN) + carryN = 0 + } + break + } + if err != nil { + return c, err + } + } + if lineLen > c.maxLineLen { + c.maxLineLen = lineLen + } + return c, nil +} + +func fieldWidth(total counts, opts options) int { + max := int64(0) + if opts.showLines && total.lines > max { + 
max = total.lines + } + if opts.showWords && total.words > max { + max = total.words + } + if opts.showChars && total.chars > max { + max = total.chars + } + if opts.showBytes && total.bytes > max { + max = total.bytes + } + if opts.showMaxLineLen && total.maxLineLen > max { + max = total.maxLineLen + } + w := len(strconv.FormatInt(max, 10)) + return w +} + +func printCounts(callCtx *builtins.CallContext, c counts, opts options, width int, name string) { + first := true + printField := func(val int64) { + if first { + callCtx.Outf("%*d", width, val) + first = false + } else { + callCtx.Outf(" %*d", width, val) + } + } + if opts.showLines { + printField(c.lines) + } + if opts.showWords { + printField(c.words) + } + if opts.showChars { + printField(c.chars) + } + if opts.showBytes { + printField(c.bytes) + } + if opts.showMaxLineLen { + printField(c.maxLineLen) + } + if name != "" { + callCtx.Outf(" %s", name) + } + callCtx.Out("\n") +} diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go new file mode 100644 index 00000000..90966364 --- /dev/null +++ b/interp/builtins/wc/wc_gnu_compat_test.go @@ -0,0 +1,172 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package wc_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestGNUCompatDefaultEmpty — no flags on empty input. +// +// GNU command: printf ” | gwc +// Expected: " 0 0 0\n" +func TestGNUCompatDefaultEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "empty.txt", "") + stdout, _, code := cmdRun(t, "wc empty.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 0 0 empty.txt\n", stdout) +} + +// TestGNUCompatDefaultBasic — default counts on "a b\nc\n". 
+// +// GNU command: printf 'a b\nc\n' | gwc +// Expected: " 2 3 6\n" +func TestGNUCompatDefaultBasic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 6 file.txt\n", stdout) +} + +// TestGNUCompatLinesCount — -l on input with 2 newlines. +// +// GNU command: printf 'x\ny\n' | gwc -l +// Expected: "2\n" +func TestGNUCompatLinesCount(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x\ny\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// TestGNUCompatLinesNoNewline — -l on input with no newline. +// +// GNU command: printf 'x y' | gwc -l +// Expected: "0\n" +func TestGNUCompatLinesNoNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +// TestGNUCompatWordsEmpty — -w on empty. +// +// GNU command: printf ” | gwc -w +// Expected: "0\n" +func TestGNUCompatWordsEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +// TestGNUCompatWordsMulti — -w on "x y\nz". +// +// GNU command: printf 'x y\nz' | gwc -w +// Expected: "3\n" +func TestGNUCompatWordsMulti(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y\nz") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +// TestGNUCompatBytesCount — -c on "x". 
+// +// GNU command: printf 'x' | gwc -c +// Expected: "1\n" +func TestGNUCompatBytesCount(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +// TestGNUCompatMaxLineLen — -L on "1\n12\n". +// +// GNU command: printf '1\n12\n' | gwc -L +// Expected: "2\n" +func TestGNUCompatMaxLineLen(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "1\n12\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// TestGNUCompatMaxLineLenLastLine — -L on "\n123456" (no trailing newline). +// +// GNU command: printf '\n123456' | gwc -L +// Expected: "6\n" +func TestGNUCompatMaxLineLenLastLine(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\n123456") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +// TestGNUCompatMultipleFiles — two files with total line. +// +// GNU command: gwc a.txt b.txt +// a.txt = "hello\n" (1 line, 1 word, 6 bytes) +// b.txt = "world foo\n" (1 line, 2 words, 10 bytes) +// Expected: +// +// " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n" +func TestGNUCompatMultipleFiles(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", "hello\n") + writeFile(t, dir, "b.txt", "world foo\n") + stdout, _, code := cmdRun(t, "wc a.txt b.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n", stdout) +} + +// TestGNUCompatCharsMultibyte — -m on "café\n". 
+// +// GNU command: printf 'café\n' | gwc -m +// Expected: "5\n" (5 chars: c, a, f, é, \n) +func TestGNUCompatCharsMultibyte(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +// TestGNUCompatControlCharIsWord — control byte \x01 counts as a word. +// +// GNU command: printf '\x01\n' | gwc -w +// Expected: "1\n" +func TestGNUCompatControlCharIsWord(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\x01\n") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +// TestGNUCompatRejectedFlag — unknown flag exits 1. +// +// GNU command: gwc --follow +// Expected: exit 1, stderr contains "wc:" +func TestGNUCompatRejectedFlag(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc --follow", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go new file mode 100644 index 00000000..50a8fd9d --- /dev/null +++ b/interp/builtins/wc/wc_test.go @@ -0,0 +1,436 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package wc_test + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func runScript(t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { + t.Helper() + return testutil.RunScript(t, script, dir, opts...) 
+} + +func runScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { + t.Helper() + return testutil.RunScriptCtx(ctx, t, script, dir, opts...) +} + +func cmdRun(t *testing.T, script, dir string) (string, string, int) { + t.Helper() + return runScript(t, script, dir, interp.AllowedPaths([]string{dir})) +} + +func writeFile(t *testing.T, dir, name, content string) string { + t.Helper() + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) + return name +} + +// --- Default mode (lines, words, bytes) --- + +func TestWcDefaultEmptyStdin(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "empty.txt", "") + stdout, _, code := cmdRun(t, "wc empty.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 0 0 empty.txt\n", stdout) +} + +func TestWcDefaultBasic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 6 file.txt\n", stdout) +} + +func TestWcDefaultNoTrailingNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello world") + stdout, _, code := cmdRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 0 2 11 file.txt\n", stdout) +} + +// --- Lines --- + +func TestWcLinesEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcLinesNoNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcLinesOneNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + 
assert.Equal(t, "1 file.txt\n", stdout) +} + +func TestWcLinesTwoNewlines(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x\ny\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +func TestWcLinesLongForm(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\nb\nc\n") + stdout, _, code := cmdRun(t, "wc --lines file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +// --- Words --- + +func TestWcWordsEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcWordsSingle(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +func TestWcWordsMultiple(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y\nz") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +func TestWcWordsControlChar(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\x01\n") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +// --- Bytes --- + +func TestWcBytesEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcBytesSingle(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +func TestWcBytesMulti(t *testing.T) { + dir := t.TempDir() + 
writeFile(t, dir, "file.txt", "hello\n") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +// --- Chars --- + +func TestWcCharsASCII(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +func TestWcCharsMultibyte(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +func TestWcBytesMultibyte(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +func TestWcCharsAndBytes(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -cm file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 6 file.txt\n", stdout) +} + +// --- Max line length --- + +func TestWcMaxLineLenBasic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "1\n12\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +func TestWcMaxLineLenThreeLines(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "1\n123\n1\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +func TestWcMaxLineLenNoTrailingNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\n123456") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +func TestWcMaxLineLenEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, 
code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +// --- Multiple files --- + +func TestWcMultipleFiles(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", "hello\n") + writeFile(t, dir, "b.txt", "world foo\n") + stdout, _, code := cmdRun(t, "wc a.txt b.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n", stdout) +} + +func TestWcMultipleFilesPartialFailure(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", "hello\n") + stdout, stderr, code := cmdRun(t, "wc a.txt missing.txt", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stdout, "a.txt") + assert.Contains(t, stdout, "total") + assert.Contains(t, stderr, "wc:") +} + +// --- Stdin --- + +func TestWcStdinImplicit(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "cat file.txt | wc", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 2 3 6\n", stdout) +} + +func TestWcStdinDash(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "cat file.txt | wc -", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 2 3 6 -\n", stdout) +} + +func TestWcNilStdin(t *testing.T) { + dir := t.TempDir() + stdout, _, code := runScript(t, "wc", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 0 0 0\n", stdout) +} + +// --- Help --- + +func TestWcHelp(t *testing.T) { + dir := t.TempDir() + stdout, _, code := cmdRun(t, "wc --help", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "Usage:") +} + +func TestWcHelpShort(t *testing.T) { + dir := t.TempDir() + stdout, _, code := cmdRun(t, "wc -h", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "Usage:") +} + +// --- Error cases --- + +func TestWcMissingFile(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, "wc nonexistent.txt", dir) + assert.Equal(t, 1, code) + 
assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} + +func TestWcUnknownFlag(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc --definitely-invalid", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +func TestWcFiles0FromRejected(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc --files0-from=foo", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +func TestWcDirectory(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc .", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +// --- Hardening --- + +func TestWcDoubleDash(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\n") + stdout, _, code := cmdRun(t, "wc -- file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 1 6 file.txt\n", stdout) +} + +func TestWcContextCancellation(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", strings.Repeat("x\n", 100)) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, code := runScriptCtx(ctx, t, "wc file.txt", dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) +} + +func TestWcPipeInput(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "alpha\nbeta\ngamma\n") + stdout, _, code := cmdRun(t, "cat file.txt | wc -l", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 3\n", stdout) +} + +// --- Combined flags --- + +func TestWcAllFlags(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc -lwmcL file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 6 6 3 file.txt\n", stdout) +} + +func TestWcLinesAndWords(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc -lw file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 file.txt\n", stdout) +} + +// --- Width 
formatting --- + +func TestWcWidthDeterminedByTotal(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", strings.Repeat("word ", 20)+"\n") + writeFile(t, dir, "b.txt", "x\n") + stdout, _, code := cmdRun(t, "wc -w a.txt b.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "total\n") + lines := strings.Split(strings.TrimSpace(stdout), "\n") + assert.Equal(t, 3, len(lines)) +} + +// --- Max line length: tab and CR --- + +func TestWcMaxLineLenTab(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\tb\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "9 file.txt\n", stdout) +} + +func TestWcMaxLineLenCR(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\rworld\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +func TestWcCRLFLineCount(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\r\nb\r\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// --- Binary / non-UTF8 input --- + +func TestWcBinaryInput(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.bin", string([]byte{0x00, 0xFF, 0xFE, 0x0A, 0x41})) + stdout, _, code := cmdRun(t, "wc file.bin", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "file.bin") + assert.Equal(t, 0, code) +} + +// --- Multibyte chars --- + +func TestWcCharsMultibyteEmoji(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hi 💐\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} diff --git a/interp/builtins/wc/wc_unix_test.go b/interp/builtins/wc/wc_unix_test.go new file mode 100644 index 00000000..7882ae13 --- /dev/null +++ b/interp/builtins/wc/wc_unix_test.go @@ -0,0 +1,35 @@ +// Unless explicitly stated otherwise 
all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build unix + +package wc_test + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestWcSymlinkToFile(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "target.txt", "hello\n") + require.NoError(t, os.Symlink("target.txt", filepath.Join(dir, "link.txt"))) + stdout, _, code := cmdRun(t, "wc link.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 1 6 link.txt\n", stdout) +} + +func TestWcDanglingSymlink(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.Symlink("nonexistent", filepath.Join(dir, "dangle.txt"))) + stdout, stderr, code := cmdRun(t, "wc dangle.txt", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/register_builtins.go b/interp/register_builtins.go index f6ff973d..d78d84f9 100644 --- a/interp/register_builtins.go +++ b/interp/register_builtins.go @@ -17,6 +17,7 @@ import ( falsecmd "github.com/DataDog/rshell/interp/builtins/false" "github.com/DataDog/rshell/interp/builtins/head" truecmd "github.com/DataDog/rshell/interp/builtins/true" + "github.com/DataDog/rshell/interp/builtins/wc" ) var registerOnce sync.Once @@ -32,6 +33,7 @@ func registerBuiltins() { falsecmd.Cmd, head.Cmd, truecmd.Cmd, + wc.Cmd, } { builtins.Register(cmd.Name, cmd.Run) } diff --git a/tests/import_allowlist_test.go b/tests/import_allowlist_test.go index 3ba7053a..73a4cd86 100644 --- a/tests/import_allowlist_test.go +++ b/tests/import_allowlist_test.go @@ -60,6 +60,14 @@ var builtinAllowedSymbols = []string{ "strconv.Atoi", // strconv.ParseInt — string-to-int conversion with base/bit-size; pure function, no I/O. 
"strconv.ParseInt", + // strconv.FormatInt — int-to-string conversion; pure function, no I/O. + "strconv.FormatInt", + // unicode/utf8.RuneCount — counts UTF-8 runes in a byte slice; pure function, no I/O. + "unicode/utf8.RuneCount", + // unicode/utf8.UTFMax — maximum number of bytes in a UTF-8 encoding; constant, no I/O. + "unicode/utf8.UTFMax", + // unicode/utf8.Valid — checks if a byte slice is valid UTF-8; pure function, no I/O. + "unicode/utf8.Valid", } // permanentlyBanned lists packages that may never be imported by builtin diff --git a/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml b/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml new file mode 100644 index 00000000..f1a19458 --- /dev/null +++ b/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a0 +description: wc -c on empty file outputs 0. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -c empty.txt +expect: + stdout: "0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/bytes/single_byte.yaml b/tests/scenarios/cmd/wc/bytes/single_byte.yaml new file mode 100644 index 00000000..a4f69956 --- /dev/null +++ b/tests/scenarios/cmd/wc/bytes/single_byte.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a3 +description: wc -c on single byte input outputs 1. +setup: + files: + - path: file.txt + content: "x" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -c file.txt +expect: + stdout: "1 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/chars/basic.yaml b/tests/scenarios/cmd/wc/chars/basic.yaml new file mode 100644 index 00000000..25aaceb6 --- /dev/null +++ b/tests/scenarios/cmd/wc/chars/basic.yaml @@ -0,0 +1,14 @@ +# Derived from standard POSIX wc -m behavior +description: wc -m counts characters (bytes for ASCII). 
+setup: + files: + - path: file.txt + content: "hello\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -m file.txt +expect: + stdout: "6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/basic.yaml b/tests/scenarios/cmd/wc/default/basic.yaml new file mode 100644 index 00000000..a52874db --- /dev/null +++ b/tests/scenarios/cmd/wc/default/basic.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test b1 +description: wc default counts lines, words, bytes. +setup: + files: + - path: file.txt + content: "a b\nc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc file.txt +expect: + stdout: "2 3 6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/empty_file.yaml b/tests/scenarios/cmd/wc/default/empty_file.yaml new file mode 100644 index 00000000..5b00343c --- /dev/null +++ b/tests/scenarios/cmd/wc/default/empty_file.yaml @@ -0,0 +1,14 @@ +# Derived from uutils test_file_empty +description: wc on an empty file shows all zeros. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc empty.txt +expect: + stdout: "0 0 0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/empty_stdin.yaml b/tests/scenarios/cmd/wc/default/empty_stdin.yaml new file mode 100644 index 00000000..5249634e --- /dev/null +++ b/tests/scenarios/cmd/wc/default/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test b0 +description: wc on an empty file outputs all zeros (file-based stand-in for the empty-stdin case). 
+setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc empty.txt +expect: + stdout: "0 0 0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/single_file.yaml b/tests/scenarios/cmd/wc/default/single_file.yaml new file mode 100644 index 00000000..f6c1f873 --- /dev/null +++ b/tests/scenarios/cmd/wc/default/single_file.yaml @@ -0,0 +1,14 @@ +# Derived from uutils test_single_default +description: wc with a single file shows lines words bytes and filename. +setup: + files: + - path: file.txt + content: "alpha\nbeta\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc file.txt +expect: + stdout: " 2 2 11 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml new file mode 100644 index 00000000..d74a82d9 --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml @@ -0,0 +1,10 @@ +# Derived from GTFOBins safety requirement +description: wc rejects --files0-from flag (security risk). +input: + allowed_paths: ["$DIR"] + script: |+ + wc --files0-from=foo +expect: + stdout: "" + stderr_contains: ["wc:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/wc/errors/missing_file.yaml b/tests/scenarios/cmd/wc/errors/missing_file.yaml new file mode 100644 index 00000000..5cc5aeb4 --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/missing_file.yaml @@ -0,0 +1,10 @@ +# Derived from uutils test_read_from_nonexistent_file +description: wc exits 1 and prints error for nonexistent file. 
+input: + allowed_paths: ["$DIR"] + script: |+ + wc bogusfile +expect: + stdout: "" + stderr_contains: ["wc: bogusfile:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/wc/errors/unknown_flag.yaml b/tests/scenarios/cmd/wc/errors/unknown_flag.yaml new file mode 100644 index 00000000..f14f0ba4 --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/unknown_flag.yaml @@ -0,0 +1,10 @@ +# Derived from uutils test_invalid_arg +description: wc rejects unknown flags with exit code 1. +input: + allowed_paths: ["$DIR"] + script: |+ + wc --definitely-invalid +expect: + stdout: "" + stderr_contains: ["wc:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml b/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml new file mode 100644 index 00000000..50b91f43 --- /dev/null +++ b/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml @@ -0,0 +1,14 @@ +# Derived from standard POSIX -- convention +description: wc accepts -- to end flag parsing. +setup: + files: + - path: file.txt + content: "hello\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -- file.txt +expect: + stdout: "1 1 6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/empty_stdin.yaml b/tests/scenarios/cmd/wc/lines/empty_stdin.yaml new file mode 100644 index 00000000..33775902 --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a1 +description: wc -l on empty file outputs 0. 
+setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l empty.txt +expect: + stdout: "0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml b/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml new file mode 100644 index 00000000..ecb45502 --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a7 +description: wc -l counts newline bytes; text with no newline counts as 0 lines. +setup: + files: + - path: file.txt + content: "x y" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l file.txt +expect: + stdout: "0 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/one_newline.yaml b/tests/scenarios/cmd/wc/lines/one_newline.yaml new file mode 100644 index 00000000..817ea07f --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/one_newline.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a8 +description: wc -l counts 1 for a single newline-terminated line. +setup: + files: + - path: file.txt + content: "x y\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l file.txt +expect: + stdout: "1 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/two_newlines.yaml b/tests/scenarios/cmd/wc/lines/two_newlines.yaml new file mode 100644 index 00000000..e9109634 --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/two_newlines.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a9 +description: wc -l counts 2 for two newline-terminated lines. 
+setup: + files: + - path: file.txt + content: "x\ny\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l file.txt +expect: + stdout: "2 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/basic.yaml b/tests/scenarios/cmd/wc/max_line_length/basic.yaml new file mode 100644 index 00000000..e7461fde --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/basic.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test c0 +description: wc -L reports the length of the longest line. +setup: + files: + - path: file.txt + content: "1\n12\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "2 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml b/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml new file mode 100644 index 00000000..c417d540 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test c2 +description: wc -L counts a final line with no trailing newline. +setup: + files: + - path: file.txt + content: "\n123456" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml b/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml new file mode 100644 index 00000000..d70b6a20 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test c1 +description: wc -L picks the max among multiple lines. 
+setup: + files: + - path: file.txt + content: "1\n123\n1\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "3 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/multiple_files/total_line.yaml b/tests/scenarios/cmd/wc/multiple_files/total_line.yaml new file mode 100644 index 00000000..b374e75f --- /dev/null +++ b/tests/scenarios/cmd/wc/multiple_files/total_line.yaml @@ -0,0 +1,16 @@ +# Derived from GNU coreutils wc-total.sh +description: wc prints a total line when given multiple files. +setup: + files: + - path: a.txt + content: "hello\n" + - path: b.txt + content: "world foo\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc a.txt b.txt +expect: + stdout: " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml b/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml new file mode 100644 index 00000000..1804c0ad --- /dev/null +++ b/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml @@ -0,0 +1,14 @@ +# Derived from uutils test_stdin_explicit +description: wc with explicit - reads stdin and shows filename -. +setup: + files: + - path: file.txt + content: "a b\nc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + cat file.txt | wc - +expect: + stdout: " 2 3 6 -\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/implicit.yaml b/tests/scenarios/cmd/wc/stdin/implicit.yaml new file mode 100644 index 00000000..ed40861c --- /dev/null +++ b/tests/scenarios/cmd/wc/stdin/implicit.yaml @@ -0,0 +1,14 @@ +# Derived from standard POSIX wc behavior +description: wc reads stdin implicitly when no files are given. 
+setup: + files: + - path: file.txt + content: "a b\nc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + cat file.txt | wc +expect: + stdout: " 2 3 6\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/across_lines.yaml b/tests/scenarios/cmd/wc/words/across_lines.yaml new file mode 100644 index 00000000..b57e7ff8 --- /dev/null +++ b/tests/scenarios/cmd/wc/words/across_lines.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a6 +description: wc -w counts words across lines including a line without trailing newline. +setup: + files: + - path: file.txt + content: "x y\nz" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "3 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/empty_stdin.yaml b/tests/scenarios/cmd/wc/words/empty_stdin.yaml new file mode 100644 index 00000000..2f77334b --- /dev/null +++ b/tests/scenarios/cmd/wc/words/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a2 +description: wc -w on empty file outputs 0. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w empty.txt +expect: + stdout: "0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/single_word.yaml b/tests/scenarios/cmd/wc/words/single_word.yaml new file mode 100644 index 00000000..c5fb92da --- /dev/null +++ b/tests/scenarios/cmd/wc/words/single_word.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a4 +description: wc -w counts 1 word for a single non-whitespace token. 
+setup: + files: + - path: file.txt + content: "x" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "1 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/two_words.yaml b/tests/scenarios/cmd/wc/words/two_words.yaml new file mode 100644 index 00000000..9981335d --- /dev/null +++ b/tests/scenarios/cmd/wc/words/two_words.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a5 +description: wc -w counts 2 words on a line with two tokens. +setup: + files: + - path: file.txt + content: "x y\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "2 file.txt\n" + stderr: "" + exit_code: 0 From 6faaa34bef292e4b66f5776cbb48a3955971094b Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 12:33:24 +0100 Subject: [PATCH 2/8] Fix wc -L display width and chunk boundary double-counting - Use rune-level iteration over trimmed chunk (not buf[:n]) to prevent double-processing of UTF-8 tail bytes carried across 32 KiB boundaries - Use go-runewidth for -L flag so full-width characters (CJK, emoji) correctly count as 2 display columns instead of 1 Co-Authored-By: Claude Opus 4.6 --- go.mod | 2 ++ go.sum | 4 ++++ interp/builtins/wc/wc.go | 17 +++++++++-------- interp/builtins/wc/wc_test.go | 18 ++++++++++++++++++ .../cmd/wc/max_line_length/fullwidth_cjk.yaml | 14 ++++++++++++++ .../wc/max_line_length/fullwidth_emoji.yaml | 14 ++++++++++++++ 6 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml diff --git a/go.mod b/go.mod index 244a6f3e..4389c4d8 100644 --- a/go.mod +++ b/go.mod @@ -9,8 +9,10 @@ require ( ) require ( + github.com/clipperhouse/uax29/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/mattn/go-runewidth v0.0.21 // 
indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spf13/cobra v1.10.2 // indirect github.com/spf13/pflag v1.0.9 // indirect diff --git a/go.sum b/go.sum index e4ce3020..a4e7b163 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY= +github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -11,6 +13,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mattn/go-runewidth v0.0.21 h1:jJKAZiQH+2mIinzCJIaIG9Be1+0NR+5sz/lYEEjdM8w= +github.com/mattn/go-runewidth v0.0.21/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 1d61d641..66f7157b 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -62,6 +62,7 @@ import ( "strconv" "unicode/utf8" + "github.com/mattn/go-runewidth" "github.com/spf13/pflag" "github.com/DataDog/rshell/interp/builtins" @@ -260,21 +261,23 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { c.chars += int64(utf8.RuneCount(chunk)) c.bytes -= int64(carryN) - for _, b := range buf[:n] { - if b == '\n' { + for i 
:= 0; i < len(chunk); { + r, size := utf8.DecodeRune(chunk[i:]) + i += size + if r == '\n' { c.lines++ if lineLen > c.maxLineLen { c.maxLineLen = lineLen } lineLen = 0 inWord = false - } else if b == '\r' { + } else if r == '\r' { lineLen = 0 inWord = false - } else if b == '\t' { + } else if r == '\t' { lineLen = (lineLen/8 + 1) * 8 inWord = false - } else if b == ' ' || b == '\v' || b == '\f' { + } else if r == ' ' || r == '\v' || r == '\f' { lineLen++ inWord = false } else { @@ -282,9 +285,7 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { c.words++ inWord = true } - if b&0xC0 != 0x80 { - lineLen++ - } + lineLen += int64(runewidth.RuneWidth(r)) } } } diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go index 50a8fd9d..4707b0dd 100644 --- a/interp/builtins/wc/wc_test.go +++ b/interp/builtins/wc/wc_test.go @@ -434,3 +434,21 @@ func TestWcCharsMultibyteEmoji(t *testing.T) { assert.Equal(t, 0, code) assert.Equal(t, "5 file.txt\n", stdout) } + +// TestWcChunkBoundaryMultibyte verifies that a multibyte character straddling +// the 32 KiB read-buffer boundary is not double-counted. This requires +// programmatic file generation so it lives as a Go test rather than a scenario. +func TestWcChunkBoundaryMultibyte(t *testing.T) { + dir := t.TempDir() + // 💐 is 4 bytes; placing it at offset 32766 means it spans bytes 32766-32769, + // straddling the 32768-byte chunk boundary and exercising the carry logic. 
+ prefix := strings.Repeat("a", 32*1024-2) + content := prefix + "💐\n" + writeFile(t, dir, "file.txt", content) + stdout, _, code := cmdRun(t, "wc -mL file.txt", dir) + assert.Equal(t, 0, code) + // chars: 32766 'a' + 1 emoji + 1 newline = 32768 + // max line length: 32766 + 2 (emoji display width) = 32768 + assert.Equal(t, "32768 32768 file.txt\n", stdout) +} + diff --git a/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml b/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml new file mode 100644 index 00000000..b8b50009 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml @@ -0,0 +1,14 @@ +description: wc -L counts display columns, CJK characters are width 2. +skip_assert_against_bash: true # display width depends on locale; we always use Unicode width +setup: + files: + - path: file.txt + content: "你好\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "4 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml b/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml new file mode 100644 index 00000000..88329252 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml @@ -0,0 +1,14 @@ +description: wc -L counts display columns, emoji characters are width 2. 
+skip_assert_against_bash: true # display width depends on locale; we always use Unicode width +setup: + files: + - path: file.txt + content: "ab💐\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "4 file.txt\n" + stderr: "" + exit_code: 0 From ab67e37fd9ea36adc5ef72ff273d7b844bd8576a Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 12:45:52 +0100 Subject: [PATCH 3/8] Replace go-runewidth dependency with inline wcwidth implementation Co-Authored-By: Claude Opus 4.6 --- go.mod | 6 ++-- go.sum | 4 --- interp/builtins/wc/wc.go | 50 ++++++++++++++++++++++++++++++++-- tests/import_allowlist_test.go | 18 ++++++++++++ 4 files changed, 68 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 4389c4d8..fb149a7b 100644 --- a/go.mod +++ b/go.mod @@ -3,17 +3,15 @@ module github.com/DataDog/rshell go 1.25.6 require ( + github.com/spf13/cobra v1.10.2 + github.com/spf13/pflag v1.0.9 github.com/stretchr/testify v1.11.1 gopkg.in/yaml.v3 v3.0.1 mvdan.cc/sh/v3 v3.12.0 ) require ( - github.com/clipperhouse/uax29/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/mattn/go-runewidth v0.0.21 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/spf13/cobra v1.10.2 // indirect - github.com/spf13/pflag v1.0.9 // indirect ) diff --git a/go.sum b/go.sum index a4e7b163..e4ce3020 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY= -github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -13,8 +11,6 @@ github.com/kr/pretty v0.3.1 
h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/mattn/go-runewidth v0.0.21 h1:jJKAZiQH+2mIinzCJIaIG9Be1+0NR+5sz/lYEEjdM8w= -github.com/mattn/go-runewidth v0.0.21/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 66f7157b..7b7a755a 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -60,9 +60,9 @@ import ( "io" "os" "strconv" + "unicode" "unicode/utf8" - "github.com/mattn/go-runewidth" "github.com/spf13/pflag" "github.com/DataDog/rshell/interp/builtins" @@ -285,7 +285,7 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { c.words++ inWord = true } - lineLen += int64(runewidth.RuneWidth(r)) + lineLen += int64(runeWidth(r)) } } } @@ -328,6 +328,52 @@ func fieldWidth(total counts, opts options) int { return w } +// runeWidth returns the number of display columns a rune contributes, modelled +// on wcwidth(3): 0 for controls (which wcwidth reports as non-printable), +// combining marks, and format chars; 2 for East Asian Wide/Fullwidth; 1 otherwise. +func runeWidth(r rune) int { + if unicode.Is(unicode.Cc, r) { + return 0 + } + if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Me, r) || unicode.Is(unicode.Cf, r) { + return 0 + } + // Hangul Jamo medial vowels and final consonants (zero-width in syllable composition). 
+ if r >= 0x1160 && r <= 0x11FF { + return 0 + } + if unicode.Is(eastAsianWide, r) { + return 2 + } + return 1 +} + +// eastAsianWide is a RangeTable covering East Asian Wide and Fullwidth +// codepoints per UAX #11, matching the ranges used by wcwidth(3). +var eastAsianWide = &unicode.RangeTable{ + R16: []unicode.Range16{ + {0x1100, 0x115F, 1}, // Hangul Jamo initials + {0x2329, 0x232A, 1}, // CJK angle brackets + {0x2E80, 0x303E, 1}, // CJK Radicals Supplement .. CJK Symbols + {0x3040, 0x33BF, 1}, // Hiragana .. CJK Compatibility + {0x33C0, 0x33FF, 1}, // CJK Compatibility (cont.) + {0x3400, 0x4DBF, 1}, // CJK Unified Ideographs Extension A + {0x4E00, 0xA4CF, 1}, // CJK Unified Ideographs .. Yi + {0xAC00, 0xD7A3, 1}, // Hangul Syllables + {0xF900, 0xFAFF, 1}, // CJK Compatibility Ideographs + {0xFE10, 0xFE19, 1}, // Vertical Forms + {0xFE30, 0xFE6F, 1}, // CJK Compatibility Forms + Small Form Variants + {0xFF01, 0xFF60, 1}, // Fullwidth Forms + {0xFFE0, 0xFFE6, 1}, // Fullwidth Signs + }, + R32: []unicode.Range32{ + {0x1F300, 0x1F64F, 1}, // Misc Symbols/Pictographs + Emoticons + {0x1F900, 0x1F9FF, 1}, // Supplemental Symbols and Pictographs + {0x20000, 0x2FFFD, 1}, // CJK Extension B..F + {0x30000, 0x3FFFD, 1}, // CJK Extension G+ + }, +} + func printCounts(callCtx *builtins.CallContext, c counts, opts options, width int, name string) { first := true printField := func(val int64) { diff --git a/tests/import_allowlist_test.go b/tests/import_allowlist_test.go index 73a4cd86..52d764eb 100644 --- a/tests/import_allowlist_test.go +++ b/tests/import_allowlist_test.go @@ -62,6 +62,24 @@ var builtinAllowedSymbols = []string{ "strconv.ParseInt", // strconv.FormatInt — int-to-string conversion; pure function, no I/O. "strconv.FormatInt", + // unicode.Cc — control character category range table; pure data, no I/O. + "unicode.Cc", + // unicode.Cf — format character category range table; pure data, no I/O. 
+ "unicode.Cf", + // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. + "unicode.Is", + // unicode.Me — enclosing mark category range table; pure data, no I/O. + "unicode.Me", + // unicode.Mn — nonspacing mark category range table; pure data, no I/O. + "unicode.Mn", + // unicode.Range16 — struct type for 16-bit Unicode ranges; pure data. + "unicode.Range16", + // unicode.Range32 — struct type for 32-bit Unicode ranges; pure data. + "unicode.Range32", + // unicode.RangeTable — struct type for Unicode range tables; pure data. + "unicode.RangeTable", + // unicode/utf8.DecodeRune — decodes first UTF-8 rune from a byte slice; pure function, no I/O. + "unicode/utf8.DecodeRune", // unicode/utf8.RuneCount — counts UTF-8 runes in a byte slice; pure function, no I/O. "unicode/utf8.RuneCount", // unicode/utf8.UTFMax — maximum number of bytes in a UTF-8 encoding; constant, no I/O. From f99045b77c56be7dba03069f18ea21643a82f8cb Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 12:46:35 +0100 Subject: [PATCH 4/8] revert changes to go.mod --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index fb149a7b..244a6f3e 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,6 @@ module github.com/DataDog/rshell go 1.25.6 require ( - github.com/spf13/cobra v1.10.2 - github.com/spf13/pflag v1.0.9 github.com/stretchr/testify v1.11.1 gopkg.in/yaml.v3 v3.0.1 mvdan.cc/sh/v3 v3.12.0 @@ -14,4 +12,6 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/spf13/cobra v1.10.2 // indirect + github.com/spf13/pflag v1.0.9 // indirect ) From ff6c511dfdeb555226d99d8800df66f7792e7c9d Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 12:51:15 +0100 Subject: [PATCH 5/8] Use strings.ReplaceAll instead of strings.Replace with -1 Co-Authored-By: Claude Opus 4.6 --- 
interp/builtin_wc_pentest_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interp/builtin_wc_pentest_test.go b/interp/builtin_wc_pentest_test.go index 98cbee6e..2733cd8d 100644 --- a/interp/builtin_wc_pentest_test.go +++ b/interp/builtin_wc_pentest_test.go @@ -160,7 +160,7 @@ func TestWcPentestManyFiles(t *testing.T) { dir := t.TempDir() var args []string for i := 0; i < 50; i++ { - name := filepath.Join(dir, strings.Replace(filepath.Base(t.Name()), "/", "_", -1)+"_"+string(rune('a'+i%26))+string(rune('0'+i/26))+".txt") + name := filepath.Join(dir, strings.ReplaceAll(filepath.Base(t.Name()), "/", "_")+"_"+string(rune('a'+i%26))+string(rune('0'+i/26))+".txt") require.NoError(t, os.WriteFile(name, []byte("x\n"), 0644)) args = append(args, filepath.Base(name)) } From a67a02964330ce8409f0ccfa9928009b3876589a Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 13:54:42 +0100 Subject: [PATCH 6/8] Fix UTF-8 carry buffer overflow when no valid prefix found When all 1-3 byte trims fail to produce a valid UTF-8 prefix, tail exits the loop at 4 but carry is only 3 bytes. Add tail <= 3 guard so we only carry when bytes fit; otherwise process as-is with DecodeRune replacing invalid bytes. 
Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 7b7a755a..aed75349 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -251,7 +251,7 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { break } } - if tail > 0 && tail < n { + if tail > 0 && tail <= 3 && tail < n { carryN = copy(carry[:], chunk[n-tail:]) chunk = chunk[:n-tail] } else { From f383b1da795f4b7437dbec8bd54c94e2b8d6a2a0 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 15:50:55 +0100 Subject: [PATCH 7/8] Address review: add UTF-8 carry buffer edge case comment and test - Add comment explaining why single-byte invalid UTF-8 chunks are safely handled in-place by DecodeRune (not carried) - Add test case with invalid UTF-8 bytes (0xC0 0x80) at the exact 32 KiB chunk boundary to verify no panic or incorrect counts Co-Authored-By: Claude Opus 4.6 --- interp/builtin_wc_pentest_test.go | 31 +++++++++++++++++++++++++++++++ interp/builtins/wc/wc.go | 4 ++++ 2 files changed, 35 insertions(+) diff --git a/interp/builtin_wc_pentest_test.go b/interp/builtin_wc_pentest_test.go index 2733cd8d..a5c88636 100644 --- a/interp/builtin_wc_pentest_test.go +++ b/interp/builtin_wc_pentest_test.go @@ -195,6 +195,37 @@ func TestWcPentestLongLine(t *testing.T) { assert.Contains(t, stdout, "1048576") } +// --- Invalid UTF-8 at chunk boundary --- + +func TestWcPentestInvalidUTF8AtChunkBoundary(t *testing.T) { + dir := t.TempDir() + // Build content so that invalid UTF-8 bytes (0xC0 0x80) land at the + // exact 32 KiB read boundary. This exercises the carry buffer edge + // case where invalid bytes must be handled in-place (not carried). 
+ const chunkSize = 32 * 1024 + padding := strings.Repeat("A", chunkSize-1) // 32767 bytes: fills offsets 0..32766 + // Place 0xC0 at offset 32767 (last byte of first chunk) and 0x80 at + // offset 32768 (first byte of second chunk). + content := []byte(padding) + content = append(content, 0xC0, 0x80) + content = append(content, '\n') + + require.NoError(t, os.WriteFile(filepath.Join(dir, "invalid_utf8.txt"), content, 0644)) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // -c should report exact byte count + stdout, _, code := wcRunCtx(ctx, t, "wc -c invalid_utf8.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "32770") // chunkSize - 1 + 2 invalid bytes + newline + + // -l should count the newline + stdout, _, code = wcRunCtx(ctx, t, "wc -l invalid_utf8.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "1") +} + // --- Flag expansion in loop --- func TestWcPentestFlagExpansion(t *testing.T) { diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index aed75349..7aecb2a4 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -244,6 +244,10 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { c.bytes += int64(n) // Handle incomplete UTF-8 at end of chunk. + // When tail >= n (e.g., n == 1 with a single invalid byte), the + // condition below is false, so the byte stays in chunk and + // DecodeRune processes it as a replacement character — this is + // correct and matches utf8.DecodeRune semantics. 
tail := 0 if !utf8.Valid(chunk) { for tail = 1; tail <= 3 && tail < n; tail++ { From a6d12fd3ce7636fcc1dc2d06211bdc86a46659ec Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Tue, 10 Mar 2026 17:18:38 +0100 Subject: [PATCH 8/8] Address review: add defensive comments for --files0-from and carry byte-count invariant - S1: Add security comment near flag definitions explaining why --files0-from is intentionally not implemented (GTFOBins data exfiltration risk) - S2: Add clarifying comment for the non-obvious c.bytes -= carryN invariant Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 7aecb2a4..d5d3680f 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -101,6 +101,10 @@ func run(ctx context.Context, callCtx *builtins.CallContext, args []string) buil chars := fs.BoolP("chars", "m", false, "print the character counts") maxLineLen := fs.BoolP("max-line-length", "L", false, "print the maximum display width") + // Security: --files0-from is intentionally NOT implemented. + // GTFOBins: this flag reads filenames from a file, enabling + // data exfiltration in sandboxed environments. + if err := fs.Parse(args); err != nil { callCtx.Errf("wc: %v\n", err) return builtins.Result{Code: 1} @@ -263,7 +267,9 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { } } c.chars += int64(utf8.RuneCount(chunk)) - c.bytes -= int64(carryN) + // carryN bytes are subtracted here and will be re-added via + // n += carryN at the top of the next iteration. + c.bytes -= int64(carryN) for i := 0; i < len(chunk); { r, size := utf8.DecodeRune(chunk[i:])