From d35e98403a9010419bfcfe21898a22ad509e1014 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 20:15:57 +0100 Subject: [PATCH 01/28] remove wc --- README.md | 2 +- SHELL_FEATURES.md | 1 - interp/builtin_wc_pentest_test.go | 237 --------- interp/builtins/wc/builtin_wc_pentest_test.go | 64 --- interp/builtins/wc/wc.go | 407 ---------------- interp/builtins/wc/wc_gnu_compat_test.go | 172 ------- interp/builtins/wc/wc_test.go | 453 ------------------ interp/builtins/wc/wc_unix_test.go | 35 -- interp/register_builtins.go | 2 - tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml | 22 - tests/scenarios/cmd/wc/bytes/empty_stdin.yaml | 14 - tests/scenarios/cmd/wc/bytes/single_byte.yaml | 14 - tests/scenarios/cmd/wc/chars/basic.yaml | 14 - tests/scenarios/cmd/wc/default/basic.yaml | 14 - .../scenarios/cmd/wc/default/empty_file.yaml | 14 - .../scenarios/cmd/wc/default/empty_stdin.yaml | 14 - .../scenarios/cmd/wc/default/single_file.yaml | 14 - .../cmd/wc/errors/files0_from_rejected.yaml | 10 - .../scenarios/cmd/wc/errors/missing_file.yaml | 10 - .../scenarios/cmd/wc/errors/unknown_flag.yaml | 10 - .../wc/hardening/double_dash_separator.yaml | 14 - tests/scenarios/cmd/wc/lines/empty_stdin.yaml | 14 - .../cmd/wc/lines/no_trailing_newline.yaml | 14 - tests/scenarios/cmd/wc/lines/one_newline.yaml | 14 - .../scenarios/cmd/wc/lines/two_newlines.yaml | 14 - .../cmd/wc/max_line_length/basic.yaml | 14 - .../cmd/wc/max_line_length/fullwidth_cjk.yaml | 14 - .../wc/max_line_length/fullwidth_emoji.yaml | 14 - .../max_line_length/no_trailing_newline.yaml | 14 - .../cmd/wc/max_line_length/three_lines.yaml | 14 - .../cmd/wc/multiple_files/total_line.yaml | 16 - .../scenarios/cmd/wc/stdin/dash_explicit.yaml | 14 - tests/scenarios/cmd/wc/stdin/implicit.yaml | 14 - .../scenarios/cmd/wc/words/across_lines.yaml | 14 - tests/scenarios/cmd/wc/words/empty_stdin.yaml | 14 - tests/scenarios/cmd/wc/words/single_word.yaml | 14 - tests/scenarios/cmd/wc/words/two_words.yaml | 14 - 37 files changed, 1 insertion(+), 1762 deletions(-) delete mode 100644 interp/builtin_wc_pentest_test.go delete mode 100644 interp/builtins/wc/builtin_wc_pentest_test.go delete mode 100644 interp/builtins/wc/wc.go delete mode 100644 interp/builtins/wc/wc_gnu_compat_test.go delete mode 100644 interp/builtins/wc/wc_test.go delete mode 100644 interp/builtins/wc/wc_unix_test.go delete mode 100644 tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml delete mode 100644 tests/scenarios/cmd/wc/bytes/empty_stdin.yaml delete mode 100644 tests/scenarios/cmd/wc/bytes/single_byte.yaml delete mode 100644 tests/scenarios/cmd/wc/chars/basic.yaml delete mode 100644 tests/scenarios/cmd/wc/default/basic.yaml delete mode 100644 tests/scenarios/cmd/wc/default/empty_file.yaml delete mode 100644 tests/scenarios/cmd/wc/default/empty_stdin.yaml delete mode 100644 tests/scenarios/cmd/wc/default/single_file.yaml delete mode 100644 tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml delete mode 100644 tests/scenarios/cmd/wc/errors/missing_file.yaml delete mode 100644 tests/scenarios/cmd/wc/errors/unknown_flag.yaml delete mode 100644 tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml delete mode 100644 tests/scenarios/cmd/wc/lines/empty_stdin.yaml delete mode 100644 tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml delete mode 100644 tests/scenarios/cmd/wc/lines/one_newline.yaml delete mode 100644 tests/scenarios/cmd/wc/lines/two_newlines.yaml delete mode 100644 tests/scenarios/cmd/wc/max_line_length/basic.yaml delete mode 100644 tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml delete mode 100644 tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml delete mode 100644 tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml delete mode 100644 tests/scenarios/cmd/wc/max_line_length/three_lines.yaml delete mode 100644 tests/scenarios/cmd/wc/multiple_files/total_line.yaml delete mode 100644 tests/scenarios/cmd/wc/stdin/dash_explicit.yaml delete mode 100644 tests/scenarios/cmd/wc/stdin/implicit.yaml delete mode 100644 tests/scenarios/cmd/wc/words/across_lines.yaml delete mode 100644 tests/scenarios/cmd/wc/words/empty_stdin.yaml delete mode 100644 tests/scenarios/cmd/wc/words/single_word.yaml delete mode 100644 tests/scenarios/cmd/wc/words/two_words.yaml diff --git a/README.md b/README.md index cc2df95a..0b9ede76 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ Linux, macOS, and Windows. ``` tests/scenarios/ -├── cmd/ # builtin command tests (echo, cat, grep, head, tail, test, uniq, wc, ...) +├── cmd/ # builtin command tests (echo, cat, grep, head, tail, test, uniq, ...) └── shell/ # shell feature tests (pipes, variables, control flow, ...) ``` diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index 9a42affe..0a9f8200 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -23,7 +23,6 @@ Blocked features are rejected before execution with exit code 2. - ✅ `tr [-cdsCt] SET1 [SET2]` — translate, squeeze, and/or delete characters from stdin - ✅ `true` — return exit code 0 - ✅ `uniq [OPTION]... [INPUT]` — report or omit repeated lines -- ✅ `wc [-l] [-w] [-c] [-m] [FILE]...` — count lines, words, bytes, or characters in files - ❌ All other commands — return exit code 127 with `: not found` unless an ExecHandler is configured ## Variables diff --git a/interp/builtin_wc_pentest_test.go b/interp/builtin_wc_pentest_test.go deleted file mode 100644 index a5c88636..00000000 --- a/interp/builtin_wc_pentest_test.go +++ /dev/null @@ -1,237 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026-present Datadog, Inc. - -package interp_test - -import ( - "bytes" - "context" - "errors" - "os" - "path/filepath" - "strings" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "mvdan.cc/sh/v3/syntax" - - "github.com/DataDog/rshell/interp" -) - -func wcRun(t *testing.T, script, dir string) (string, string, int) { - t.Helper() - return wcRunCtx(context.Background(), t, script, dir) -} - -func wcRunCtx(ctx context.Context, t *testing.T, script, dir string) (string, string, int) { - t.Helper() - parser := syntax.NewParser() - prog, err := parser.Parse(strings.NewReader(script), "") - require.NoError(t, err) - - var outBuf, errBuf bytes.Buffer - opts := []interp.RunnerOption{ - interp.StdIO(nil, &outBuf, &errBuf), - interp.AllowedPaths([]string{dir}), - } - - runner, err := interp.New(opts...) - require.NoError(t, err) - defer runner.Close() - - if dir != "" { - runner.Dir = dir - } - - err = runner.Run(ctx, prog) - exitCode := 0 - if err != nil { - var es interp.ExitStatus - if errors.As(err, &es) { - exitCode = int(es) - } else if ctx.Err() == nil { - t.Fatalf("unexpected error: %v", err) - } - } - return outBuf.String(), errBuf.String(), exitCode -} - -func wcWriteFile(t *testing.T, dir, name, content string) { - t.Helper() - require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) -} - -// --- Flag and argument injection --- - -func TestWcPentestUnknownFlags(t *testing.T) { - dir := t.TempDir() - for _, flag := range []string{"-f", "--follow", "--no-such-flag", "--files0-from=foo"} { - _, stderr, code := wcRun(t, "wc "+flag, dir) - assert.Equal(t, 1, code, "flag: %s", flag) - assert.Contains(t, stderr, "wc:", "flag: %s", flag) - } -} - -func TestWcPentestDoubleDashFlagLikeFile(t *testing.T) { - dir := t.TempDir() - wcWriteFile(t, dir, "-v", "hello\n") - stdout, _, code := wcRun(t, "wc -- -v", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "-v") -} - -func TestWcPentestMultipleStdin(t *testing.T) { - dir := t.TempDir() - wcWriteFile(t, dir, "file.txt", "hello\n") - stdout, _, code := wcRun(t, "cat file.txt | wc - -", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "total") -} - -// --- Path edge cases --- - -func TestWcPentestNonexistentFile(t *testing.T) { - dir := t.TempDir() - stdout, stderr, code := wcRun(t, "wc nonexistent.txt", dir) - assert.Equal(t, 1, code) - assert.Equal(t, "", stdout) - assert.Contains(t, stderr, "wc:") -} - -func TestWcPentestEmptyFilename(t *testing.T) { - dir := t.TempDir() - stdout, stderr, code := wcRun(t, "wc ''", dir) - assert.Equal(t, 1, code) - assert.Equal(t, "", stdout) - assert.Contains(t, stderr, "wc:") -} - -// --- Special files --- - -func TestWcPentestDevNull(t *testing.T) { - dir := t.TempDir() - wcWriteFile(t, dir, "empty.txt", "") - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - stdout, _, code := wcRunCtx(ctx, t, "wc empty.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "0") -} - -// --- Context cancellation --- - -func TestWcPentestContextCancelled(t *testing.T) { - dir := t.TempDir() - ctx, cancel := context.WithCancel(context.Background()) - cancel() - _, _, _ = wcRunCtx(ctx, t, "wc", dir) -} - -func TestWcPentestContextTimeout(t *testing.T) { - dir := t.TempDir() - wcWriteFile(t, dir, "file.txt", strings.Repeat("hello\n", 10000)) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - stdout, _, code := wcRunCtx(ctx, t, "wc file.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "10000") -} - -// --- Large input --- - -func TestWcPentestLargeFile(t *testing.T) { - dir := t.TempDir() - content := strings.Repeat("word word word word word\n", 40000) - wcWriteFile(t, dir, "large.txt", content) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - stdout, _, code := wcRunCtx(ctx, t, "wc -l large.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "40000") -} - -// --- Many files (FD leak check) --- - -func TestWcPentestManyFiles(t *testing.T) { - dir := t.TempDir() - var args []string - for i := 0; i < 50; i++ { - name := filepath.Join(dir, strings.ReplaceAll(filepath.Base(t.Name()), "/", "_")+"_"+string(rune('a'+i%26))+string(rune('0'+i/26))+".txt") - require.NoError(t, os.WriteFile(name, []byte("x\n"), 0644)) - args = append(args, filepath.Base(name)) - } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - stdout, _, code := wcRunCtx(ctx, t, "wc "+strings.Join(args, " "), dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "total") -} - -// --- Edge case: file with only newlines --- - -func TestWcPentestOnlyNewlines(t *testing.T) { - dir := t.TempDir() - wcWriteFile(t, dir, "file.txt", strings.Repeat("\n", 100)) - stdout, _, code := wcRun(t, "wc file.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "100") - assert.Contains(t, stdout, " 0") -} - -// --- Edge case: long line --- - -func TestWcPentestLongLine(t *testing.T) { - dir := t.TempDir() - longLine := strings.Repeat("x", 1024*1024) + "\n" - wcWriteFile(t, dir, "file.txt", longLine) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - stdout, _, code := wcRunCtx(ctx, t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "1048576") -} - -// --- Invalid UTF-8 at chunk boundary --- - -func TestWcPentestInvalidUTF8AtChunkBoundary(t *testing.T) { - dir := t.TempDir() - // Build content so that invalid UTF-8 bytes (0xC0 0x80) land at the - // exact 32 KiB read boundary. This exercises the carry buffer edge - // case where invalid bytes must be handled in-place (not carried). - const chunkSize = 32 * 1024 - padding := strings.Repeat("A", chunkSize-1) // fills up to byte 32767 - // Place 0xC0 at offset 32767 (last byte of first chunk) and 0x80 at - // offset 32768 (first byte of second chunk). - content := []byte(padding) - content = append(content, 0xC0, 0x80) - content = append(content, '\n') - - require.NoError(t, os.WriteFile(filepath.Join(dir, "invalid_utf8.txt"), content, 0644)) - - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - - // -c should report exact byte count - stdout, _, code := wcRunCtx(ctx, t, "wc -c invalid_utf8.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "32770") // chunkSize - 1 + 2 invalid bytes + newline - - // -l should count the newline - stdout, _, code = wcRunCtx(ctx, t, "wc -l invalid_utf8.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "1") -} - -// --- Flag expansion in loop --- - -func TestWcPentestFlagExpansion(t *testing.T) { - dir := t.TempDir() - wcWriteFile(t, dir, "file.txt", "hello\n") - _, stderr, code := wcRun(t, "for flag in --follow; do wc $flag file.txt; done", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} diff --git a/interp/builtins/wc/builtin_wc_pentest_test.go b/interp/builtins/wc/builtin_wc_pentest_test.go deleted file mode 100644 index f7a0e53e..00000000 --- a/interp/builtins/wc/builtin_wc_pentest_test.go +++ /dev/null @@ -1,64 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026-present Datadog, Inc. - -// Exploratory pentest for the wc builtin — GTFOBins validation. -// -// GTFOBins documents wc as capable of reading file contents via -// "wc --files0-from /path/to/file". This flag is intentionally not -// implemented in rshell, so pflag rejects it as an unknown flag. -// See: https://gtfobins.org/gtfobins/wc/ - -package wc_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/DataDog/rshell/interp" -) - -// TestWcGTFOBinsFiles0FromRejected verifies that the GTFOBins file-read -// technique "wc --files0-from" is blocked because the flag is not implemented. -// -// GTFOBins: https://gtfobins.org/gtfobins/wc/ -// Technique: wc --files0-from /path/to/input-file -func TestWcGTFOBinsFiles0FromRejected(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "target.txt", "secret data\n") - _, stderr, code := cmdRun(t, "wc --files0-from target.txt", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} - -// TestWcGTFOBinsFileReadSandboxEscape verifies that wc cannot read files -// outside the AllowedPaths sandbox. -// -// GTFOBins: https://gtfobins.org/gtfobins/wc/ -// Technique: wc /path/to/input-file (reads file to count lines/words/bytes) -func TestWcGTFOBinsFileReadSandboxEscape(t *testing.T) { - allowed := t.TempDir() - secret := t.TempDir() - require.NoError(t, os.WriteFile(filepath.Join(secret, "secret.txt"), []byte("secret data"), 0644)) - secretPath := filepath.ToSlash(filepath.Join(secret, "secret.txt")) - _, stderr, code := runScript(t, "wc "+secretPath, allowed, interp.AllowedPaths([]string{allowed})) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} - -// TestWcGTFOBinsFileReadTraversal verifies that path traversal cannot -// escape the sandbox when using wc. -// -// GTFOBins: https://gtfobins.org/gtfobins/wc/ -// Technique: wc ../../etc/passwd -func TestWcGTFOBinsFileReadTraversal(t *testing.T) { - dir := t.TempDir() - _, stderr, code := cmdRun(t, "wc ../../etc/passwd", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go deleted file mode 100644 index 7fbf50ee..00000000 --- a/interp/builtins/wc/wc.go +++ /dev/null @@ -1,407 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026-present Datadog, Inc. - -// Package wc implements the wc builtin command. -// -// wc — print newline, word, and byte counts for each file -// -// Usage: wc [OPTION]... [FILE]... -// -// Print newline, word, and byte counts for each FILE, and a total line -// if more than one FILE is specified. A word is a non-zero-length sequence -// of characters delimited by white space. With no FILE, or when FILE is -, -// read standard input. -// -// When no flags are given, -l, -w, and -c are assumed (lines, words, bytes). -// -// Accepted flags: -// -// -l, --lines -// Print the newline count. -// -// -w, --words -// Print the word count. -// -// -c, --bytes -// Print the byte count. -// -// -m, --chars -// Print the character count. In a multibyte locale, the number of -// characters may differ from the number of bytes. -// -// -L, --max-line-length -// Print the length of the longest line. -// -// -h, --help -// Print this usage message to stdout and exit 0. -// -// Output columns always appear in a fixed order: lines, words, chars, -// bytes, max-line-length. Only the requested columns are shown. Column -// widths are right-justified to the width of the largest count across -// all files (including the total line, if any). -// -// Exit codes: -// -// 0 All files processed successfully. -// 1 At least one error occurred (missing file, invalid argument, etc.). -// -// Memory safety: -// -// Input is read in fixed-size chunks (32 KiB). Lines longer than -// MaxLineBytes (1 MiB) are split across chunks for counting purposes -// but never fully buffered. All loops check ctx.Err() at each -// iteration to honour the shell's execution timeout. -package wc - -import ( - "context" - "io" - "os" - "strconv" - "unicode" - "unicode/utf8" - - "github.com/DataDog/rshell/interp/builtins" -) - -// Cmd is the wc builtin command descriptor. -var Cmd = builtins.Command{Name: "wc", MakeFlags: registerFlags} - -const chunkSize = 32 * 1024 // 32 KiB read buffer -const stdinMinWidth = 7 // GNU wc minimum column width for stdin - -type counts struct { - lines int64 - words int64 - chars int64 - bytes int64 - maxLineLen int64 -} - -type options struct { - showLines bool - showWords bool - showBytes bool - showChars bool - showMaxLineLen bool -} - -func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { - help := fs.BoolP("help", "h", false, "print usage and exit") - lines := fs.BoolP("lines", "l", false, "print the newline counts") - words := fs.BoolP("words", "w", false, "print the word counts") - bytesFlag := fs.BoolP("bytes", "c", false, "print the byte counts") - chars := fs.BoolP("chars", "m", false, "print the character counts") - maxLineLen := fs.BoolP("max-line-length", "L", false, "print the maximum display width") - - // Security: --files0-from is intentionally NOT implemented. - // GTFOBins: this flag reads filenames from a file, enabling - // data exfiltration in sandboxed environments. - - return func(ctx context.Context, callCtx *builtins.CallContext, files []string) builtins.Result { - if *help { - callCtx.Out("Usage: wc [OPTION]... [FILE]...\n") - callCtx.Out("Print newline, word, and byte counts for each FILE.\n") - callCtx.Out("With no FILE, or when FILE is -, read standard input.\n\n") - fs.SetOutput(callCtx.Stdout) - fs.PrintDefaults() - return builtins.Result{} - } - - opts := options{ - showLines: *lines, - showWords: *words, - showBytes: *bytesFlag, - showChars: *chars, - showMaxLineLen: *maxLineLen, - } - - if !opts.showLines && !opts.showWords && !opts.showBytes && !opts.showChars && !opts.showMaxLineLen { - opts.showLines = true - opts.showWords = true - opts.showBytes = true - } - - stdinImplicit := len(files) == 0 - if stdinImplicit { - files = []string{"-"} - } - - hasStdin := stdinImplicit - if !hasStdin { - for _, f := range files { - if f == "-" { - hasStdin = true - break - } - } - } - - var total counts - var failed bool - - type fileResult struct { - name string - c counts - } - results := make([]fileResult, 0, len(files)) - - for _, file := range files { - if ctx.Err() != nil { - break - } - c, err := countFile(ctx, callCtx, file) - if err != nil { - name := file - if file == "-" { - name = "standard input" - } - callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err)) - failed = true - if c == (counts{}) { - continue - } - } - results = append(results, fileResult{name: file, c: c}) - total.lines += c.lines - total.words += c.words - total.chars += c.chars - total.bytes += c.bytes - if c.maxLineLen > total.maxLineLen { - total.maxLineLen = c.maxLineLen - } - } - - width := fieldWidth(total, opts) - if hasStdin && width < stdinMinWidth { - width = stdinMinWidth - } - - for _, fr := range results { - name := fr.name - if name == "-" && stdinImplicit { - name = "" - } - printCounts(callCtx, fr.c, opts, width, name) - } - - if len(files) > 1 { - printCounts(callCtx, total, opts, width, "total") - } - - if failed { - return builtins.Result{Code: 1} - } - return builtins.Result{} - } -} - -func countFile(ctx context.Context, callCtx *builtins.CallContext, path string) (counts, error) { - var rc io.ReadCloser - if path == "-" { - if callCtx.Stdin == nil { - return counts{}, nil - } - rc = io.NopCloser(callCtx.Stdin) - } else { - f, err := callCtx.OpenFile(ctx, path, os.O_RDONLY, 0) - if err != nil { - return counts{}, err - } - rc = f - } - defer rc.Close() - return countReader(ctx, rc) -} - -func countReader(ctx context.Context, r io.Reader) (counts, error) { - buf := make([]byte, chunkSize) - var c counts - var inWord bool - var lineLen int64 - var carry [utf8.UTFMax - 1]byte - var carryN int - - for { - if ctx.Err() != nil { - return c, ctx.Err() - } - n, err := r.Read(buf[carryN:]) - if carryN > 0 { - copy(buf, carry[:carryN]) - n += carryN - carryN = 0 - } - if n > 0 { - chunk := buf[:n] - c.bytes += int64(n) - - // Handle incomplete UTF-8 at end of chunk. - // When tail >= n (e.g., n == 1 with a single invalid byte), the - // condition below is false, so the byte stays in chunk and - // DecodeRune processes it as a replacement character — this is - // correct and matches utf8.DecodeRune semantics. - tail := 0 - if !utf8.Valid(chunk) { - for tail = 1; tail <= 3 && tail < n; tail++ { - if utf8.Valid(chunk[:n-tail]) { - break - } - } - if tail > 0 && tail <= 3 && tail < n { - carryN = copy(carry[:], chunk[n-tail:]) - chunk = chunk[:n-tail] - } else { - tail = 0 - } - } - c.chars += int64(utf8.RuneCount(chunk)) - // carryN bytes are subtracted here and will be re-added via - // n += carryN at the top of the next iteration. - c.bytes -= int64(carryN) - - for i := 0; i < len(chunk); { - r, size := utf8.DecodeRune(chunk[i:]) - i += size - if r == '\n' { - c.lines++ - if lineLen > c.maxLineLen { - c.maxLineLen = lineLen - } - lineLen = 0 - inWord = false - } else if r == '\r' { - lineLen = 0 - inWord = false - } else if r == '\t' { - lineLen = (lineLen/8 + 1) * 8 - inWord = false - } else if r == ' ' || r == '\v' || r == '\f' { - lineLen++ - inWord = false - } else { - if !inWord { - c.words++ - inWord = true - } - lineLen += int64(runeWidth(r)) - } - } - } - if err == io.EOF { - if carryN > 0 { - c.chars += int64(utf8.RuneCount(carry[:carryN])) - c.bytes += int64(carryN) - carryN = 0 - } - break - } - if err != nil { - return c, err - } - } - if lineLen > c.maxLineLen { - c.maxLineLen = lineLen - } - return c, nil -} - -func fieldWidth(total counts, opts options) int { - max := int64(0) - if opts.showLines && total.lines > max { - max = total.lines - } - if opts.showWords && total.words > max { - max = total.words - } - if opts.showChars && total.chars > max { - max = total.chars - } - if opts.showBytes && total.bytes > max { - max = total.bytes - } - if opts.showMaxLineLen && total.maxLineLen > max { - max = total.maxLineLen - } - w := len(strconv.FormatInt(max, 10)) - return w -} - -// runeWidth returns the display width of a rune following wcwidth(3) rules: -// 0 for controls, combining marks, and format chars; 2 for East Asian -// Wide/Fullwidth; 1 for everything else. -func runeWidth(r rune) int { - if unicode.Is(unicode.Cc, r) { - return 0 - } - if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Me, r) || unicode.Is(unicode.Cf, r) { - return 0 - } - // Hangul Jamo medial vowels and final consonants (zero-width in syllable composition). - if r >= 0x1160 && r <= 0x11FF { - return 0 - } - if unicode.Is(eastAsianWide, r) { - return 2 - } - return 1 -} - -// eastAsianWide is a RangeTable covering East Asian Wide and Fullwidth -// codepoints per UAX #11, matching the ranges used by wcwidth(3). -var eastAsianWide = &unicode.RangeTable{ - R16: []unicode.Range16{ - {0x1100, 0x115F, 1}, // Hangul Jamo initials - {0x2329, 0x232A, 1}, // CJK angle brackets - {0x2E80, 0x303E, 1}, // CJK Radicals Supplement .. CJK Symbols - {0x3040, 0x33BF, 1}, // Hiragana .. CJK Compatibility - {0x33C0, 0x33FF, 1}, // CJK Compatibility (cont.) - {0x3400, 0x4DBF, 1}, // CJK Unified Ideographs Extension A - {0x4E00, 0xA4CF, 1}, // CJK Unified Ideographs .. Yi - {0xAC00, 0xD7A3, 1}, // Hangul Syllables - {0xF900, 0xFAFF, 1}, // CJK Compatibility Ideographs - {0xFE10, 0xFE19, 1}, // Vertical Forms - {0xFE30, 0xFE6F, 1}, // CJK Compatibility Forms + Small Form Variants - {0xFF01, 0xFF60, 1}, // Fullwidth Forms - {0xFFE0, 0xFFE6, 1}, // Fullwidth Signs - }, - R32: []unicode.Range32{ - {0x1F300, 0x1F64F, 1}, // Misc Symbols/Pictographs + Emoticons - {0x1F900, 0x1F9FF, 1}, // Supplemental Symbols and Pictographs - {0x20000, 0x2FFFD, 1}, // CJK Extension B..F - {0x30000, 0x3FFFD, 1}, // CJK Extension G+ - }, -} - -func printCounts(callCtx *builtins.CallContext, c counts, opts options, width int, name string) { - first := true - printField := func(val int64) { - if first { - callCtx.Outf("%*d", width, val) - first = false - } else { - callCtx.Outf(" %*d", width, val) - } - } - if opts.showLines { - printField(c.lines) - } - if opts.showWords { - printField(c.words) - } - if opts.showChars { - printField(c.chars) - } - if opts.showBytes { - printField(c.bytes) - } - if opts.showMaxLineLen { - printField(c.maxLineLen) - } - if name != "" { - callCtx.Outf(" %s", name) - } - callCtx.Out("\n") -} diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go deleted file mode 100644 index 90966364..00000000 --- a/interp/builtins/wc/wc_gnu_compat_test.go +++ /dev/null @@ -1,172 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026-present Datadog, Inc. - -package wc_test - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -// TestGNUCompatDefaultEmpty — no flags on empty input. -// -// GNU command: printf ” | gwc -// Expected: " 0 0 0\n" -func TestGNUCompatDefaultEmpty(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "empty.txt", "") - stdout, _, code := cmdRun(t, "wc empty.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 0 0 empty.txt\n", stdout) -} - -// TestGNUCompatDefaultBasic — default counts on "a b\nc\n". -// -// GNU command: printf 'a b\nc\n' | gwc -// Expected: " 2 3 6\n" -func TestGNUCompatDefaultBasic(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a b\nc\n") - stdout, _, code := cmdRun(t, "wc file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 3 6 file.txt\n", stdout) -} - -// TestGNUCompatLinesCount — -l on input with 2 newlines. -// -// GNU command: printf 'x\ny\n' | gwc -l -// Expected: "2\n" -func TestGNUCompatLinesCount(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x\ny\n") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 file.txt\n", stdout) -} - -// TestGNUCompatLinesNoNewline — -l on input with no newline. -// -// GNU command: printf 'x y' | gwc -l -// Expected: "0\n" -func TestGNUCompatLinesNoNewline(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x y") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -// TestGNUCompatWordsEmpty — -w on empty. -// -// GNU command: printf ” | gwc -w -// Expected: "0\n" -func TestGNUCompatWordsEmpty(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -// TestGNUCompatWordsMulti — -w on "x y\nz". -// -// GNU command: printf 'x y\nz' | gwc -w -// Expected: "3\n" -func TestGNUCompatWordsMulti(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x y\nz") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "3 file.txt\n", stdout) -} - -// TestGNUCompatBytesCount — -c on "x". -// -// GNU command: printf 'x' | gwc -c -// Expected: "1\n" -func TestGNUCompatBytesCount(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x") - stdout, _, code := cmdRun(t, "wc -c file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) -} - -// TestGNUCompatMaxLineLen — -L on "1\n12\n". -// -// GNU command: printf '1\n12\n' | gwc -L -// Expected: "2\n" -func TestGNUCompatMaxLineLen(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "1\n12\n") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 file.txt\n", stdout) -} - -// TestGNUCompatMaxLineLenLastLine — -L on "\n123456" (no trailing newline). -// -// GNU command: printf '\n123456' | gwc -L -// Expected: "6\n" -func TestGNUCompatMaxLineLenLastLine(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "\n123456") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "6 file.txt\n", stdout) -} - -// TestGNUCompatMultipleFiles — two files with total line. -// -// GNU command: gwc a.txt b.txt -// a.txt = "hello\n" (1 line, 1 word, 6 bytes) -// b.txt = "world foo\n" (1 line, 2 words, 10 bytes) -// Expected: -// -// " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n" -func TestGNUCompatMultipleFiles(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "a.txt", "hello\n") - writeFile(t, dir, "b.txt", "world foo\n") - stdout, _, code := cmdRun(t, "wc a.txt b.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n", stdout) -} - -// TestGNUCompatCharsMultibyte — -m on "café\n". -// -// GNU command: printf 'café\n' | gwc -m -// Expected: "5\n" (5 chars: c, a, f, é, \n) -func TestGNUCompatCharsMultibyte(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "café\n") - stdout, _, code := cmdRun(t, "wc -m file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "5 file.txt\n", stdout) -} - -// TestGNUCompatControlCharIsWord — control byte \x01 counts as a word. -// -// GNU command: printf '\x01\n' | gwc -w -// Expected: "1\n" -func TestGNUCompatControlCharIsWord(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "\x01\n") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) -} - -// TestGNUCompatRejectedFlag — unknown flag exits 1. -// -// GNU command: gwc --follow -// Expected: exit 1, stderr contains "wc:" -func TestGNUCompatRejectedFlag(t *testing.T) { - dir := t.TempDir() - _, stderr, code := cmdRun(t, "wc --follow", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go deleted file mode 100644 index dd2e3d20..00000000 --- a/interp/builtins/wc/wc_test.go +++ /dev/null @@ -1,453 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026-present Datadog, Inc. - -package wc_test - -import ( - "context" - "os" - "path/filepath" - "strings" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/DataDog/rshell/interp" - "github.com/DataDog/rshell/interp/builtins/testutil" -) - -func runScript(t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { - t.Helper() - return testutil.RunScript(t, script, dir, opts...) -} - -func runScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { - t.Helper() - return testutil.RunScriptCtx(ctx, t, script, dir, opts...) -} - -func cmdRun(t *testing.T, script, dir string) (string, string, int) { - t.Helper() - return runScript(t, script, dir, interp.AllowedPaths([]string{dir})) -} - -func writeFile(t *testing.T, dir, name, content string) string { - t.Helper() - require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) - return name -} - -// --- Default mode (lines, words, bytes) --- - -func TestWcDefaultEmptyStdin(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "empty.txt", "") - stdout, _, code := cmdRun(t, "wc empty.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 0 0 empty.txt\n", stdout) -} - -func TestWcDefaultBasic(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a b\nc\n") - stdout, _, code := cmdRun(t, "wc file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 3 6 file.txt\n", stdout) -} - -func TestWcDefaultNoTrailingNewline(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "hello world") - stdout, _, code := cmdRun(t, "wc file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 0 2 11 file.txt\n", stdout) -} - -// --- Lines --- - -func TestWcLinesEmpty(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -func TestWcLinesNoNewline(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x y") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -func TestWcLinesOneNewline(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x y\n") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) -} - -func TestWcLinesTwoNewlines(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x\ny\n") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 file.txt\n", stdout) -} - -func TestWcLinesLongForm(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a\nb\nc\n") - stdout, _, code := cmdRun(t, "wc --lines file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "3 file.txt\n", stdout) -} - -// --- Words --- - -func TestWcWordsEmpty(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -func TestWcWordsSingle(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) -} - -func TestWcWordsMultiple(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x y\nz") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "3 file.txt\n", stdout) -} - -func TestWcWordsControlChar(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "\x01\n") - stdout, _, code := cmdRun(t, "wc -w file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) -} - -// --- Bytes --- - -func TestWcBytesEmpty(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "") - stdout, _, code := cmdRun(t, "wc -c file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -func TestWcBytesSingle(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "x") - stdout, _, code := cmdRun(t, "wc -c file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) -} - -func TestWcBytesMulti(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "hello\n") - stdout, _, code := cmdRun(t, "wc -c file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "6 file.txt\n", stdout) -} - -// --- Chars --- - -func TestWcCharsASCII(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "hello\n") - stdout, _, code := cmdRun(t, "wc -m file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "6 file.txt\n", stdout) -} - -func TestWcCharsMultibyte(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "café\n") - stdout, _, code := cmdRun(t, "wc -m file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "5 file.txt\n", stdout) -} - -func TestWcBytesMultibyte(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "café\n") - stdout, _, code := cmdRun(t, "wc -c file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "6 file.txt\n", stdout) -} - -func TestWcCharsAndBytes(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "café\n") - stdout, _, code := cmdRun(t, "wc -cm file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "5 6 file.txt\n", stdout) -} - -// --- Max line length --- - -func TestWcMaxLineLenBasic(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "1\n12\n") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 file.txt\n", stdout) -} - -func TestWcMaxLineLenThreeLines(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "1\n123\n1\n") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "3 file.txt\n", stdout) -} - -func TestWcMaxLineLenNoTrailingNewline(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "\n123456") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "6 file.txt\n", stdout) -} - -func TestWcMaxLineLenEmpty(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "0 file.txt\n", stdout) -} - -// --- Multiple files --- - -func TestWcMultipleFiles(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "a.txt", "hello\n") - writeFile(t, dir, "b.txt", "world foo\n") - stdout, _, code := cmdRun(t, "wc a.txt b.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n", stdout) -} - -func TestWcMultipleFilesPartialFailure(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "a.txt", "hello\n") - stdout, stderr, code := cmdRun(t, "wc a.txt missing.txt", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stdout, "a.txt") - assert.Contains(t, stdout, "total") - assert.Contains(t, stderr, "wc:") -} - -// --- Stdin --- - -func TestWcStdinImplicit(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a b\nc\n") - stdout, _, code := cmdRun(t, "cat file.txt | wc", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 2 3 6\n", stdout) -} - -func TestWcStdinDash(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a b\nc\n") - stdout, _, code := cmdRun(t, "cat file.txt | wc -", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 2 3 6 -\n", stdout) -} - -func TestWcNilStdin(t *testing.T) { - dir := t.TempDir() - stdout, _, code := runScript(t, "wc", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 0 0 0\n", stdout) -} - -// --- Help --- - -func TestWcHelp(t *testing.T) { - dir := t.TempDir() - stdout, _, code := cmdRun(t, "wc --help", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "Usage:") -} - -func TestWcHelpShort(t *testing.T) { - dir := t.TempDir() - stdout, _, code := cmdRun(t, "wc -h", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "Usage:") -} - -// --- Error cases --- - -func TestWcMissingFile(t *testing.T) { - dir := t.TempDir() - stdout, stderr, code := cmdRun(t, "wc nonexistent.txt", dir) - assert.Equal(t, 1, code) - assert.Equal(t, "", stdout) - assert.Contains(t, stderr, "wc:") -} - -func TestWcUnknownFlag(t *testing.T) { - dir := t.TempDir() - _, stderr, code := cmdRun(t, "wc --definitely-invalid", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} - -func TestWcFiles0FromRejected(t *testing.T) { - dir := t.TempDir() - _, stderr, code := cmdRun(t, "wc --files0-from=foo", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} - -func TestWcDirectory(t *testing.T) { - dir := t.TempDir() - _, stderr, code := cmdRun(t, "wc .", dir) - assert.Equal(t, 1, code) - assert.Contains(t, stderr, "wc:") -} - -// --- Hardening --- - -func TestWcDoubleDash(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "hello\n") - stdout, _, code := cmdRun(t, "wc -- file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 1 6 file.txt\n", stdout) -} - -func TestWcContextCancellation(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", strings.Repeat("x\n", 100)) - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - _, _, code := runScriptCtx(ctx, t, "wc file.txt", dir, interp.AllowedPaths([]string{dir})) - assert.Equal(t, 0, code) -} - -func TestWcPipeInput(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "alpha\nbeta\ngamma\n") - stdout, _, code := cmdRun(t, "cat file.txt | wc -l", dir) - assert.Equal(t, 0, code) - assert.Equal(t, " 3\n", stdout) -} - -// --- Combined flags --- - -func TestWcAllFlags(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a b\nc\n") - stdout, _, code := cmdRun(t, "wc -lwmcL file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 3 6 6 3 file.txt\n", stdout) -} - -func TestWcLinesAndWords(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a b\nc\n") - stdout, _, code := cmdRun(t, "wc -lw file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 3 file.txt\n", stdout) -} - -// --- Width formatting --- - -func TestWcWidthDeterminedByTotal(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "a.txt", strings.Repeat("word ", 20)+"\n") - writeFile(t, dir, "b.txt", "x\n") - stdout, _, code := cmdRun(t, "wc -w a.txt b.txt", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "total\n") - lines := strings.Split(strings.TrimSpace(stdout), "\n") - assert.Equal(t, 3, len(lines)) -} - -// --- Max line length: tab and CR --- - -func TestWcMaxLineLenTab(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a\tb\n") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "9 file.txt\n", stdout) -} - -func TestWcMaxLineLenCR(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "hello\rworld\n") - stdout, _, code := cmdRun(t, "wc -L file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "5 file.txt\n", stdout) -} - -func TestWcCRLFLineCount(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "a\r\nb\r\n") - stdout, _, code := cmdRun(t, "wc -l file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "2 file.txt\n", stdout) -} - -// --- Binary / non-UTF8 input --- - -func TestWcBinaryInput(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.bin", string([]byte{0x00, 0xFF, 0xFE, 0x0A, 0x41})) - stdout, _, code := cmdRun(t, "wc file.bin", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "file.bin") - assert.Equal(t, 0, code) -} - -// --- Multibyte chars --- - -func TestWcCharsMultibyteEmoji(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "file.txt", "hi 💐\n") - stdout, _, code := cmdRun(t, "wc -m file.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "5 file.txt\n", stdout) -} - -// TestWcChunkBoundaryMultibyte verifies that a multibyte character straddling -// the 32 KiB read-buffer boundary is not double-counted. This requires -// programmatic file generation so it lives as a Go test rather than a scenario. -func TestWcChunkBoundaryMultibyte(t *testing.T) { - dir := t.TempDir() - // 💐 is 4 bytes; placing it at offset 32766 means it spans bytes 32766-32769, - // straddling the 32768-byte chunk boundary and exercising the carry logic. - prefix := strings.Repeat("a", 32*1024-2) - content := prefix + "💐\n" - writeFile(t, dir, "file.txt", content) - stdout, _, code := cmdRun(t, "wc -mL file.txt", dir) - assert.Equal(t, 0, code) - // chars: 32766 'a' + 1 emoji + 1 newline = 32768 - // max line length: 32766 + 2 (emoji display width) = 32768 - assert.Equal(t, "32768 32768 file.txt\n", stdout) -} diff --git a/interp/builtins/wc/wc_unix_test.go b/interp/builtins/wc/wc_unix_test.go deleted file mode 100644 index 7882ae13..00000000 --- a/interp/builtins/wc/wc_unix_test.go +++ /dev/null @@ -1,35 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026-present Datadog, Inc. - -//go:build unix - -package wc_test - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestWcSymlinkToFile(t *testing.T) { - dir := t.TempDir() - writeFile(t, dir, "target.txt", "hello\n") - require.NoError(t, os.Symlink("target.txt", filepath.Join(dir, "link.txt"))) - stdout, _, code := cmdRun(t, "wc link.txt", dir) - assert.Equal(t, 0, code) - assert.Equal(t, "1 1 6 link.txt\n", stdout) -} - -func TestWcDanglingSymlink(t *testing.T) { - dir := t.TempDir() - require.NoError(t, os.Symlink("nonexistent", filepath.Join(dir, "dangle.txt"))) - stdout, stderr, code := cmdRun(t, "wc dangle.txt", dir) - assert.Equal(t, 1, code) - assert.Equal(t, "", stdout) - assert.Contains(t, stderr, "wc:") -} diff --git a/interp/register_builtins.go b/interp/register_builtins.go index 03608922..6859c8b8 100644 --- a/interp/register_builtins.go +++ b/interp/register_builtins.go @@ -27,7 +27,6 @@ import ( "github.com/DataDog/rshell/interp/builtins/tr" truecmd "github.com/DataDog/rshell/interp/builtins/true" "github.com/DataDog/rshell/interp/builtins/uniq" - "github.com/DataDog/rshell/interp/builtins/wc" ) var registerOnce sync.Once @@ -54,7 +53,6 @@ func registerBuiltins() { tr.Cmd, truecmd.Cmd, uniq.Cmd, - wc.Cmd, } { cmd.Register() } diff --git a/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml b/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml deleted file mode 100644 index 9c687821..00000000 --- a/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml +++ /dev/null @@ -1,22 +0,0 @@ -description: ls piped to wc -l counts the number of entries. -skip_assert_against_bash: true -setup: - files: - - path: a.txt - content: "a" - chmod: 0644 - - path: b.txt - content: "b" - chmod: 0644 - - path: c.txt - content: "c" - chmod: 0644 -input: - allowed_paths: ["$DIR"] - script: |+ - ls | wc -l -expect: - stdout: |2+ - 3 - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml b/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml deleted file mode 100644 index f1a19458..00000000 --- a/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a0 -description: wc -c on empty file outputs 0. -setup: - files: - - path: empty.txt - content: "" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -c empty.txt -expect: - stdout: "0 empty.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/bytes/single_byte.yaml b/tests/scenarios/cmd/wc/bytes/single_byte.yaml deleted file mode 100644 index a4f69956..00000000 --- a/tests/scenarios/cmd/wc/bytes/single_byte.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a3 -description: wc -c on single byte input outputs 1. -setup: - files: - - path: file.txt - content: "x" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -c file.txt -expect: - stdout: "1 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/chars/basic.yaml b/tests/scenarios/cmd/wc/chars/basic.yaml deleted file mode 100644 index 25aaceb6..00000000 --- a/tests/scenarios/cmd/wc/chars/basic.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from standard POSIX wc -m behavior -description: wc -m counts characters (bytes for ASCII). -setup: - files: - - path: file.txt - content: "hello\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -m file.txt -expect: - stdout: "6 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/basic.yaml b/tests/scenarios/cmd/wc/default/basic.yaml deleted file mode 100644 index a52874db..00000000 --- a/tests/scenarios/cmd/wc/default/basic.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test b1 -description: wc default counts lines, words, bytes. -setup: - files: - - path: file.txt - content: "a b\nc\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc file.txt -expect: - stdout: "2 3 6 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/empty_file.yaml b/tests/scenarios/cmd/wc/default/empty_file.yaml deleted file mode 100644 index 5b00343c..00000000 --- a/tests/scenarios/cmd/wc/default/empty_file.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from uutils test_file_empty -description: wc on an empty file shows all zeros. -setup: - files: - - path: empty.txt - content: "" -input: - allowed_paths: ["$DIR"] - script: |+ - wc empty.txt -expect: - stdout: "0 0 0 empty.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/empty_stdin.yaml b/tests/scenarios/cmd/wc/default/empty_stdin.yaml deleted file mode 100644 index 5249634e..00000000 --- a/tests/scenarios/cmd/wc/default/empty_stdin.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test b0 -description: wc with no args and empty stdin outputs all zeros. -setup: - files: - - path: empty.txt - content: "" -input: - allowed_paths: ["$DIR"] - script: |+ - wc empty.txt -expect: - stdout: "0 0 0 empty.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/single_file.yaml b/tests/scenarios/cmd/wc/default/single_file.yaml deleted file mode 100644 index f6c1f873..00000000 --- a/tests/scenarios/cmd/wc/default/single_file.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from uutils test_single_default -description: wc with a single file shows lines words bytes and filename. -setup: - files: - - path: file.txt - content: "alpha\nbeta\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc file.txt -expect: - stdout: " 2 2 11 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml deleted file mode 100644 index d74a82d9..00000000 --- a/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# Derived from GTFOBins safety requirement -description: wc rejects --files0-from flag (security risk). -input: - allowed_paths: ["$DIR"] - script: |+ - wc --files0-from=foo -expect: - stdout: "" - stderr_contains: ["wc:"] - exit_code: 1 diff --git a/tests/scenarios/cmd/wc/errors/missing_file.yaml b/tests/scenarios/cmd/wc/errors/missing_file.yaml deleted file mode 100644 index 5cc5aeb4..00000000 --- a/tests/scenarios/cmd/wc/errors/missing_file.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# Derived from uutils test_read_from_nonexistent_file -description: wc exits 1 and prints error for nonexistent file. -input: - allowed_paths: ["$DIR"] - script: |+ - wc bogusfile -expect: - stdout: "" - stderr_contains: ["wc: bogusfile:"] - exit_code: 1 diff --git a/tests/scenarios/cmd/wc/errors/unknown_flag.yaml b/tests/scenarios/cmd/wc/errors/unknown_flag.yaml deleted file mode 100644 index f14f0ba4..00000000 --- a/tests/scenarios/cmd/wc/errors/unknown_flag.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# Derived from uutils test_invalid_arg -description: wc rejects unknown flags with exit code 1. -input: - allowed_paths: ["$DIR"] - script: |+ - wc --definitely-invalid -expect: - stdout: "" - stderr_contains: ["wc:"] - exit_code: 1 diff --git a/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml b/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml deleted file mode 100644 index 50b91f43..00000000 --- a/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from standard POSIX -- convention -description: wc accepts -- to end flag parsing. -setup: - files: - - path: file.txt - content: "hello\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -- file.txt -expect: - stdout: "1 1 6 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/empty_stdin.yaml b/tests/scenarios/cmd/wc/lines/empty_stdin.yaml deleted file mode 100644 index 33775902..00000000 --- a/tests/scenarios/cmd/wc/lines/empty_stdin.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a1 -description: wc -l on empty file outputs 0. -setup: - files: - - path: empty.txt - content: "" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -l empty.txt -expect: - stdout: "0 empty.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml b/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml deleted file mode 100644 index ecb45502..00000000 --- a/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a7 -description: wc -l counts newline bytes; text with no newline counts as 0 lines. -setup: - files: - - path: file.txt - content: "x y" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -l file.txt -expect: - stdout: "0 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/one_newline.yaml b/tests/scenarios/cmd/wc/lines/one_newline.yaml deleted file mode 100644 index 817ea07f..00000000 --- a/tests/scenarios/cmd/wc/lines/one_newline.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a8 -description: wc -l counts 1 for a single newline-terminated line. -setup: - files: - - path: file.txt - content: "x y\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -l file.txt -expect: - stdout: "1 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/two_newlines.yaml b/tests/scenarios/cmd/wc/lines/two_newlines.yaml deleted file mode 100644 index e9109634..00000000 --- a/tests/scenarios/cmd/wc/lines/two_newlines.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a9 -description: wc -l counts 2 for two newline-terminated lines. -setup: - files: - - path: file.txt - content: "x\ny\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -l file.txt -expect: - stdout: "2 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/basic.yaml b/tests/scenarios/cmd/wc/max_line_length/basic.yaml deleted file mode 100644 index e7461fde..00000000 --- a/tests/scenarios/cmd/wc/max_line_length/basic.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test c0 -description: wc -L reports the length of the longest line. -setup: - files: - - path: file.txt - content: "1\n12\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -L file.txt -expect: - stdout: "2 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml b/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml deleted file mode 100644 index b8b50009..00000000 --- a/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml +++ /dev/null @@ -1,14 +0,0 @@ -description: wc -L counts display columns, CJK characters are width 2. -skip_assert_against_bash: true # display width depends on locale; we always use Unicode width -setup: - files: - - path: file.txt - content: "你好\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -L file.txt -expect: - stdout: "4 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml b/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml deleted file mode 100644 index 88329252..00000000 --- a/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml +++ /dev/null @@ -1,14 +0,0 @@ -description: wc -L counts display columns, emoji characters are width 2. -skip_assert_against_bash: true # display width depends on locale; we always use Unicode width -setup: - files: - - path: file.txt - content: "ab💐\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -L file.txt -expect: - stdout: "4 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml b/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml deleted file mode 100644 index c417d540..00000000 --- a/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test c2 -description: wc -L counts a final line with no trailing newline. -setup: - files: - - path: file.txt - content: "\n123456" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -L file.txt -expect: - stdout: "6 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml b/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml deleted file mode 100644 index d70b6a20..00000000 --- a/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test c1 -description: wc -L picks the max among multiple lines. -setup: - files: - - path: file.txt - content: "1\n123\n1\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -L file.txt -expect: - stdout: "3 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/multiple_files/total_line.yaml b/tests/scenarios/cmd/wc/multiple_files/total_line.yaml deleted file mode 100644 index b374e75f..00000000 --- a/tests/scenarios/cmd/wc/multiple_files/total_line.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Derived from GNU coreutils wc-total.sh -description: wc prints a total line when given multiple files. -setup: - files: - - path: a.txt - content: "hello\n" - - path: b.txt - content: "world foo\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc a.txt b.txt -expect: - stdout: " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml b/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml deleted file mode 100644 index 1804c0ad..00000000 --- a/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from uutils test_stdin_explicit -description: wc with explicit - reads stdin and shows filename -. -setup: - files: - - path: file.txt - content: "a b\nc\n" -input: - allowed_paths: ["$DIR"] - script: |+ - cat file.txt | wc - -expect: - stdout: " 2 3 6 -\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/implicit.yaml b/tests/scenarios/cmd/wc/stdin/implicit.yaml deleted file mode 100644 index ed40861c..00000000 --- a/tests/scenarios/cmd/wc/stdin/implicit.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from standard POSIX wc behavior -description: wc reads stdin implicitly when no files are given. -setup: - files: - - path: file.txt - content: "a b\nc\n" -input: - allowed_paths: ["$DIR"] - script: |+ - cat file.txt | wc -expect: - stdout: " 2 3 6\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/across_lines.yaml b/tests/scenarios/cmd/wc/words/across_lines.yaml deleted file mode 100644 index b57e7ff8..00000000 --- a/tests/scenarios/cmd/wc/words/across_lines.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a6 -description: wc -w counts words across lines including a line without trailing newline. -setup: - files: - - path: file.txt - content: "x y\nz" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -w file.txt -expect: - stdout: "3 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/empty_stdin.yaml b/tests/scenarios/cmd/wc/words/empty_stdin.yaml deleted file mode 100644 index 2f77334b..00000000 --- a/tests/scenarios/cmd/wc/words/empty_stdin.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a2 -description: wc -w on empty file outputs 0. -setup: - files: - - path: empty.txt - content: "" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -w empty.txt -expect: - stdout: "0 empty.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/single_word.yaml b/tests/scenarios/cmd/wc/words/single_word.yaml deleted file mode 100644 index c5fb92da..00000000 --- a/tests/scenarios/cmd/wc/words/single_word.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a4 -description: wc -w counts 1 word for a single non-whitespace token. -setup: - files: - - path: file.txt - content: "x" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -w file.txt -expect: - stdout: "1 file.txt\n" - stderr: "" - exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/two_words.yaml b/tests/scenarios/cmd/wc/words/two_words.yaml deleted file mode 100644 index 9981335d..00000000 --- a/tests/scenarios/cmd/wc/words/two_words.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Derived from GNU coreutils wc.pl test a5 -description: wc -w counts 2 words on a line with two tokens. -setup: - files: - - path: file.txt - content: "x y\n" -input: - allowed_paths: ["$DIR"] - script: |+ - wc -w file.txt -expect: - stdout: "2 file.txt\n" - stderr: "" - exit_code: 0 From 63e05a773e6f0a7b05f266aafeb56e02cea6fd77 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 20:16:42 +0100 Subject: [PATCH 02/28] Revert "remove wc" This reverts commit d35e98403a9010419bfcfe21898a22ad509e1014. --- README.md | 2 +- SHELL_FEATURES.md | 1 + interp/builtin_wc_pentest_test.go | 237 +++++++++ interp/builtins/wc/builtin_wc_pentest_test.go | 64 +++ interp/builtins/wc/wc.go | 407 ++++++++++++++++ interp/builtins/wc/wc_gnu_compat_test.go | 172 +++++++ interp/builtins/wc/wc_test.go | 453 ++++++++++++++++++ interp/builtins/wc/wc_unix_test.go | 35 ++ interp/register_builtins.go | 2 + tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml | 22 + tests/scenarios/cmd/wc/bytes/empty_stdin.yaml | 14 + tests/scenarios/cmd/wc/bytes/single_byte.yaml | 14 + tests/scenarios/cmd/wc/chars/basic.yaml | 14 + tests/scenarios/cmd/wc/default/basic.yaml | 14 + .../scenarios/cmd/wc/default/empty_file.yaml | 14 + .../scenarios/cmd/wc/default/empty_stdin.yaml | 14 + .../scenarios/cmd/wc/default/single_file.yaml | 14 + .../cmd/wc/errors/files0_from_rejected.yaml | 10 + .../scenarios/cmd/wc/errors/missing_file.yaml | 10 + .../scenarios/cmd/wc/errors/unknown_flag.yaml | 10 + .../wc/hardening/double_dash_separator.yaml | 14 + tests/scenarios/cmd/wc/lines/empty_stdin.yaml | 14 + .../cmd/wc/lines/no_trailing_newline.yaml | 14 + tests/scenarios/cmd/wc/lines/one_newline.yaml | 14 + .../scenarios/cmd/wc/lines/two_newlines.yaml | 14 + .../cmd/wc/max_line_length/basic.yaml | 14 + .../cmd/wc/max_line_length/fullwidth_cjk.yaml | 14 + .../wc/max_line_length/fullwidth_emoji.yaml | 14 + .../max_line_length/no_trailing_newline.yaml | 14 + .../cmd/wc/max_line_length/three_lines.yaml | 14 + .../cmd/wc/multiple_files/total_line.yaml | 16 + .../scenarios/cmd/wc/stdin/dash_explicit.yaml | 14 + tests/scenarios/cmd/wc/stdin/implicit.yaml | 14 + .../scenarios/cmd/wc/words/across_lines.yaml | 14 + tests/scenarios/cmd/wc/words/empty_stdin.yaml | 14 + tests/scenarios/cmd/wc/words/single_word.yaml | 14 + tests/scenarios/cmd/wc/words/two_words.yaml | 14 + 37 files changed, 1762 insertions(+), 1 deletion(-) create mode 100644 interp/builtin_wc_pentest_test.go create mode 100644 interp/builtins/wc/builtin_wc_pentest_test.go create mode 100644 interp/builtins/wc/wc.go create mode 100644 interp/builtins/wc/wc_gnu_compat_test.go create mode 100644 interp/builtins/wc/wc_test.go create mode 100644 interp/builtins/wc/wc_unix_test.go create mode 100644 tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml create mode 100644 tests/scenarios/cmd/wc/bytes/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/bytes/single_byte.yaml create mode 100644 tests/scenarios/cmd/wc/chars/basic.yaml create mode 100644 tests/scenarios/cmd/wc/default/basic.yaml create mode 100644 tests/scenarios/cmd/wc/default/empty_file.yaml create mode 100644 tests/scenarios/cmd/wc/default/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/default/single_file.yaml create mode 100644 tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml create mode 100644 tests/scenarios/cmd/wc/errors/missing_file.yaml create mode 100644 tests/scenarios/cmd/wc/errors/unknown_flag.yaml create mode 100644 tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml create mode 100644 tests/scenarios/cmd/wc/lines/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml create mode 100644 tests/scenarios/cmd/wc/lines/one_newline.yaml create mode 100644 tests/scenarios/cmd/wc/lines/two_newlines.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/basic.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/three_lines.yaml create mode 100644 tests/scenarios/cmd/wc/multiple_files/total_line.yaml create mode 100644 tests/scenarios/cmd/wc/stdin/dash_explicit.yaml create mode 100644 tests/scenarios/cmd/wc/stdin/implicit.yaml create mode 100644 tests/scenarios/cmd/wc/words/across_lines.yaml create mode 100644 tests/scenarios/cmd/wc/words/empty_stdin.yaml create mode 100644 tests/scenarios/cmd/wc/words/single_word.yaml create mode 100644 tests/scenarios/cmd/wc/words/two_words.yaml diff --git a/README.md b/README.md index 0b9ede76..cc2df95a 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ Linux, macOS, and Windows. ``` tests/scenarios/ -├── cmd/ # builtin command tests (echo, cat, grep, head, tail, test, uniq, ...) +├── cmd/ # builtin command tests (echo, cat, grep, head, tail, test, uniq, wc, ...) └── shell/ # shell feature tests (pipes, variables, control flow, ...) ``` diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index 0a9f8200..9a42affe 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -23,6 +23,7 @@ Blocked features are rejected before execution with exit code 2. - ✅ `tr [-cdsCt] SET1 [SET2]` — translate, squeeze, and/or delete characters from stdin - ✅ `true` — return exit code 0 - ✅ `uniq [OPTION]... [INPUT]` — report or omit repeated lines +- ✅ `wc [-l] [-w] [-c] [-m] [FILE]...` — count lines, words, bytes, or characters in files - ❌ All other commands — return exit code 127 with `: not found` unless an ExecHandler is configured ## Variables diff --git a/interp/builtin_wc_pentest_test.go b/interp/builtin_wc_pentest_test.go new file mode 100644 index 00000000..a5c88636 --- /dev/null +++ b/interp/builtin_wc_pentest_test.go @@ -0,0 +1,237 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package interp_test + +import ( + "bytes" + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "mvdan.cc/sh/v3/syntax" + + "github.com/DataDog/rshell/interp" +) + +func wcRun(t *testing.T, script, dir string) (string, string, int) { + t.Helper() + return wcRunCtx(context.Background(), t, script, dir) +} + +func wcRunCtx(ctx context.Context, t *testing.T, script, dir string) (string, string, int) { + t.Helper() + parser := syntax.NewParser() + prog, err := parser.Parse(strings.NewReader(script), "") + require.NoError(t, err) + + var outBuf, errBuf bytes.Buffer + opts := []interp.RunnerOption{ + interp.StdIO(nil, &outBuf, &errBuf), + interp.AllowedPaths([]string{dir}), + } + + runner, err := interp.New(opts...) + require.NoError(t, err) + defer runner.Close() + + if dir != "" { + runner.Dir = dir + } + + err = runner.Run(ctx, prog) + exitCode := 0 + if err != nil { + var es interp.ExitStatus + if errors.As(err, &es) { + exitCode = int(es) + } else if ctx.Err() == nil { + t.Fatalf("unexpected error: %v", err) + } + } + return outBuf.String(), errBuf.String(), exitCode +} + +func wcWriteFile(t *testing.T, dir, name, content string) { + t.Helper() + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) +} + +// --- Flag and argument injection --- + +func TestWcPentestUnknownFlags(t *testing.T) { + dir := t.TempDir() + for _, flag := range []string{"-f", "--follow", "--no-such-flag", "--files0-from=foo"} { + _, stderr, code := wcRun(t, "wc "+flag, dir) + assert.Equal(t, 1, code, "flag: %s", flag) + assert.Contains(t, stderr, "wc:", "flag: %s", flag) + } +} + +func TestWcPentestDoubleDashFlagLikeFile(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "-v", "hello\n") + stdout, _, code := wcRun(t, "wc -- -v", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "-v") +} + +func TestWcPentestMultipleStdin(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", "hello\n") + stdout, _, code := wcRun(t, "cat file.txt | wc - -", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "total") +} + +// --- Path edge cases --- + +func TestWcPentestNonexistentFile(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := wcRun(t, "wc nonexistent.txt", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} + +func TestWcPentestEmptyFilename(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := wcRun(t, "wc ''", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} + +// --- Special files --- + +func TestWcPentestDevNull(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "empty.txt", "") + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc empty.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "0") +} + +// --- Context cancellation --- + +func TestWcPentestContextCancelled(t *testing.T) { + dir := t.TempDir() + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, _, _ = wcRunCtx(ctx, t, "wc", dir) +} + +func TestWcPentestContextTimeout(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", strings.Repeat("hello\n", 10000)) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "10000") +} + +// --- Large input --- + +func TestWcPentestLargeFile(t *testing.T) { + dir := t.TempDir() + content := strings.Repeat("word word word word word\n", 40000) + wcWriteFile(t, dir, "large.txt", content) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc -l large.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "40000") +} + +// --- Many files (FD leak check) --- + +func TestWcPentestManyFiles(t *testing.T) { + dir := t.TempDir() + var args []string + for i := 0; i < 50; i++ { + name := filepath.Join(dir, strings.ReplaceAll(filepath.Base(t.Name()), "/", "_")+"_"+string(rune('a'+i%26))+string(rune('0'+i/26))+".txt") + require.NoError(t, os.WriteFile(name, []byte("x\n"), 0644)) + args = append(args, filepath.Base(name)) + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc "+strings.Join(args, " "), dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "total") +} + +// --- Edge case: file with only newlines --- + +func TestWcPentestOnlyNewlines(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", strings.Repeat("\n", 100)) + stdout, _, code := wcRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "100") + assert.Contains(t, stdout, " 0") +} + +// --- Edge case: long line --- + +func TestWcPentestLongLine(t *testing.T) { + dir := t.TempDir() + longLine := strings.Repeat("x", 1024*1024) + "\n" + wcWriteFile(t, dir, "file.txt", longLine) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + stdout, _, code := wcRunCtx(ctx, t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "1048576") +} + +// --- Invalid UTF-8 at chunk boundary --- + +func TestWcPentestInvalidUTF8AtChunkBoundary(t *testing.T) { + dir := t.TempDir() + // Build content so that invalid UTF-8 bytes (0xC0 0x80) land at the + // exact 32 KiB read boundary. This exercises the carry buffer edge + // case where invalid bytes must be handled in-place (not carried). + const chunkSize = 32 * 1024 + padding := strings.Repeat("A", chunkSize-1) // fills up to byte 32767 + // Place 0xC0 at offset 32767 (last byte of first chunk) and 0x80 at + // offset 32768 (first byte of second chunk). + content := []byte(padding) + content = append(content, 0xC0, 0x80) + content = append(content, '\n') + + require.NoError(t, os.WriteFile(filepath.Join(dir, "invalid_utf8.txt"), content, 0644)) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // -c should report exact byte count + stdout, _, code := wcRunCtx(ctx, t, "wc -c invalid_utf8.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "32770") // chunkSize - 1 + 2 invalid bytes + newline + + // -l should count the newline + stdout, _, code = wcRunCtx(ctx, t, "wc -l invalid_utf8.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "1") +} + +// --- Flag expansion in loop --- + +func TestWcPentestFlagExpansion(t *testing.T) { + dir := t.TempDir() + wcWriteFile(t, dir, "file.txt", "hello\n") + _, stderr, code := wcRun(t, "for flag in --follow; do wc $flag file.txt; done", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/builtins/wc/builtin_wc_pentest_test.go b/interp/builtins/wc/builtin_wc_pentest_test.go new file mode 100644 index 00000000..f7a0e53e --- /dev/null +++ b/interp/builtins/wc/builtin_wc_pentest_test.go @@ -0,0 +1,64 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +// Exploratory pentest for the wc builtin — GTFOBins validation. +// +// GTFOBins documents wc as capable of reading file contents via +// "wc --files0-from /path/to/file". This flag is intentionally not +// implemented in rshell, so pflag rejects it as an unknown flag. +// See: https://gtfobins.org/gtfobins/wc/ + +package wc_test + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/DataDog/rshell/interp" +) + +// TestWcGTFOBinsFiles0FromRejected verifies that the GTFOBins file-read +// technique "wc --files0-from" is blocked because the flag is not implemented. +// +// GTFOBins: https://gtfobins.org/gtfobins/wc/ +// Technique: wc --files0-from /path/to/input-file +func TestWcGTFOBinsFiles0FromRejected(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "target.txt", "secret data\n") + _, stderr, code := cmdRun(t, "wc --files0-from target.txt", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +// TestWcGTFOBinsFileReadSandboxEscape verifies that wc cannot read files +// outside the AllowedPaths sandbox. +// +// GTFOBins: https://gtfobins.org/gtfobins/wc/ +// Technique: wc /path/to/input-file (reads file to count lines/words/bytes) +func TestWcGTFOBinsFileReadSandboxEscape(t *testing.T) { + allowed := t.TempDir() + secret := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(secret, "secret.txt"), []byte("secret data"), 0644)) + secretPath := filepath.ToSlash(filepath.Join(secret, "secret.txt")) + _, stderr, code := runScript(t, "wc "+secretPath, allowed, interp.AllowedPaths([]string{allowed})) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +// TestWcGTFOBinsFileReadTraversal verifies that path traversal cannot +// escape the sandbox when using wc. +// +// GTFOBins: https://gtfobins.org/gtfobins/wc/ +// Technique: wc ../../etc/passwd +func TestWcGTFOBinsFileReadTraversal(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc ../../etc/passwd", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go new file mode 100644 index 00000000..7fbf50ee --- /dev/null +++ b/interp/builtins/wc/wc.go @@ -0,0 +1,407 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +// Package wc implements the wc builtin command. +// +// wc — print newline, word, and byte counts for each file +// +// Usage: wc [OPTION]... [FILE]... +// +// Print newline, word, and byte counts for each FILE, and a total line +// if more than one FILE is specified. A word is a non-zero-length sequence +// of characters delimited by white space. With no FILE, or when FILE is -, +// read standard input. +// +// When no flags are given, -l, -w, and -c are assumed (lines, words, bytes). +// +// Accepted flags: +// +// -l, --lines +// Print the newline count. +// +// -w, --words +// Print the word count. +// +// -c, --bytes +// Print the byte count. +// +// -m, --chars +// Print the character count. In a multibyte locale, the number of +// characters may differ from the number of bytes. +// +// -L, --max-line-length +// Print the length of the longest line. +// +// -h, --help +// Print this usage message to stdout and exit 0. +// +// Output columns always appear in a fixed order: lines, words, chars, +// bytes, max-line-length. Only the requested columns are shown. Column +// widths are right-justified to the width of the largest count across +// all files (including the total line, if any). +// +// Exit codes: +// +// 0 All files processed successfully. +// 1 At least one error occurred (missing file, invalid argument, etc.). +// +// Memory safety: +// +// Input is read in fixed-size chunks (32 KiB). Lines longer than +// MaxLineBytes (1 MiB) are split across chunks for counting purposes +// but never fully buffered. All loops check ctx.Err() at each +// iteration to honour the shell's execution timeout. +package wc + +import ( + "context" + "io" + "os" + "strconv" + "unicode" + "unicode/utf8" + + "github.com/DataDog/rshell/interp/builtins" +) + +// Cmd is the wc builtin command descriptor. +var Cmd = builtins.Command{Name: "wc", MakeFlags: registerFlags} + +const chunkSize = 32 * 1024 // 32 KiB read buffer +const stdinMinWidth = 7 // GNU wc minimum column width for stdin + +type counts struct { + lines int64 + words int64 + chars int64 + bytes int64 + maxLineLen int64 +} + +type options struct { + showLines bool + showWords bool + showBytes bool + showChars bool + showMaxLineLen bool +} + +func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { + help := fs.BoolP("help", "h", false, "print usage and exit") + lines := fs.BoolP("lines", "l", false, "print the newline counts") + words := fs.BoolP("words", "w", false, "print the word counts") + bytesFlag := fs.BoolP("bytes", "c", false, "print the byte counts") + chars := fs.BoolP("chars", "m", false, "print the character counts") + maxLineLen := fs.BoolP("max-line-length", "L", false, "print the maximum display width") + + // Security: --files0-from is intentionally NOT implemented. + // GTFOBins: this flag reads filenames from a file, enabling + // data exfiltration in sandboxed environments. + + return func(ctx context.Context, callCtx *builtins.CallContext, files []string) builtins.Result { + if *help { + callCtx.Out("Usage: wc [OPTION]... [FILE]...\n") + callCtx.Out("Print newline, word, and byte counts for each FILE.\n") + callCtx.Out("With no FILE, or when FILE is -, read standard input.\n\n") + fs.SetOutput(callCtx.Stdout) + fs.PrintDefaults() + return builtins.Result{} + } + + opts := options{ + showLines: *lines, + showWords: *words, + showBytes: *bytesFlag, + showChars: *chars, + showMaxLineLen: *maxLineLen, + } + + if !opts.showLines && !opts.showWords && !opts.showBytes && !opts.showChars && !opts.showMaxLineLen { + opts.showLines = true + opts.showWords = true + opts.showBytes = true + } + + stdinImplicit := len(files) == 0 + if stdinImplicit { + files = []string{"-"} + } + + hasStdin := stdinImplicit + if !hasStdin { + for _, f := range files { + if f == "-" { + hasStdin = true + break + } + } + } + + var total counts + var failed bool + + type fileResult struct { + name string + c counts + } + results := make([]fileResult, 0, len(files)) + + for _, file := range files { + if ctx.Err() != nil { + break + } + c, err := countFile(ctx, callCtx, file) + if err != nil { + name := file + if file == "-" { + name = "standard input" + } + callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err)) + failed = true + if c == (counts{}) { + continue + } + } + results = append(results, fileResult{name: file, c: c}) + total.lines += c.lines + total.words += c.words + total.chars += c.chars + total.bytes += c.bytes + if c.maxLineLen > total.maxLineLen { + total.maxLineLen = c.maxLineLen + } + } + + width := fieldWidth(total, opts) + if hasStdin && width < stdinMinWidth { + width = stdinMinWidth + } + + for _, fr := range results { + name := fr.name + if name == "-" && stdinImplicit { + name = "" + } + printCounts(callCtx, fr.c, opts, width, name) + } + + if len(files) > 1 { + printCounts(callCtx, total, opts, width, "total") + } + + if failed { + return builtins.Result{Code: 1} + } + return builtins.Result{} + } +} + +func countFile(ctx context.Context, callCtx *builtins.CallContext, path string) (counts, error) { + var rc io.ReadCloser + if path == "-" { + if callCtx.Stdin == nil { + return counts{}, nil + } + rc = io.NopCloser(callCtx.Stdin) + } else { + f, err := callCtx.OpenFile(ctx, path, os.O_RDONLY, 0) + if err != nil { + return counts{}, err + } + rc = f + } + defer rc.Close() + return countReader(ctx, rc) +} + +func countReader(ctx context.Context, r io.Reader) (counts, error) { + buf := make([]byte, chunkSize) + var c counts + var inWord bool + var lineLen int64 + var carry [utf8.UTFMax - 1]byte + var carryN int + + for { + if ctx.Err() != nil { + return c, ctx.Err() + } + n, err := r.Read(buf[carryN:]) + if carryN > 0 { + copy(buf, carry[:carryN]) + n += carryN + carryN = 0 + } + if n > 0 { + chunk := buf[:n] + c.bytes += int64(n) + + // Handle incomplete UTF-8 at end of chunk. + // When tail >= n (e.g., n == 1 with a single invalid byte), the + // condition below is false, so the byte stays in chunk and + // DecodeRune processes it as a replacement character — this is + // correct and matches utf8.DecodeRune semantics. + tail := 0 + if !utf8.Valid(chunk) { + for tail = 1; tail <= 3 && tail < n; tail++ { + if utf8.Valid(chunk[:n-tail]) { + break + } + } + if tail > 0 && tail <= 3 && tail < n { + carryN = copy(carry[:], chunk[n-tail:]) + chunk = chunk[:n-tail] + } else { + tail = 0 + } + } + c.chars += int64(utf8.RuneCount(chunk)) + // carryN bytes are subtracted here and will be re-added via + // n += carryN at the top of the next iteration. + c.bytes -= int64(carryN) + + for i := 0; i < len(chunk); { + r, size := utf8.DecodeRune(chunk[i:]) + i += size + if r == '\n' { + c.lines++ + if lineLen > c.maxLineLen { + c.maxLineLen = lineLen + } + lineLen = 0 + inWord = false + } else if r == '\r' { + lineLen = 0 + inWord = false + } else if r == '\t' { + lineLen = (lineLen/8 + 1) * 8 + inWord = false + } else if r == ' ' || r == '\v' || r == '\f' { + lineLen++ + inWord = false + } else { + if !inWord { + c.words++ + inWord = true + } + lineLen += int64(runeWidth(r)) + } + } + } + if err == io.EOF { + if carryN > 0 { + c.chars += int64(utf8.RuneCount(carry[:carryN])) + c.bytes += int64(carryN) + carryN = 0 + } + break + } + if err != nil { + return c, err + } + } + if lineLen > c.maxLineLen { + c.maxLineLen = lineLen + } + return c, nil +} + +func fieldWidth(total counts, opts options) int { + max := int64(0) + if opts.showLines && total.lines > max { + max = total.lines + } + if opts.showWords && total.words > max { + max = total.words + } + if opts.showChars && total.chars > max { + max = total.chars + } + if opts.showBytes && total.bytes > max { + max = total.bytes + } + if opts.showMaxLineLen && total.maxLineLen > max { + max = total.maxLineLen + } + w := len(strconv.FormatInt(max, 10)) + return w +} + +// runeWidth returns the display width of a rune following wcwidth(3) rules: +// 0 for controls, combining marks, and format chars; 2 for East Asian +// Wide/Fullwidth; 1 for everything else. +func runeWidth(r rune) int { + if unicode.Is(unicode.Cc, r) { + return 0 + } + if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Me, r) || unicode.Is(unicode.Cf, r) { + return 0 + } + // Hangul Jamo medial vowels and final consonants (zero-width in syllable composition). + if r >= 0x1160 && r <= 0x11FF { + return 0 + } + if unicode.Is(eastAsianWide, r) { + return 2 + } + return 1 +} + +// eastAsianWide is a RangeTable covering East Asian Wide and Fullwidth +// codepoints per UAX #11, matching the ranges used by wcwidth(3). +var eastAsianWide = &unicode.RangeTable{ + R16: []unicode.Range16{ + {0x1100, 0x115F, 1}, // Hangul Jamo initials + {0x2329, 0x232A, 1}, // CJK angle brackets + {0x2E80, 0x303E, 1}, // CJK Radicals Supplement .. CJK Symbols + {0x3040, 0x33BF, 1}, // Hiragana .. CJK Compatibility + {0x33C0, 0x33FF, 1}, // CJK Compatibility (cont.) + {0x3400, 0x4DBF, 1}, // CJK Unified Ideographs Extension A + {0x4E00, 0xA4CF, 1}, // CJK Unified Ideographs .. Yi + {0xAC00, 0xD7A3, 1}, // Hangul Syllables + {0xF900, 0xFAFF, 1}, // CJK Compatibility Ideographs + {0xFE10, 0xFE19, 1}, // Vertical Forms + {0xFE30, 0xFE6F, 1}, // CJK Compatibility Forms + Small Form Variants + {0xFF01, 0xFF60, 1}, // Fullwidth Forms + {0xFFE0, 0xFFE6, 1}, // Fullwidth Signs + }, + R32: []unicode.Range32{ + {0x1F300, 0x1F64F, 1}, // Misc Symbols/Pictographs + Emoticons + {0x1F900, 0x1F9FF, 1}, // Supplemental Symbols and Pictographs + {0x20000, 0x2FFFD, 1}, // CJK Extension B..F + {0x30000, 0x3FFFD, 1}, // CJK Extension G+ + }, +} + +func printCounts(callCtx *builtins.CallContext, c counts, opts options, width int, name string) { + first := true + printField := func(val int64) { + if first { + callCtx.Outf("%*d", width, val) + first = false + } else { + callCtx.Outf(" %*d", width, val) + } + } + if opts.showLines { + printField(c.lines) + } + if opts.showWords { + printField(c.words) + } + if opts.showChars { + printField(c.chars) + } + if opts.showBytes { + printField(c.bytes) + } + if opts.showMaxLineLen { + printField(c.maxLineLen) + } + if name != "" { + callCtx.Outf(" %s", name) + } + callCtx.Out("\n") +} diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go new file mode 100644 index 00000000..90966364 --- /dev/null +++ b/interp/builtins/wc/wc_gnu_compat_test.go @@ -0,0 +1,172 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package wc_test + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestGNUCompatDefaultEmpty — no flags on empty input. +// +// GNU command: printf ” | gwc +// Expected: " 0 0 0\n" +func TestGNUCompatDefaultEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "empty.txt", "") + stdout, _, code := cmdRun(t, "wc empty.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 0 0 empty.txt\n", stdout) +} + +// TestGNUCompatDefaultBasic — default counts on "a b\nc\n". +// +// GNU command: printf 'a b\nc\n' | gwc +// Expected: " 2 3 6\n" +func TestGNUCompatDefaultBasic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 6 file.txt\n", stdout) +} + +// TestGNUCompatLinesCount — -l on input with 2 newlines. +// +// GNU command: printf 'x\ny\n' | gwc -l +// Expected: "2\n" +func TestGNUCompatLinesCount(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x\ny\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// TestGNUCompatLinesNoNewline — -l on input with no newline. +// +// GNU command: printf 'x y' | gwc -l +// Expected: "0\n" +func TestGNUCompatLinesNoNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +// TestGNUCompatWordsEmpty — -w on empty. +// +// GNU command: printf ” | gwc -w +// Expected: "0\n" +func TestGNUCompatWordsEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +// TestGNUCompatWordsMulti — -w on "x y\nz". +// +// GNU command: printf 'x y\nz' | gwc -w +// Expected: "3\n" +func TestGNUCompatWordsMulti(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y\nz") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +// TestGNUCompatBytesCount — -c on "x". +// +// GNU command: printf 'x' | gwc -c +// Expected: "1\n" +func TestGNUCompatBytesCount(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +// TestGNUCompatMaxLineLen — -L on "1\n12\n". +// +// GNU command: printf '1\n12\n' | gwc -L +// Expected: "2\n" +func TestGNUCompatMaxLineLen(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "1\n12\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// TestGNUCompatMaxLineLenLastLine — -L on "\n123456" (no trailing newline). +// +// GNU command: printf '\n123456' | gwc -L +// Expected: "6\n" +func TestGNUCompatMaxLineLenLastLine(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\n123456") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +// TestGNUCompatMultipleFiles — two files with total line. +// +// GNU command: gwc a.txt b.txt +// a.txt = "hello\n" (1 line, 1 word, 6 bytes) +// b.txt = "world foo\n" (1 line, 2 words, 10 bytes) +// Expected: +// +// " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n" +func TestGNUCompatMultipleFiles(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", "hello\n") + writeFile(t, dir, "b.txt", "world foo\n") + stdout, _, code := cmdRun(t, "wc a.txt b.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n", stdout) +} + +// TestGNUCompatCharsMultibyte — -m on "café\n". +// +// GNU command: printf 'café\n' | gwc -m +// Expected: "5\n" (5 chars: c, a, f, é, \n) +func TestGNUCompatCharsMultibyte(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +// TestGNUCompatControlCharIsWord — control byte \x01 counts as a word. +// +// GNU command: printf '\x01\n' | gwc -w +// Expected: "1\n" +func TestGNUCompatControlCharIsWord(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\x01\n") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +// TestGNUCompatRejectedFlag — unknown flag exits 1. +// +// GNU command: gwc --follow +// Expected: exit 1, stderr contains "wc:" +func TestGNUCompatRejectedFlag(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc --follow", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go new file mode 100644 index 00000000..dd2e3d20 --- /dev/null +++ b/interp/builtins/wc/wc_test.go @@ -0,0 +1,453 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package wc_test + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/DataDog/rshell/interp" + "github.com/DataDog/rshell/interp/builtins/testutil" +) + +func runScript(t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { + t.Helper() + return testutil.RunScript(t, script, dir, opts...) +} + +func runScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { + t.Helper() + return testutil.RunScriptCtx(ctx, t, script, dir, opts...) +} + +func cmdRun(t *testing.T, script, dir string) (string, string, int) { + t.Helper() + return runScript(t, script, dir, interp.AllowedPaths([]string{dir})) +} + +func writeFile(t *testing.T, dir, name, content string) string { + t.Helper() + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) + return name +} + +// --- Default mode (lines, words, bytes) --- + +func TestWcDefaultEmptyStdin(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "empty.txt", "") + stdout, _, code := cmdRun(t, "wc empty.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 0 0 empty.txt\n", stdout) +} + +func TestWcDefaultBasic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 6 file.txt\n", stdout) +} + +func TestWcDefaultNoTrailingNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello world") + stdout, _, code := cmdRun(t, "wc file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 0 2 11 file.txt\n", stdout) +} + +// --- Lines --- + +func TestWcLinesEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcLinesNoNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcLinesOneNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +func TestWcLinesTwoNewlines(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x\ny\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +func TestWcLinesLongForm(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\nb\nc\n") + stdout, _, code := cmdRun(t, "wc --lines file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +// --- Words --- + +func TestWcWordsEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcWordsSingle(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +func TestWcWordsMultiple(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x y\nz") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +func TestWcWordsControlChar(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\x01\n") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +// --- Bytes --- + +func TestWcBytesEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +func TestWcBytesSingle(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "x") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 file.txt\n", stdout) +} + +func TestWcBytesMulti(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\n") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +// --- Chars --- + +func TestWcCharsASCII(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +func TestWcCharsMultibyte(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +func TestWcBytesMultibyte(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -c file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +func TestWcCharsAndBytes(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "café\n") + stdout, _, code := cmdRun(t, "wc -cm file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 6 file.txt\n", stdout) +} + +// --- Max line length --- + +func TestWcMaxLineLenBasic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "1\n12\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +func TestWcMaxLineLenThreeLines(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "1\n123\n1\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + +func TestWcMaxLineLenNoTrailingNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "\n123456") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +func TestWcMaxLineLenEmpty(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "0 file.txt\n", stdout) +} + +// --- Multiple files --- + +func TestWcMultipleFiles(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", "hello\n") + writeFile(t, dir, "b.txt", "world foo\n") + stdout, _, code := cmdRun(t, "wc a.txt b.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n", stdout) +} + +func TestWcMultipleFilesPartialFailure(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", "hello\n") + stdout, stderr, code := cmdRun(t, "wc a.txt missing.txt", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stdout, "a.txt") + assert.Contains(t, stdout, "total") + assert.Contains(t, stderr, "wc:") +} + +// --- Stdin --- + +func TestWcStdinImplicit(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "cat file.txt | wc", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 2 3 6\n", stdout) +} + +func TestWcStdinDash(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "cat file.txt | wc -", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 2 3 6 -\n", stdout) +} + +func TestWcNilStdin(t *testing.T) { + dir := t.TempDir() + stdout, _, code := runScript(t, "wc", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 0 0 0\n", stdout) +} + +// --- Help --- + +func TestWcHelp(t *testing.T) { + dir := t.TempDir() + stdout, _, code := cmdRun(t, "wc --help", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "Usage:") +} + +func TestWcHelpShort(t *testing.T) { + dir := t.TempDir() + stdout, _, code := cmdRun(t, "wc -h", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "Usage:") +} + +// --- Error cases --- + +func TestWcMissingFile(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, "wc nonexistent.txt", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} + +func TestWcUnknownFlag(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc --definitely-invalid", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +func TestWcFiles0FromRejected(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc --files0-from=foo", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +func TestWcDirectory(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, "wc .", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") +} + +// --- Hardening --- + +func TestWcDoubleDash(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\n") + stdout, _, code := cmdRun(t, "wc -- file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 1 6 file.txt\n", stdout) +} + +func TestWcContextCancellation(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", strings.Repeat("x\n", 100)) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, code := runScriptCtx(ctx, t, "wc file.txt", dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) +} + +func TestWcPipeInput(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "alpha\nbeta\ngamma\n") + stdout, _, code := cmdRun(t, "cat file.txt | wc -l", dir) + assert.Equal(t, 0, code) + assert.Equal(t, " 3\n", stdout) +} + +// --- Combined flags --- + +func TestWcAllFlags(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc -lwmcL file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 6 6 3 file.txt\n", stdout) +} + +func TestWcLinesAndWords(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a b\nc\n") + stdout, _, code := cmdRun(t, "wc -lw file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 3 file.txt\n", stdout) +} + +// --- Width formatting --- + +func TestWcWidthDeterminedByTotal(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "a.txt", strings.Repeat("word ", 20)+"\n") + writeFile(t, dir, "b.txt", "x\n") + stdout, _, code := cmdRun(t, "wc -w a.txt b.txt", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "total\n") + lines := strings.Split(strings.TrimSpace(stdout), "\n") + assert.Equal(t, 3, len(lines)) +} + +// --- Max line length: tab and CR --- + +func TestWcMaxLineLenTab(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\tb\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "9 file.txt\n", stdout) +} + +func TestWcMaxLineLenCR(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hello\rworld\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +func TestWcCRLFLineCount(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\r\nb\r\n") + stdout, _, code := cmdRun(t, "wc -l file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// --- Binary / non-UTF8 input --- + +func TestWcBinaryInput(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.bin", string([]byte{0x00, 0xFF, 0xFE, 0x0A, 0x41})) + stdout, _, code := cmdRun(t, "wc file.bin", dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "file.bin") + assert.Equal(t, 0, code) +} + +// --- Multibyte chars --- + +func TestWcCharsMultibyteEmoji(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "hi 💐\n") + stdout, _, code := cmdRun(t, "wc -m file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "5 file.txt\n", stdout) +} + +// TestWcChunkBoundaryMultibyte verifies that a multibyte character straddling +// the 32 KiB read-buffer boundary is not double-counted. This requires +// programmatic file generation so it lives as a Go test rather than a scenario. +func TestWcChunkBoundaryMultibyte(t *testing.T) { + dir := t.TempDir() + // 💐 is 4 bytes; placing it at offset 32766 means it spans bytes 32766-32769, + // straddling the 32768-byte chunk boundary and exercising the carry logic. + prefix := strings.Repeat("a", 32*1024-2) + content := prefix + "💐\n" + writeFile(t, dir, "file.txt", content) + stdout, _, code := cmdRun(t, "wc -mL file.txt", dir) + assert.Equal(t, 0, code) + // chars: 32766 'a' + 1 emoji + 1 newline = 32768 + // max line length: 32766 + 2 (emoji display width) = 32768 + assert.Equal(t, "32768 32768 file.txt\n", stdout) +} diff --git a/interp/builtins/wc/wc_unix_test.go b/interp/builtins/wc/wc_unix_test.go new file mode 100644 index 00000000..7882ae13 --- /dev/null +++ b/interp/builtins/wc/wc_unix_test.go @@ -0,0 +1,35 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build unix + +package wc_test + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestWcSymlinkToFile(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "target.txt", "hello\n") + require.NoError(t, os.Symlink("target.txt", filepath.Join(dir, "link.txt"))) + stdout, _, code := cmdRun(t, "wc link.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1 1 6 link.txt\n", stdout) +} + +func TestWcDanglingSymlink(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.Symlink("nonexistent", filepath.Join(dir, "dangle.txt"))) + stdout, stderr, code := cmdRun(t, "wc dangle.txt", dir) + assert.Equal(t, 1, code) + assert.Equal(t, "", stdout) + assert.Contains(t, stderr, "wc:") +} diff --git a/interp/register_builtins.go b/interp/register_builtins.go index 6859c8b8..03608922 100644 --- a/interp/register_builtins.go +++ b/interp/register_builtins.go @@ -27,6 +27,7 @@ import ( "github.com/DataDog/rshell/interp/builtins/tr" truecmd "github.com/DataDog/rshell/interp/builtins/true" "github.com/DataDog/rshell/interp/builtins/uniq" + "github.com/DataDog/rshell/interp/builtins/wc" ) var registerOnce sync.Once @@ -53,6 +54,7 @@ func registerBuiltins() { tr.Cmd, truecmd.Cmd, uniq.Cmd, + wc.Cmd, } { cmd.Register() } diff --git a/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml b/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml new file mode 100644 index 00000000..9c687821 --- /dev/null +++ b/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml @@ -0,0 +1,22 @@ +description: ls piped to wc -l counts the number of entries. +skip_assert_against_bash: true +setup: + files: + - path: a.txt + content: "a" + chmod: 0644 + - path: b.txt + content: "b" + chmod: 0644 + - path: c.txt + content: "c" + chmod: 0644 +input: + allowed_paths: ["$DIR"] + script: |+ + ls | wc -l +expect: + stdout: |2+ + 3 + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml b/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml new file mode 100644 index 00000000..f1a19458 --- /dev/null +++ b/tests/scenarios/cmd/wc/bytes/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a0 +description: wc -c on empty file outputs 0. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -c empty.txt +expect: + stdout: "0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/bytes/single_byte.yaml b/tests/scenarios/cmd/wc/bytes/single_byte.yaml new file mode 100644 index 00000000..a4f69956 --- /dev/null +++ b/tests/scenarios/cmd/wc/bytes/single_byte.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a3 +description: wc -c on single byte input outputs 1. +setup: + files: + - path: file.txt + content: "x" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -c file.txt +expect: + stdout: "1 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/chars/basic.yaml b/tests/scenarios/cmd/wc/chars/basic.yaml new file mode 100644 index 00000000..25aaceb6 --- /dev/null +++ b/tests/scenarios/cmd/wc/chars/basic.yaml @@ -0,0 +1,14 @@ +# Derived from standard POSIX wc -m behavior +description: wc -m counts characters (bytes for ASCII). +setup: + files: + - path: file.txt + content: "hello\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -m file.txt +expect: + stdout: "6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/basic.yaml b/tests/scenarios/cmd/wc/default/basic.yaml new file mode 100644 index 00000000..a52874db --- /dev/null +++ b/tests/scenarios/cmd/wc/default/basic.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test b1 +description: wc default counts lines, words, bytes. +setup: + files: + - path: file.txt + content: "a b\nc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc file.txt +expect: + stdout: "2 3 6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/empty_file.yaml b/tests/scenarios/cmd/wc/default/empty_file.yaml new file mode 100644 index 00000000..5b00343c --- /dev/null +++ b/tests/scenarios/cmd/wc/default/empty_file.yaml @@ -0,0 +1,14 @@ +# Derived from uutils test_file_empty +description: wc on an empty file shows all zeros. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc empty.txt +expect: + stdout: "0 0 0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/empty_stdin.yaml b/tests/scenarios/cmd/wc/default/empty_stdin.yaml new file mode 100644 index 00000000..5249634e --- /dev/null +++ b/tests/scenarios/cmd/wc/default/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test b0 +description: wc with no args and empty stdin outputs all zeros. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc empty.txt +expect: + stdout: "0 0 0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/default/single_file.yaml b/tests/scenarios/cmd/wc/default/single_file.yaml new file mode 100644 index 00000000..f6c1f873 --- /dev/null +++ b/tests/scenarios/cmd/wc/default/single_file.yaml @@ -0,0 +1,14 @@ +# Derived from uutils test_single_default +description: wc with a single file shows lines words bytes and filename. +setup: + files: + - path: file.txt + content: "alpha\nbeta\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc file.txt +expect: + stdout: " 2 2 11 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml new file mode 100644 index 00000000..d74a82d9 --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml @@ -0,0 +1,10 @@ +# Derived from GTFOBins safety requirement +description: wc rejects --files0-from flag (security risk). +input: + allowed_paths: ["$DIR"] + script: |+ + wc --files0-from=foo +expect: + stdout: "" + stderr_contains: ["wc:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/wc/errors/missing_file.yaml b/tests/scenarios/cmd/wc/errors/missing_file.yaml new file mode 100644 index 00000000..5cc5aeb4 --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/missing_file.yaml @@ -0,0 +1,10 @@ +# Derived from uutils test_read_from_nonexistent_file +description: wc exits 1 and prints error for nonexistent file. +input: + allowed_paths: ["$DIR"] + script: |+ + wc bogusfile +expect: + stdout: "" + stderr_contains: ["wc: bogusfile:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/wc/errors/unknown_flag.yaml b/tests/scenarios/cmd/wc/errors/unknown_flag.yaml new file mode 100644 index 00000000..f14f0ba4 --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/unknown_flag.yaml @@ -0,0 +1,10 @@ +# Derived from uutils test_invalid_arg +description: wc rejects unknown flags with exit code 1. +input: + allowed_paths: ["$DIR"] + script: |+ + wc --definitely-invalid +expect: + stdout: "" + stderr_contains: ["wc:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml b/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml new file mode 100644 index 00000000..50b91f43 --- /dev/null +++ b/tests/scenarios/cmd/wc/hardening/double_dash_separator.yaml @@ -0,0 +1,14 @@ +# Derived from standard POSIX -- convention +description: wc accepts -- to end flag parsing. +setup: + files: + - path: file.txt + content: "hello\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -- file.txt +expect: + stdout: "1 1 6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/empty_stdin.yaml b/tests/scenarios/cmd/wc/lines/empty_stdin.yaml new file mode 100644 index 00000000..33775902 --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a1 +description: wc -l on empty file outputs 0. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l empty.txt +expect: + stdout: "0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml b/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml new file mode 100644 index 00000000..ecb45502 --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/no_trailing_newline.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a7 +description: wc -l counts newline bytes; text with no newline counts as 0 lines. +setup: + files: + - path: file.txt + content: "x y" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l file.txt +expect: + stdout: "0 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/one_newline.yaml b/tests/scenarios/cmd/wc/lines/one_newline.yaml new file mode 100644 index 00000000..817ea07f --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/one_newline.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a8 +description: wc -l counts 1 for a single newline-terminated line. +setup: + files: + - path: file.txt + content: "x y\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l file.txt +expect: + stdout: "1 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/lines/two_newlines.yaml b/tests/scenarios/cmd/wc/lines/two_newlines.yaml new file mode 100644 index 00000000..e9109634 --- /dev/null +++ b/tests/scenarios/cmd/wc/lines/two_newlines.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a9 +description: wc -l counts 2 for two newline-terminated lines. +setup: + files: + - path: file.txt + content: "x\ny\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l file.txt +expect: + stdout: "2 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/basic.yaml b/tests/scenarios/cmd/wc/max_line_length/basic.yaml new file mode 100644 index 00000000..e7461fde --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/basic.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test c0 +description: wc -L reports the length of the longest line. +setup: + files: + - path: file.txt + content: "1\n12\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "2 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml b/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml new file mode 100644 index 00000000..b8b50009 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/fullwidth_cjk.yaml @@ -0,0 +1,14 @@ +description: wc -L counts display columns, CJK characters are width 2. +skip_assert_against_bash: true # display width depends on locale; we always use Unicode width +setup: + files: + - path: file.txt + content: "你好\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "4 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml b/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml new file mode 100644 index 00000000..88329252 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/fullwidth_emoji.yaml @@ -0,0 +1,14 @@ +description: wc -L counts display columns, emoji characters are width 2. +skip_assert_against_bash: true # display width depends on locale; we always use Unicode width +setup: + files: + - path: file.txt + content: "ab💐\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "4 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml b/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml new file mode 100644 index 00000000..c417d540 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/no_trailing_newline.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test c2 +description: wc -L counts a final line with no trailing newline. +setup: + files: + - path: file.txt + content: "\n123456" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml b/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml new file mode 100644 index 00000000..d70b6a20 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/three_lines.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test c1 +description: wc -L picks the max among multiple lines. +setup: + files: + - path: file.txt + content: "1\n123\n1\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "3 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/multiple_files/total_line.yaml b/tests/scenarios/cmd/wc/multiple_files/total_line.yaml new file mode 100644 index 00000000..b374e75f --- /dev/null +++ b/tests/scenarios/cmd/wc/multiple_files/total_line.yaml @@ -0,0 +1,16 @@ +# Derived from GNU coreutils wc-total.sh +description: wc prints a total line when given multiple files. +setup: + files: + - path: a.txt + content: "hello\n" + - path: b.txt + content: "world foo\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc a.txt b.txt +expect: + stdout: " 1 1 6 a.txt\n 1 2 10 b.txt\n 2 3 16 total\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml b/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml new file mode 100644 index 00000000..1804c0ad --- /dev/null +++ b/tests/scenarios/cmd/wc/stdin/dash_explicit.yaml @@ -0,0 +1,14 @@ +# Derived from uutils test_stdin_explicit +description: wc with explicit - reads stdin and shows filename -. +setup: + files: + - path: file.txt + content: "a b\nc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + cat file.txt | wc - +expect: + stdout: " 2 3 6 -\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/implicit.yaml b/tests/scenarios/cmd/wc/stdin/implicit.yaml new file mode 100644 index 00000000..ed40861c --- /dev/null +++ b/tests/scenarios/cmd/wc/stdin/implicit.yaml @@ -0,0 +1,14 @@ +# Derived from standard POSIX wc behavior +description: wc reads stdin implicitly when no files are given. +setup: + files: + - path: file.txt + content: "a b\nc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + cat file.txt | wc +expect: + stdout: " 2 3 6\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/across_lines.yaml b/tests/scenarios/cmd/wc/words/across_lines.yaml new file mode 100644 index 00000000..b57e7ff8 --- /dev/null +++ b/tests/scenarios/cmd/wc/words/across_lines.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a6 +description: wc -w counts words across lines including a line without trailing newline. +setup: + files: + - path: file.txt + content: "x y\nz" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "3 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/empty_stdin.yaml b/tests/scenarios/cmd/wc/words/empty_stdin.yaml new file mode 100644 index 00000000..2f77334b --- /dev/null +++ b/tests/scenarios/cmd/wc/words/empty_stdin.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a2 +description: wc -w on empty file outputs 0. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w empty.txt +expect: + stdout: "0 empty.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/single_word.yaml b/tests/scenarios/cmd/wc/words/single_word.yaml new file mode 100644 index 00000000..c5fb92da --- /dev/null +++ b/tests/scenarios/cmd/wc/words/single_word.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a4 +description: wc -w counts 1 word for a single non-whitespace token. +setup: + files: + - path: file.txt + content: "x" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "1 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/words/two_words.yaml b/tests/scenarios/cmd/wc/words/two_words.yaml new file mode 100644 index 00000000..9981335d --- /dev/null +++ b/tests/scenarios/cmd/wc/words/two_words.yaml @@ -0,0 +1,14 @@ +# Derived from GNU coreutils wc.pl test a5 +description: wc -w counts 2 words on a line with two tokens. +setup: + files: + - path: file.txt + content: "x y\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "2 file.txt\n" + stderr: "" + exit_code: 0 From 5d8527e72ba2fac44349cf6a87290755d4ea5f54 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 20:25:25 +0100 Subject: [PATCH 03/28] [iter 1] Fix \v and \f display width for wc -L, rename shadowed variable - \v (vertical tab) now has zero display width instead of 1, matching GNU wc - \f (form feed) now resets line position to 0 (like \r), matching GNU wc - Renamed loop variable `r` to `ch` to avoid shadowing the `io.Reader` parameter - Added GNU compat tests for both \v and \f cases Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 18 ++++++++++++------ interp/builtins/wc/wc_gnu_compat_test.go | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 7fbf50ee..953d9d6e 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -263,30 +263,36 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { c.bytes -= int64(carryN) for i := 0; i < len(chunk); { - r, size := utf8.DecodeRune(chunk[i:]) + ch, size := utf8.DecodeRune(chunk[i:]) i += size - if r == '\n' { + if ch == '\n' { c.lines++ if lineLen > c.maxLineLen { c.maxLineLen = lineLen } lineLen = 0 inWord = false - } else if r == '\r' { + } else if ch == '\r' { lineLen = 0 inWord = false - } else if r == '\t' { + } else if ch == '\t' { lineLen = (lineLen/8 + 1) * 8 inWord = false - } else if r == ' ' || r == '\v' || r == '\f' { + } else if ch == ' ' { lineLen++ inWord = false + } else if ch == '\f' { + lineLen = 0 + inWord = false + } else if ch == '\v' { + // vertical tab: zero display width, but breaks words + inWord = false } else { if !inWord { c.words++ inWord = true } - lineLen += int64(runeWidth(r)) + lineLen += int64(runeWidth(ch)) } } } diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go index 90966364..aadfcbe6 100644 --- a/interp/builtins/wc/wc_gnu_compat_test.go +++ b/interp/builtins/wc/wc_gnu_compat_test.go @@ -160,6 +160,30 @@ func TestGNUCompatControlCharIsWord(t *testing.T) { assert.Equal(t, "1 file.txt\n", stdout) } +// TestGNUCompatMaxLineLenVerticalTab — -L with \v (zero display width). +// +// GNU command: printf 'a\vb\n' | wc -L +// Expected: "2\n" — \v has zero width, so a(1) + b(1) = 2. +func TestGNUCompatMaxLineLenVerticalTab(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\vb\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// TestGNUCompatMaxLineLenFormFeed — -L with \f (resets line position). +// +// GNU command: printf 'abc\fdef\n' | wc -L +// Expected: "3\n" — \f resets position, so def = 3. +func TestGNUCompatMaxLineLenFormFeed(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "abc\fdef\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + // TestGNUCompatRejectedFlag — unknown flag exits 1. // // GNU command: gwc --follow From d711e5a9103b3e28f186188a6a3c19be803f7eef Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 20:37:08 +0100 Subject: [PATCH 04/28] [iter 2] Fix control characters incorrectly counted as word characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GNU wc treats control characters (C0/C1 controls, unicode Cc category) as transparent to word counting — they neither start nor end words. The implementation was falling through to the else branch which would start a new word for control characters like \x01. Added a unicode.Is(unicode.Cc, ch) check before the word-starting else branch so control characters are skipped for word counting while still contributing to lineLen via runeWidth (which already returns 0 for Cc). Also updated TestGNUCompatControlCharIsWord → TestGNUCompatControlCharIsNotWord and TestWcWordsControlChar → TestWcWordsControlCharNotWord to assert 0 words instead of 1, matching GNU wc behavior. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 4 ++++ interp/builtins/wc/wc_gnu_compat_test.go | 8 ++++---- interp/builtins/wc/wc_test.go | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 953d9d6e..b9fc4937 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -287,6 +287,10 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { } else if ch == '\v' { // vertical tab: zero display width, but breaks words inWord = false + } else if unicode.Is(unicode.Cc, ch) { + // Control characters are transparent to word counting: + // they don't start or end words, matching GNU wc. + lineLen += int64(runeWidth(ch)) } else { if !inWord { c.words++ diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go index aadfcbe6..c4bc86d5 100644 --- a/interp/builtins/wc/wc_gnu_compat_test.go +++ b/interp/builtins/wc/wc_gnu_compat_test.go @@ -148,16 +148,16 @@ func TestGNUCompatCharsMultibyte(t *testing.T) { assert.Equal(t, "5 file.txt\n", stdout) } -// TestGNUCompatControlCharIsWord — control byte \x01 counts as a word. +// TestGNUCompatControlCharIsNotWord — control byte \x01 is transparent to word counting. // // GNU command: printf '\x01\n' | gwc -w -// Expected: "1\n" -func TestGNUCompatControlCharIsWord(t *testing.T) { +// Expected: "0\n" +func TestGNUCompatControlCharIsNotWord(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "file.txt", "\x01\n") stdout, _, code := cmdRun(t, "wc -w file.txt", dir) assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) + assert.Equal(t, "0 file.txt\n", stdout) } // TestGNUCompatMaxLineLenVerticalTab — -L with \v (zero display width). diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go index dd2e3d20..fae40399 100644 --- a/interp/builtins/wc/wc_test.go +++ b/interp/builtins/wc/wc_test.go @@ -135,12 +135,12 @@ func TestWcWordsMultiple(t *testing.T) { assert.Equal(t, "3 file.txt\n", stdout) } -func TestWcWordsControlChar(t *testing.T) { +func TestWcWordsControlCharNotWord(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "file.txt", "\x01\n") stdout, _, code := cmdRun(t, "wc -w file.txt", dir) assert.Equal(t, 0, code) - assert.Equal(t, "1 file.txt\n", stdout) + assert.Equal(t, "0 file.txt\n", stdout) } // --- Bytes --- From 92849a47f1fbf3bbd253ae9255c43f02ff05cfde Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 20:52:15 +0100 Subject: [PATCH 05/28] [iter 3] Fix \r/\f max preservation and stdin min width in wc - Preserve maxLineLen before resetting lineLen on \r and \f, so wc -L correctly reports the longest segment (e.g. "abcdef\rxy\n" now outputs 6 instead of 2, matching GNU wc). - Apply stdinMinWidth=7 only in default mode (all 3 columns), not when explicit flags are given. "printf 'a\n' | wc -l" now outputs "1" instead of " 1", matching GNU wc. - Update pipe_to_wc.yaml expected output and remove skip_assert_against_bash (now validated against GNU bash + coreutils). - Add asymmetric \r and \f scenario tests and GNU compat tests to cover the max preservation edge case. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 13 +++++++++- interp/builtins/wc/wc_gnu_compat_test.go | 24 +++++++++++++++++++ interp/builtins/wc/wc_test.go | 2 +- tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml | 4 +--- .../cmd/wc/max_line_length/cr_asymmetric.yaml | 13 ++++++++++ .../cmd/wc/max_line_length/ff_asymmetric.yaml | 13 ++++++++++ 6 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 tests/scenarios/cmd/wc/max_line_length/cr_asymmetric.yaml create mode 100644 tests/scenarios/cmd/wc/max_line_length/ff_asymmetric.yaml diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index b9fc4937..7cdebfd4 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -175,7 +175,12 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { } width := fieldWidth(total, opts) - if hasStdin && width < stdinMinWidth { + // GNU wc uses a minimum column width of 7 for stdin, but only + // in default mode (all three columns: lines, words, bytes). + // When explicit flags are given (e.g. wc -l), the width is + // determined solely by the count values. + defaultMode := !*lines && !*words && !*bytesFlag && !*chars && !*maxLineLen + if hasStdin && defaultMode && width < stdinMinWidth { width = stdinMinWidth } @@ -273,6 +278,9 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { lineLen = 0 inWord = false } else if ch == '\r' { + if lineLen > c.maxLineLen { + c.maxLineLen = lineLen + } lineLen = 0 inWord = false } else if ch == '\t' { @@ -282,6 +290,9 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { lineLen++ inWord = false } else if ch == '\f' { + if lineLen > c.maxLineLen { + c.maxLineLen = lineLen + } lineLen = 0 inWord = false } else if ch == '\v' { diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go index c4bc86d5..b2564503 100644 --- a/interp/builtins/wc/wc_gnu_compat_test.go +++ b/interp/builtins/wc/wc_gnu_compat_test.go @@ -184,6 +184,30 @@ func TestGNUCompatMaxLineLenFormFeed(t *testing.T) { assert.Equal(t, "3 file.txt\n", stdout) } +// TestGNUCompatMaxLineLenCRAsymmetric — -L with \r where text before \r is longer. +// +// GNU command: printf 'abcdef\rxy\n' | wc -L +// Expected: "6\n" — max(6, 2) = 6; \r resets position but preserves max. +func TestGNUCompatMaxLineLenCRAsymmetric(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "abcdef\rxy\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + +// TestGNUCompatMaxLineLenFFAsymmetric — -L with \f where text before \f is longer. +// +// GNU command: printf 'abcdef\fxy\n' | wc -L +// Expected: "6\n" — max(6, 2) = 6; \f resets position but preserves max. +func TestGNUCompatMaxLineLenFFAsymmetric(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "abcdef\fxy\n") + stdout, _, code := cmdRun(t, "wc -L file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "6 file.txt\n", stdout) +} + // TestGNUCompatRejectedFlag — unknown flag exits 1. // // GNU command: gwc --follow diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go index fae40399..47ca7740 100644 --- a/interp/builtins/wc/wc_test.go +++ b/interp/builtins/wc/wc_test.go @@ -354,7 +354,7 @@ func TestWcPipeInput(t *testing.T) { writeFile(t, dir, "file.txt", "alpha\nbeta\ngamma\n") stdout, _, code := cmdRun(t, "cat file.txt | wc -l", dir) assert.Equal(t, 0, code) - assert.Equal(t, " 3\n", stdout) + assert.Equal(t, "3\n", stdout) } // --- Combined flags --- diff --git a/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml b/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml index 9c687821..875dd7e5 100644 --- a/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml +++ b/tests/scenarios/cmd/ls/pipes/pipe_to_wc.yaml @@ -1,5 +1,4 @@ description: ls piped to wc -l counts the number of entries. -skip_assert_against_bash: true setup: files: - path: a.txt @@ -16,7 +15,6 @@ input: script: |+ ls | wc -l expect: - stdout: |2+ - 3 + stdout: "3\n" stderr: "" exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/cr_asymmetric.yaml b/tests/scenarios/cmd/wc/max_line_length/cr_asymmetric.yaml new file mode 100644 index 00000000..14839363 --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/cr_asymmetric.yaml @@ -0,0 +1,13 @@ +description: wc -L preserves max line length across carriage return reset. +setup: + files: + - path: file.txt + content: "abcdef\rxy\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "6 file.txt\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/max_line_length/ff_asymmetric.yaml b/tests/scenarios/cmd/wc/max_line_length/ff_asymmetric.yaml new file mode 100644 index 00000000..39664e0d --- /dev/null +++ b/tests/scenarios/cmd/wc/max_line_length/ff_asymmetric.yaml @@ -0,0 +1,13 @@ +description: wc -L preserves max line length across form feed reset. +setup: + files: + - path: file.txt + content: "abcdef\fxy\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -L file.txt +expect: + stdout: "6 file.txt\n" + stderr: "" + exit_code: 0 From 78af0ea74dbe1560af8c197e2e0c83e177590cf9 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 21:02:01 +0100 Subject: [PATCH 06/28] [iter 4] Add -L flag to wc documentation in SHELL_FEATURES.md The wc implementation supports -L/--max-line-length but the SHELL_FEATURES.md entry only listed [-l] [-w] [-c] [-m]. Co-Authored-By: Claude Opus 4.6 --- SHELL_FEATURES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index 9a42affe..b497ef9d 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -23,7 +23,7 @@ Blocked features are rejected before execution with exit code 2. - ✅ `tr [-cdsCt] SET1 [SET2]` — translate, squeeze, and/or delete characters from stdin - ✅ `true` — return exit code 0 - ✅ `uniq [OPTION]... [INPUT]` — report or omit repeated lines -- ✅ `wc [-l] [-w] [-c] [-m] [FILE]...` — count lines, words, bytes, or characters in files +- ✅ `wc [-l] [-w] [-c] [-m] [-L] [FILE]...` — count lines, words, bytes, characters, or max line length - ❌ All other commands — return exit code 127 with `: not found` unless an ExecHandler is configured ## Variables From 101d222e70599b5e35eb6da98b686a97a17c7263 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 21:41:26 +0100 Subject: [PATCH 07/28] [iter 1] Fix -h flag, files0_from skip, and directory count line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1: Remove -h short flag for --help — GNU wc does not support -h. Running `wc -h` now correctly exits 1 with an error, matching bash. P2: Add skip_assert_against_bash: true to files0_from_rejected.yaml. rshell intentionally rejects --files0-from for security (GTFOBins), while GNU wc accepts it — this is a deliberate divergence. P3: Print zero count line when wc is given a directory argument. GNU wc prints both an error and a `0 0 0 .` count line for directories. Previously rshell only printed the error. Added platform-specific isErrIsDir helpers to detect directory errors and still emit the count line. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 8 ++++--- interp/builtins/wc/wc_isdir_unix.go | 18 ++++++++++++++++ interp/builtins/wc/wc_isdir_windows.go | 21 +++++++++++++++++++ interp/builtins/wc/wc_test.go | 13 +++++++----- .../cmd/wc/errors/files0_from_rejected.yaml | 1 + .../cmd/wc/errors/h_short_flag_rejected.yaml | 9 ++++++++ 6 files changed, 62 insertions(+), 8 deletions(-) create mode 100644 interp/builtins/wc/wc_isdir_unix.go create mode 100644 interp/builtins/wc/wc_isdir_windows.go create mode 100644 tests/scenarios/cmd/wc/errors/h_short_flag_rejected.yaml diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 7cdebfd4..5cc76ee6 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -34,7 +34,7 @@ // -L, --max-line-length // Print the length of the longest line. // -// -h, --help +// --help // Print this usage message to stdout and exit 0. // // Output columns always appear in a fixed order: lines, words, chars, @@ -89,7 +89,7 @@ type options struct { } func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { - help := fs.BoolP("help", "h", false, "print usage and exit") + help := fs.Bool("help", false, "print usage and exit") lines := fs.BoolP("lines", "l", false, "print the newline counts") words := fs.BoolP("words", "w", false, "print the word counts") bytesFlag := fs.BoolP("bytes", "c", false, "print the byte counts") @@ -160,7 +160,9 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { } callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err)) failed = true - if c == (counts{}) { + // GNU wc prints a zero count line for directories but not + // for missing files or other open errors. + if !isErrIsDir(err) { continue } } diff --git a/interp/builtins/wc/wc_isdir_unix.go b/interp/builtins/wc/wc_isdir_unix.go new file mode 100644 index 00000000..cf40c3a4 --- /dev/null +++ b/interp/builtins/wc/wc_isdir_unix.go @@ -0,0 +1,18 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +//go:build !windows + +package wc + +import ( + "errors" + "syscall" +) + +// isErrIsDir reports whether err wraps a "is a directory" error. +func isErrIsDir(err error) bool { + return errors.Is(err, syscall.EISDIR) +} diff --git a/interp/builtins/wc/wc_isdir_windows.go b/interp/builtins/wc/wc_isdir_windows.go new file mode 100644 index 00000000..6b9105fb --- /dev/null +++ b/interp/builtins/wc/wc_isdir_windows.go @@ -0,0 +1,21 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package wc + +import ( + "errors" + "syscall" +) + +// isErrIsDir reports whether err wraps the Windows equivalent of EISDIR. +// On Windows, reading a directory handle returns ERROR_INVALID_FUNCTION (errno 1). +func isErrIsDir(err error) bool { + var errno syscall.Errno + if errors.As(err, &errno) { + return errno == syscall.Errno(1) // ERROR_INVALID_FUNCTION + } + return false +} diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go index 47ca7740..d8b183e8 100644 --- a/interp/builtins/wc/wc_test.go +++ b/interp/builtins/wc/wc_test.go @@ -292,11 +292,12 @@ func TestWcHelp(t *testing.T) { assert.Contains(t, stdout, "Usage:") } -func TestWcHelpShort(t *testing.T) { +func TestWcHelpShortRejected(t *testing.T) { + // GNU wc does not support -h; it's an invalid option. dir := t.TempDir() - stdout, _, code := cmdRun(t, "wc -h", dir) - assert.Equal(t, 0, code) - assert.Contains(t, stdout, "Usage:") + _, stderr, code := cmdRun(t, "wc -h", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") } // --- Error cases --- @@ -325,9 +326,11 @@ func TestWcFiles0FromRejected(t *testing.T) { func TestWcDirectory(t *testing.T) { dir := t.TempDir() - _, stderr, code := cmdRun(t, "wc .", dir) + stdout, stderr, code := cmdRun(t, "wc .", dir) assert.Equal(t, 1, code) assert.Contains(t, stderr, "wc:") + // GNU wc prints a zero count line alongside the error + assert.Equal(t, "0 0 0 .\n", stdout) } // --- Hardening --- diff --git a/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml index d74a82d9..a4ce305e 100644 --- a/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml +++ b/tests/scenarios/cmd/wc/errors/files0_from_rejected.yaml @@ -1,5 +1,6 @@ # Derived from GTFOBins safety requirement description: wc rejects --files0-from flag (security risk). +skip_assert_against_bash: true # intentionally rejects --files0-from (GTFOBins mitigation) input: allowed_paths: ["$DIR"] script: |+ diff --git a/tests/scenarios/cmd/wc/errors/h_short_flag_rejected.yaml b/tests/scenarios/cmd/wc/errors/h_short_flag_rejected.yaml new file mode 100644 index 00000000..d42b2cfa --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/h_short_flag_rejected.yaml @@ -0,0 +1,9 @@ +description: wc rejects -h flag (GNU wc does not support -h). +input: + allowed_paths: ["$DIR"] + script: |+ + wc -h +expect: + stdout: "" + stderr_contains: ["wc:"] + exit_code: 1 From 4436c58a25d7450cdc1b0290eee95183c5a17601 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 21:44:32 +0100 Subject: [PATCH 08/28] [iter 1] Add missing windows build tag to wc_isdir_windows.go Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc_isdir_windows.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/interp/builtins/wc/wc_isdir_windows.go b/interp/builtins/wc/wc_isdir_windows.go index 6b9105fb..e731c4d3 100644 --- a/interp/builtins/wc/wc_isdir_windows.go +++ b/interp/builtins/wc/wc_isdir_windows.go @@ -3,6 +3,8 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2026-present Datadog, Inc. +//go:build windows + package wc import ( From 5e270859ff6325f4a0b65824ae97153b8a8ecc08 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 21:47:42 +0100 Subject: [PATCH 09/28] [iter 1] Fix empty filename handling in wc and add syscall to allowlist - wc: reject empty filename with "No such file or directory" error (matching bash behavior) instead of producing "0 0 0" output - allowlist: add syscall.EISDIR, syscall.Errno, os.PathError, and os.ErrNotExist used by wc_isdir_{unix,windows}.go and wc.go Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 3 +++ tests/allowed_symbols_test.go | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 5cc76ee6..8a901ef0 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -206,6 +206,9 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { } func countFile(ctx context.Context, callCtx *builtins.CallContext, path string) (counts, error) { + if path == "" { + return counts{}, &os.PathError{Op: "open", Path: path, Err: os.ErrNotExist} + } var rc io.ReadCloser if path == "-" { if callCtx.Stdin == nil { diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index d86cdb0d..10245183 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -90,8 +90,12 @@ var builtinAllowedSymbols = []string{ "math.NaN", // os.FileInfo — file metadata interface returned by Stat; no I/O side effects. "os.FileInfo", + // os.ErrNotExist — sentinel error for "file does not exist"; pure constant, no I/O. + "os.ErrNotExist", // os.O_RDONLY — read-only file flag constant; cannot open files by itself. "os.O_RDONLY", + // os.PathError — error type for path-related OS errors; pure type, no I/O. + "os.PathError", // regexp.Compile — compiles a regular expression; pure function, no I/O. Uses RE2 engine (linear-time, no backtracking). "regexp.Compile", // regexp.QuoteMeta — escapes all special regex characters in a string; pure function, no I/O. @@ -112,6 +116,10 @@ var builtinAllowedSymbols = []string{ "strings.ReplaceAll", // strings.ToLower — converts string to lowercase; pure function, no I/O. "strings.ToLower", + // syscall.EISDIR — errno constant for "is a directory"; pure constant, no I/O. + "syscall.EISDIR", + // syscall.Errno — error type wrapping an OS errno value; pure type, no I/O. + "syscall.Errno", // strconv.IntSize — platform int size constant (32 or 64); pure constant, no I/O. "strconv.IntSize", // strings.Split — splits a string by separator into a slice; pure function, no I/O. From 3f233ffd43dea602b1753f0fde2671c853ff3e55 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 23:15:09 +0100 Subject: [PATCH 10/28] [iter 1] Match GNU wc error message for empty filename argument GNU wc outputs "wc: invalid zero-length file name" for empty filename arguments. Our implementation was producing "wc: : No such file or directory" due to using os.PathError with os.ErrNotExist. Changed to use errors.New with the correct message and handle the empty name case in error formatting to avoid the extra colon. Added scenario test verified against GNU bash/wc via Docker. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 9 +++++++-- tests/scenarios/cmd/wc/errors/empty_filename.yaml | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 tests/scenarios/cmd/wc/errors/empty_filename.yaml diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 8a901ef0..22980d4b 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -57,6 +57,7 @@ package wc import ( "context" + "errors" "io" "os" "strconv" @@ -158,7 +159,11 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { if file == "-" { name = "standard input" } - callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err)) + if name == "" { + callCtx.Errf("wc: %s\n", callCtx.PortableErr(err)) + } else { + callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err)) + } failed = true // GNU wc prints a zero count line for directories but not // for missing files or other open errors. @@ -207,7 +212,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { func countFile(ctx context.Context, callCtx *builtins.CallContext, path string) (counts, error) { if path == "" { - return counts{}, &os.PathError{Op: "open", Path: path, Err: os.ErrNotExist} + return counts{}, errors.New("invalid zero-length file name") } var rc io.ReadCloser if path == "-" { diff --git a/tests/scenarios/cmd/wc/errors/empty_filename.yaml b/tests/scenarios/cmd/wc/errors/empty_filename.yaml new file mode 100644 index 00000000..e4d92bce --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/empty_filename.yaml @@ -0,0 +1,8 @@ +description: wc exits 1 and prints error for empty filename argument. +input: + script: |+ + wc "" +expect: + stdout: "" + stderr: "wc: invalid zero-length file name\n" + exit_code: 1 From 64863db10875998cab5accbd5e13ee5b3efe259d Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 23:19:08 +0100 Subject: [PATCH 11/28] [iter 1] Remove unused os.ErrNotExist and os.PathError from builtinAllowedSymbols The wc refactor in 3f233ff stopped using os.ErrNotExist and os.PathError in builtins (replaced with errors.New for the empty filename case), causing TestBuiltinAllowedSymbols to fail on all platforms. Remove both unused entries from the allowlist. Co-Authored-By: Claude Opus 4.6 --- tests/allowed_symbols_test.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 10245183..89985dbe 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -90,12 +90,8 @@ var builtinAllowedSymbols = []string{ "math.NaN", // os.FileInfo — file metadata interface returned by Stat; no I/O side effects. "os.FileInfo", - // os.ErrNotExist — sentinel error for "file does not exist"; pure constant, no I/O. - "os.ErrNotExist", // os.O_RDONLY — read-only file flag constant; cannot open files by itself. "os.O_RDONLY", - // os.PathError — error type for path-related OS errors; pure type, no I/O. - "os.PathError", // regexp.Compile — compiles a regular expression; pure function, no I/O. Uses RE2 engine (linear-time, no backtracking). "regexp.Compile", // regexp.QuoteMeta — escapes all special regex characters in a string; pure function, no I/O. From 9e648d39c95f1b16a2661f0e66a90f8d4cbf8a98 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 23:34:35 +0100 Subject: [PATCH 12/28] [iter 2] Fix directory zero-count width and add \v word-separator tests 1. Apply minimum column width of 7 for non-regular files (directories, pipes) in default mode, matching GNU wc's compute_number_width logic which sets minimum_width=7 for any file where !S_ISREG(st_mode). Previously only stdin triggered the minimum width. 2. Add Go tests (TestGNUCompatVerticalTabWordsBreak, TestGNUCompatVerticalTabThreeWords) and a scenario test verifying that \v acts as a word separator for wc -w, matching GNU wc. 3. Add Go tests for directory width formatting in both default mode (width 7) and explicit-flag mode (width 1). Verified against GNU wc in debian:bookworm-slim via Docker. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 19 ++++---- interp/builtins/wc/wc_gnu_compat_test.go | 48 +++++++++++++++++++ interp/builtins/wc/wc_test.go | 4 +- .../cmd/wc/words/vertical_tab_separator.yaml | 13 +++++ 4 files changed, 74 insertions(+), 10 deletions(-) create mode 100644 tests/scenarios/cmd/wc/words/vertical_tab_separator.yaml diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 22980d4b..358b6370 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -70,8 +70,8 @@ import ( // Cmd is the wc builtin command descriptor. var Cmd = builtins.Command{Name: "wc", MakeFlags: registerFlags} -const chunkSize = 32 * 1024 // 32 KiB read buffer -const stdinMinWidth = 7 // GNU wc minimum column width for stdin +const chunkSize = 32 * 1024 // 32 KiB read buffer +const nonRegularMinWidth = 7 // GNU wc minimum column width for non-regular files type counts struct { lines int64 @@ -148,6 +148,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { c counts } results := make([]fileResult, 0, len(files)) + hasNonRegular := hasStdin // stdin (pipe) is non-regular for _, file := range files { if ctx.Err() != nil { @@ -170,6 +171,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { if !isErrIsDir(err) { continue } + hasNonRegular = true } results = append(results, fileResult{name: file, c: c}) total.lines += c.lines @@ -182,13 +184,14 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { } width := fieldWidth(total, opts) - // GNU wc uses a minimum column width of 7 for stdin, but only - // in default mode (all three columns: lines, words, bytes). - // When explicit flags are given (e.g. wc -l), the width is - // determined solely by the count values. + // GNU wc uses a minimum column width of 7 for non-regular files + // (stdin pipes, directories, devices, etc.), but only in default + // mode (all three columns: lines, words, bytes). When explicit + // flags are given (e.g. wc -l), the width is determined solely + // by the count values. defaultMode := !*lines && !*words && !*bytesFlag && !*chars && !*maxLineLen - if hasStdin && defaultMode && width < stdinMinWidth { - width = stdinMinWidth + if hasNonRegular && defaultMode && width < nonRegularMinWidth { + width = nonRegularMinWidth } for _, fr := range results { diff --git a/interp/builtins/wc/wc_gnu_compat_test.go b/interp/builtins/wc/wc_gnu_compat_test.go index b2564503..c60da719 100644 --- a/interp/builtins/wc/wc_gnu_compat_test.go +++ b/interp/builtins/wc/wc_gnu_compat_test.go @@ -208,6 +208,54 @@ func TestGNUCompatMaxLineLenFFAsymmetric(t *testing.T) { assert.Equal(t, "6 file.txt\n", stdout) } +// TestGNUCompatDirectoryDefaultWidth — directory gets width-7 padding in default mode. +// +// GNU command: mkdir /tmp/d && wc /tmp/d +// Expected: " 0 0 0 .\n" (width 7, non-regular file) +func TestGNUCompatDirectoryDefaultWidth(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, "wc .", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") + assert.Equal(t, " 0 0 0 .\n", stdout) +} + +// TestGNUCompatDirectoryExplicitFlag — directory with explicit flag uses width 1. +// +// GNU command: mkdir /tmp/d && wc -l /tmp/d +// Expected: "0 .\n" (width 1, explicit flag) +func TestGNUCompatDirectoryExplicitFlag(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, "wc -l .", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "wc:") + assert.Equal(t, "0 .\n", stdout) +} + +// TestGNUCompatVerticalTabWordsBreak — \v breaks words for wc -w. +// +// GNU command: printf 'a\vb\n' | wc -w +// Expected: "2\n" — \v is a word delimiter. +func TestGNUCompatVerticalTabWordsBreak(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\vb\n") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2 file.txt\n", stdout) +} + +// TestGNUCompatVerticalTabThreeWords — \v separates three words. +// +// GNU command: printf 'a\vb\vc\n' | wc -w +// Expected: "3\n" +func TestGNUCompatVerticalTabThreeWords(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "file.txt", "a\vb\vc\n") + stdout, _, code := cmdRun(t, "wc -w file.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "3 file.txt\n", stdout) +} + // TestGNUCompatRejectedFlag — unknown flag exits 1. // // GNU command: gwc --follow diff --git a/interp/builtins/wc/wc_test.go b/interp/builtins/wc/wc_test.go index d8b183e8..1e6d303e 100644 --- a/interp/builtins/wc/wc_test.go +++ b/interp/builtins/wc/wc_test.go @@ -329,8 +329,8 @@ func TestWcDirectory(t *testing.T) { stdout, stderr, code := cmdRun(t, "wc .", dir) assert.Equal(t, 1, code) assert.Contains(t, stderr, "wc:") - // GNU wc prints a zero count line alongside the error - assert.Equal(t, "0 0 0 .\n", stdout) + // GNU wc prints a zero count line with width-7 padding (non-regular file) + assert.Equal(t, " 0 0 0 .\n", stdout) } // --- Hardening --- diff --git a/tests/scenarios/cmd/wc/words/vertical_tab_separator.yaml b/tests/scenarios/cmd/wc/words/vertical_tab_separator.yaml new file mode 100644 index 00000000..66fab7a7 --- /dev/null +++ b/tests/scenarios/cmd/wc/words/vertical_tab_separator.yaml @@ -0,0 +1,13 @@ +description: wc -w treats vertical tab (\v) as a word separator. +setup: + files: + - path: file.txt + content: "a\vb\vc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -w file.txt +expect: + stdout: "3 file.txt\n" + stderr: "" + exit_code: 0 From a3fe205b17c72faeeb951471279a02c34f25eaf2 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 23:47:57 +0100 Subject: [PATCH 13/28] [iter 4] Fix width-7 padding for non-regular input with explicit multi-column flags GNU wc applies the minimum width-7 padding for non-regular inputs (stdin, directories) whenever two or more columns are printed, not just in default mode. The previous code gated this on defaultMode only, which dropped padding for explicit multi-column invocations like `wc -lw`. Change the condition to count active columns and apply the minimum width when numCols >= 2, matching GNU behavior verified via Docker. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 28 +++++++++++++++---- .../multi_col_explicit_flags_padding.yaml | 15 ++++++++++ .../single_col_explicit_flag_no_padding.yaml | 15 ++++++++++ 3 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 tests/scenarios/cmd/wc/stdin/multi_col_explicit_flags_padding.yaml create mode 100644 tests/scenarios/cmd/wc/stdin/single_col_explicit_flag_no_padding.yaml diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 358b6370..2f762c8d 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -185,12 +185,28 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { width := fieldWidth(total, opts) // GNU wc uses a minimum column width of 7 for non-regular files - // (stdin pipes, directories, devices, etc.), but only in default - // mode (all three columns: lines, words, bytes). When explicit - // flags are given (e.g. wc -l), the width is determined solely - // by the count values. - defaultMode := !*lines && !*words && !*bytesFlag && !*chars && !*maxLineLen - if hasNonRegular && defaultMode && width < nonRegularMinWidth { + // (stdin pipes, directories, devices, etc.) when two or more + // columns are printed — whether in default mode or with explicit + // multi-column flags (e.g. wc -lw). When only a single column + // is active (e.g. wc -l), the width is determined solely by + // the count values. + numCols := 0 + if opts.showLines { + numCols++ + } + if opts.showWords { + numCols++ + } + if opts.showChars { + numCols++ + } + if opts.showBytes { + numCols++ + } + if opts.showMaxLineLen { + numCols++ + } + if hasNonRegular && numCols >= 2 && width < nonRegularMinWidth { width = nonRegularMinWidth } diff --git a/tests/scenarios/cmd/wc/stdin/multi_col_explicit_flags_padding.yaml b/tests/scenarios/cmd/wc/stdin/multi_col_explicit_flags_padding.yaml new file mode 100644 index 00000000..ba345696 --- /dev/null +++ b/tests/scenarios/cmd/wc/stdin/multi_col_explicit_flags_padding.yaml @@ -0,0 +1,15 @@ +# Verify that GNU wc minimum width-7 padding applies to non-regular +# (stdin) input when two or more columns are printed with explicit flags. +description: wc pads non-regular input to width 7 with explicit multi-column flags. +setup: + files: + - path: file.txt + content: "a\n" +input: + allowed_paths: ["$DIR"] + script: |+ + cat file.txt | wc -lw +expect: + stdout: " 1 1\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/wc/stdin/single_col_explicit_flag_no_padding.yaml b/tests/scenarios/cmd/wc/stdin/single_col_explicit_flag_no_padding.yaml new file mode 100644 index 00000000..3420fa62 --- /dev/null +++ b/tests/scenarios/cmd/wc/stdin/single_col_explicit_flag_no_padding.yaml @@ -0,0 +1,15 @@ +# Verify that GNU wc does NOT apply width-7 padding for non-regular +# (stdin) input when only a single column is printed. +description: wc does not pad non-regular input when only one flag is given. +setup: + files: + - path: file.txt + content: "a\n" +input: + allowed_paths: ["$DIR"] + script: |+ + cat file.txt | wc -l +expect: + stdout: "1\n" + stderr: "" + exit_code: 0 From d33214b79d1c68e7158f782623d30e225bd54fe5 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:00:21 +0100 Subject: [PATCH 14/28] [iter 1] Fix width-7 padding for single-column multi-file with non-regular input and reorder allowed symbols - Apply GNU wc's minimum width-7 padding when multiple files are processed with non-regular input (directories), even with a single column flag (e.g. wc -l dir file). Previously the condition required numCols >= 2. - Reorder allowed symbols list to group entries by package alphabetically (strings.*, strconv.*, syscall.*). - Add scenario test for wc -l with directory + file verifying width-7 padding. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 10 ++++++---- tests/allowed_symbols_test.go | 20 +++++++++---------- .../cmd/wc/errors/dir_single_col_width7.yaml | 16 +++++++++++++++ 3 files changed, 32 insertions(+), 14 deletions(-) create mode 100644 tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index d45b5faa..12eecf3e 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -187,9 +187,11 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { // GNU wc uses a minimum column width of 7 for non-regular files // (stdin pipes, directories, devices, etc.) when two or more // columns are printed — whether in default mode or with explicit - // multi-column flags (e.g. wc -lw). When only a single column - // is active (e.g. wc -l), the width is determined solely by - // the count values. + // multi-column flags (e.g. wc -lw). GNU also applies this minimum + // when multiple files are processed (a total line is printed), even + // with a single column (e.g. wc -l dir file). When only a single + // column is active with a single file, the width is determined + // solely by the count values. numCols := 0 if opts.showLines { numCols++ @@ -206,7 +208,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { if opts.showMaxLineLen { numCols++ } - if hasNonRegular && numCols >= 2 && width < nonRegularMinWidth { + if hasNonRegular && (numCols >= 2 || len(files) > 1) && width < nonRegularMinWidth { width = nonRegularMinWidth } diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 9ce0821c..a741f69f 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -118,30 +118,30 @@ var builtinAllowedSymbols = []string{ "strings.ReplaceAll", // strings.ToLower — converts string to lowercase; pure function, no I/O. "strings.ToLower", - // syscall.EISDIR — errno constant for "is a directory"; pure constant, no I/O. - "syscall.EISDIR", - // syscall.Errno — error type wrapping an OS errno value; pure type, no I/O. - "syscall.Errno", - // strconv.IntSize — platform int size constant (32 or 64); pure constant, no I/O. - "strconv.IntSize", // strings.Split — splits a string by separator into a slice; pure function, no I/O. "strings.Split", // strconv.Atoi — string-to-int conversion; pure function, no I/O. "strconv.Atoi", - // strconv.ParseBool — string-to-bool conversion; pure function, no I/O. - "strconv.ParseBool", - // strconv.Itoa — int-to-string conversion; pure function, no I/O. - "strconv.Itoa", // strconv.ErrRange — sentinel error value for overflow; pure constant. "strconv.ErrRange", + // strconv.IntSize — platform int size constant (32 or 64); pure constant, no I/O. + "strconv.IntSize", + // strconv.Itoa — int-to-string conversion; pure function, no I/O. + "strconv.Itoa", // strconv.NumError — error type for numeric conversion failures; pure type. "strconv.NumError", + // strconv.ParseBool — string-to-bool conversion; pure function, no I/O. + "strconv.ParseBool", // strconv.ParseFloat — string-to-float conversion; pure function, no I/O. "strconv.ParseFloat", // strconv.ParseInt — string-to-int conversion with base/bit-size; pure function, no I/O. "strconv.ParseInt", // strconv.ParseUint — string-to-unsigned-int conversion; pure function, no I/O. "strconv.ParseUint", + // syscall.EISDIR — errno constant for "is a directory"; pure constant, no I/O. + "syscall.EISDIR", + // syscall.Errno — error type wrapping an OS errno value; pure type, no I/O. + "syscall.Errno", // strconv.FormatInt — int-to-string conversion; pure function, no I/O. "strconv.FormatInt", // strings.HasPrefix — pure function for prefix matching; no I/O. diff --git a/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml b/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml new file mode 100644 index 00000000..d376b24c --- /dev/null +++ b/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml @@ -0,0 +1,16 @@ +# GNU wc applies width-7 padding for non-regular files (directories) even +# with a single column flag, when multiple files produce a total line. +description: wc -l with directory and file uses width-7 padding. +setup: + files: + - path: f.txt + content: "a\n" +input: + allowed_paths: ["$DIR"] + script: |+ + wc -l . f.txt +expect: + stdout: " 0 .\n 1 f.txt\n 1 total\n" + stderr: "wc: .: is a directory\n" + exit_code: 1 + skip_assert_against_bash: true From cd238179dba94d648ea8c9c1461d2383edd84082 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:07:09 +0100 Subject: [PATCH 15/28] [iter 1] Fix CI: move skip_assert_against_bash to top level and remove unused allowlist symbol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: - dir_single_col_width7.yaml had skip_assert_against_bash nested under expect: instead of at the scenario top level, so the bash comparison test was not skipping it. GNU wc outputs "Is a directory" (capital I) from strerror(EISDIR), while our shell normalizes to lowercase "is a directory" — this is an intentional divergence, so the skip is correct. - Remove unicode.IsControl from builtinAllowedSymbols since no builtin uses it (wc uses unicode.Is(unicode.Cc, ...) instead). Co-Authored-By: Claude Opus 4.6 --- tests/allowed_symbols_test.go | 2 -- tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index a741f69f..996a91ba 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -162,8 +162,6 @@ var builtinAllowedSymbols = []string{ "unicode.Co", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", - // unicode.IsControl — reports whether rune is a control character; pure function, no I/O. - "unicode.IsControl", // unicode.IsGraphic — reports whether rune is defined as a graphic character; pure function, no I/O. "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. diff --git a/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml b/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml index d376b24c..5a1fe6d0 100644 --- a/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml +++ b/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml @@ -1,6 +1,7 @@ # GNU wc applies width-7 padding for non-regular files (directories) even # with a single column flag, when multiple files produce a total line. description: wc -l with directory and file uses width-7 padding. +skip_assert_against_bash: true setup: files: - path: f.txt @@ -13,4 +14,3 @@ expect: stdout: " 0 .\n 1 f.txt\n 1 total\n" stderr: "wc: .: is a directory\n" exit_code: 1 - skip_assert_against_bash: true From b1666fd1585f5e51b5105391479abc4aa3abe5ab Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:16:03 +0100 Subject: [PATCH 16/28] [iter 2] Fix P3 findings: alphabetical grouping and magic number - Move strconv.FormatInt to be grouped with other strconv.* entries in allowed_symbols_test.go (was separated by syscall.* entries) - Replace magic number syscall.Errno(1) with named constant errnoERROR_INVALID_FUNCTION in wc_isdir_windows.go Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc_isdir_windows.go | 6 +++++- tests/allowed_symbols_test.go | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/interp/builtins/wc/wc_isdir_windows.go b/interp/builtins/wc/wc_isdir_windows.go index e731c4d3..fa0cd3fe 100644 --- a/interp/builtins/wc/wc_isdir_windows.go +++ b/interp/builtins/wc/wc_isdir_windows.go @@ -12,12 +12,16 @@ import ( "syscall" ) +// errnoERROR_INVALID_FUNCTION is the Windows errno for ERROR_INVALID_FUNCTION. +// Go's syscall package does not export this constant, so we define it here. +const errnoERROR_INVALID_FUNCTION = syscall.Errno(1) + // isErrIsDir reports whether err wraps the Windows equivalent of EISDIR. // On Windows, reading a directory handle returns ERROR_INVALID_FUNCTION (errno 1). func isErrIsDir(err error) bool { var errno syscall.Errno if errors.As(err, &errno) { - return errno == syscall.Errno(1) // ERROR_INVALID_FUNCTION + return errno == errnoERROR_INVALID_FUNCTION } return false } diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 996a91ba..72bdf1ea 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -138,12 +138,12 @@ var builtinAllowedSymbols = []string{ "strconv.ParseInt", // strconv.ParseUint — string-to-unsigned-int conversion; pure function, no I/O. "strconv.ParseUint", + // strconv.FormatInt — int-to-string conversion; pure function, no I/O. + "strconv.FormatInt", // syscall.EISDIR — errno constant for "is a directory"; pure constant, no I/O. "syscall.EISDIR", // syscall.Errno — error type wrapping an OS errno value; pure type, no I/O. "syscall.Errno", - // strconv.FormatInt — int-to-string conversion; pure function, no I/O. - "strconv.FormatInt", // strings.HasPrefix — pure function for prefix matching; no I/O. "strings.HasPrefix", // strings.IndexByte — finds byte in string; pure function, no I/O. From 89747ea55d6d196c6f927f27bfc70a6db91683c1 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:25:01 +0100 Subject: [PATCH 17/28] [iter 3] Fix wc -w miscounting unassigned Unicode codepoints (Cn) The fuzz differential test found that U+2FFF (an unassigned codepoint in the Cn category) was treated as transparent to word counting, but GNU wc counts it as a word character. Remove the special-case branch that skipped Cn codepoints so they now fall through to the word- counting else branch, matching GNU wc behaviour. Also remove unicode.Co and unicode.IsGraphic from the allowed-symbols allowlist since they are no longer referenced by any builtin. Add fuzz corpus entry for the failing input as a regression test. Co-Authored-By: Claude Opus 4.6 --- .../wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f | 2 ++ interp/builtins/wc/wc.go | 3 --- tests/allowed_symbols_test.go | 4 ---- 3 files changed, 2 insertions(+), 7 deletions(-) create mode 100644 interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f diff --git a/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f b/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f new file mode 100644 index 00000000..f63218f7 --- /dev/null +++ b/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f @@ -0,0 +1,2 @@ +go test fuzz v1 +string("\xe2\xbf\xbf") diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 12eecf3e..54112250 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,9 +343,6 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else if !unicode.IsGraphic(ch) && !unicode.Is(unicode.Cf, ch) && !unicode.Is(unicode.Co, ch) { - // Cn (unassigned codepoints): transparent like control chars -- - // they do not start or end words, matching GNU wc under C.UTF-8. } else { if !inWord { c.words++ diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 72bdf1ea..91b47cc9 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -158,12 +158,8 @@ var builtinAllowedSymbols = []string{ "unicode.Cc", // unicode.Cf — format character category range table; pure data, no I/O. "unicode.Cf", - // unicode.Co — private-use character category range table; pure data, no I/O. - "unicode.Co", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", - // unicode.IsGraphic — reports whether rune is defined as a graphic character; pure function, no I/O. - "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. "unicode.Zs", // unicode.Me — enclosing mark category range table; pure data, no I/O. From a485936f17dd63b68f5701c634fbe578f8b456a6 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:30:36 +0100 Subject: [PATCH 18/28] [iter 4] Add skip_assert_against_bash for empty filename wc test The rshell wc command returns "invalid zero-length file name" for empty filename arguments, which intentionally diverges from GNU wc's OS-level error message. Mark the scenario to skip bash comparison. Co-Authored-By: Claude Opus 4.6 --- tests/scenarios/cmd/wc/errors/empty_filename.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/scenarios/cmd/wc/errors/empty_filename.yaml b/tests/scenarios/cmd/wc/errors/empty_filename.yaml index e4d92bce..a00ce0ca 100644 --- a/tests/scenarios/cmd/wc/errors/empty_filename.yaml +++ b/tests/scenarios/cmd/wc/errors/empty_filename.yaml @@ -1,4 +1,5 @@ description: wc exits 1 and prints error for empty filename argument. +skip_assert_against_bash: true # rshell returns a clearer error than GNU wc for empty filenames input: script: |+ wc "" From 680a74261e9c4d6adc1c5f67b58dc6dcfe994c01 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:36:03 +0100 Subject: [PATCH 19/28] [iter 5] Fix fuzz corpus type mismatch in FuzzWcDifferentialWords The corpus file cd24dde99d3a6e0f used string() instead of []byte(), causing a type mismatch error that failed the "Test against Bash (Docker)" CI job. Co-Authored-By: Claude Opus 4.6 --- .../wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f b/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f index f63218f7..ff2def96 100644 --- a/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f +++ b/interp/builtins/tests/wc/testdata/fuzz/FuzzWcDifferentialWords/cd24dde99d3a6e0f @@ -1,2 +1,2 @@ go test fuzz v1 -string("\xe2\xbf\xbf") +[]byte("\xe2\xbf\xbf") From 9232dc95c7bfc019b4d936ecbe69713c84445062 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:43:30 +0100 Subject: [PATCH 20/28] [iter 6] Fix wc -w counting unassigned Unicode code points as words GNU wc in C.UTF-8 locale does not count non-graphic characters (such as unassigned code points like U+89249) as word characters. Our implementation was falling through to the word-counting else branch for any character that wasn't a control char or space separator. Now we check unicode.IsGraphic before counting a character as part of a word, matching GNU wc behaviour. Found by FuzzWcDifferentialWords corpus entry 1c6e2e9cd7371f3e. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 7 ++++++- tests/allowed_symbols_test.go | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 54112250..965c52ed 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,12 +343,17 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else { + } else if unicode.IsGraphic(ch) { if !inWord { c.words++ inWord = true } lineLen += int64(runeWidth(ch)) + } else { + // Non-graphic, non-control, non-space characters + // (e.g. unassigned code points) are transparent to + // word counting, matching GNU wc in C.UTF-8 locale. + lineLen += int64(runeWidth(ch)) } } } diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 91b47cc9..1a2ae8e3 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -160,6 +160,8 @@ var builtinAllowedSymbols = []string{ "unicode.Cf", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", + // unicode.IsGraphic — checks if rune is a graphic character; pure function, no I/O. + "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. "unicode.Zs", // unicode.Me — enclosing mark category range table; pure data, no I/O. From 38cc6ec06c41f03c7c9f6a7036aa0e89fe3f477e Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:50:05 +0100 Subject: [PATCH 21/28] [iter 7] Fix wc -w to count non-graphic characters as word constituents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GNU wc uses iswspace() to determine word boundaries: any character that is not a space starts or continues a word. The previous fix using unicode.IsGraphic() was too restrictive — it excluded unassigned code points (Cn category, e.g. U+2FFF) from word counting, treating them as transparent. This caused the fuzz differential test to fail because GNU wc counts "\u2fff" as 1 word while rshell counted 0. Replace the IsGraphic guard with a plain else branch so that all characters not already handled (whitespace, control) are treated as word constituents. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 965c52ed..3d3d407e 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,17 +343,17 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else if unicode.IsGraphic(ch) { + } else { + // Any character that is not whitespace or control + // starts or continues a word, matching GNU wc which + // uses iswspace() in C.UTF-8 locale. This includes + // graphic characters, unassigned code points (Cn), + // and any other non-space, non-control runes. if !inWord { c.words++ inWord = true } lineLen += int64(runeWidth(ch)) - } else { - // Non-graphic, non-control, non-space characters - // (e.g. unassigned code points) are transparent to - // word counting, matching GNU wc in C.UTF-8 locale. - lineLen += int64(runeWidth(ch)) } } } From 104f04a8ba78a3a5a33750283d93c1df28f27fef Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 17:54:26 +0100 Subject: [PATCH 22/28] [iter 8] Remove unused unicode.IsGraphic from allowed symbols list Co-Authored-By: Claude Opus 4.6 --- tests/allowed_symbols_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 1a2ae8e3..91b47cc9 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -160,8 +160,6 @@ var builtinAllowedSymbols = []string{ "unicode.Cf", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", - // unicode.IsGraphic — checks if rune is a graphic character; pure function, no I/O. - "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. "unicode.Zs", // unicode.Me — enclosing mark category range table; pure data, no I/O. From 67de03724dea3353f2f6abcd82e187c4a05f4876 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 18:04:25 +0100 Subject: [PATCH 23/28] [iter 9] Fix wc -w to not count unassigned codepoints as words GNU wc 8.32 gates word counting on iswprint(), which returns false for unassigned Unicode codepoints (Cn category). Our implementation was treating all non-whitespace, non-control characters as word characters, including Cn codepoints. This caused the fuzz differential test to fail for input U+89249 (an unassigned codepoint) where GNU wc counts 0 words but rshell counted 1. Fix: only count characters as word-eligible if they are graphic (unicode.IsGraphic) or private-use (Co), matching glibc's iswprint() behaviour. Non-printable characters like Cn are now transparent to word counting (they neither start nor end words). Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 3d3d407e..f94d7f11 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,17 +343,24 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else { - // Any character that is not whitespace or control - // starts or continues a word, matching GNU wc which - // uses iswspace() in C.UTF-8 locale. This includes - // graphic characters, unassigned code points (Cn), - // and any other non-space, non-control runes. + } else if unicode.IsGraphic(ch) || unicode.Is(unicode.Co, ch) { + // Printable characters start or continue a word, + // matching GNU wc 8.32 which gates word counting + // on iswprint() in C.UTF-8 locale. IsGraphic + // covers letters, marks, numbers, punctuation, and + // symbols; Co adds private-use characters that + // glibc considers printable. if !inWord { c.words++ inWord = true } lineLen += int64(runeWidth(ch)) + } else { + // Non-printable, non-whitespace, non-control chars + // (e.g. unassigned Cn, format Cf not caught above) + // are transparent to word counting — they neither + // start nor end words, matching GNU wc behaviour. + lineLen += int64(runeWidth(ch)) } } } From 18bc6b1c825597c5278a280dda26d9d5208cc74a Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 18:05:17 +0100 Subject: [PATCH 24/28] [iter 9] Add unicode.IsGraphic and unicode.Co back to allowed symbols These are needed by the updated wc word counting logic that uses IsGraphic || Co to match glibc's iswprint() behavior. Co-Authored-By: Claude Opus 4.6 --- tests/allowed_symbols_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 91b47cc9..27367696 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -158,8 +158,12 @@ var builtinAllowedSymbols = []string{ "unicode.Cc", // unicode.Cf — format character category range table; pure data, no I/O. "unicode.Cf", + // unicode.Co — private use area category range table; pure data, no I/O. + "unicode.Co", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", + // unicode.IsGraphic — checks if rune is a graphic character; pure function, no I/O. + "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. "unicode.Zs", // unicode.Me — enclosing mark category range table; pure data, no I/O. From 00556df0d37306475208c02217d698ecbd49cf50 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 18:13:33 +0100 Subject: [PATCH 25/28] [iter 10] Fix wc -w by matching glibc iswprint() exactly for word boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace IsGraphic || Co check with glibcIswprint() that precisely mirrors glibc's C.UTF-8 iswprint() semantics: everything is printable (and thus a word character) EXCEPT control chars (Cc), surrogates, and Unicode noncharacters (U+FDD0..U+FDEF, U+xFFFE/U+xFFFF). This handles unassigned codepoints like U+2FFF correctly — glibc treats them as printable, so they should count as words. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 44 ++++++++++++++++++++++++++++------- tests/allowed_symbols_test.go | 4 ---- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index f94d7f11..17fd5a90 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,13 +343,14 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else if unicode.IsGraphic(ch) || unicode.Is(unicode.Co, ch) { + } else if glibcIswprint(ch) { // Printable characters start or continue a word, // matching GNU wc 8.32 which gates word counting - // on iswprint() in C.UTF-8 locale. IsGraphic - // covers letters, marks, numbers, punctuation, and - // symbols; Co adds private-use characters that - // glibc considers printable. + // on iswprint() in C.UTF-8 locale. We use + // glibcIswprint to match glibc's permissive + // definition: everything except controls (Cc), + // surrogates, and Unicode noncharacters is + // considered printable. if !inWord { c.words++ inWord = true @@ -357,9 +358,10 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { lineLen += int64(runeWidth(ch)) } else { // Non-printable, non-whitespace, non-control chars - // (e.g. unassigned Cn, format Cf not caught above) - // are transparent to word counting — they neither - // start nor end words, matching GNU wc behaviour. + // (Unicode noncharacters like U+FDD0..U+FDEF, + // U+xFFFE, U+xFFFF) are transparent to word + // counting — they neither start nor end words, + // matching GNU wc behaviour. lineLen += int64(runeWidth(ch)) } } @@ -403,6 +405,32 @@ func fieldWidth(total counts, opts options) int { return w } +// glibcIswprint reports whether r is considered printable by glibc's +// iswprint() in the C.UTF-8 locale. This is much more permissive than +// Go's unicode.IsPrint or unicode.IsGraphic: glibc treats everything as +// printable EXCEPT control characters (Cc), surrogates (U+D800..U+DFFF), +// and Unicode noncharacters (U+FDD0..U+FDEF, U+xFFFE, U+xFFFF). +// In particular, unassigned codepoints (Cn) that are not noncharacters +// ARE considered printable. +func glibcIswprint(r rune) bool { + // Control characters are not printable. + if unicode.Is(unicode.Cc, r) { + return false + } + // Surrogates are not valid runes in Go, but guard anyway. + if r >= 0xD800 && r <= 0xDFFF { + return false + } + // Unicode noncharacters are not printable. + if r >= 0xFDD0 && r <= 0xFDEF { + return false + } + if r&0xFFFF == 0xFFFE || r&0xFFFF == 0xFFFF { + return false + } + return true +} + // runeWidth returns the display width of a rune following wcwidth(3) rules: // 0 for controls, combining marks, and format chars; 2 for East Asian // Wide/Fullwidth; 1 for everything else. diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 27367696..91b47cc9 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -158,12 +158,8 @@ var builtinAllowedSymbols = []string{ "unicode.Cc", // unicode.Cf — format character category range table; pure data, no I/O. "unicode.Cf", - // unicode.Co — private use area category range table; pure data, no I/O. - "unicode.Co", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", - // unicode.IsGraphic — checks if rune is a graphic character; pure function, no I/O. - "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. "unicode.Zs", // unicode.Me — enclosing mark category range table; pure data, no I/O. From c379b926d4fa5d46f8927dc281ade3fc0e5334e6 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 18:26:34 +0100 Subject: [PATCH 26/28] [iter 10] Fix wc -w with Unicode 15.1 supplement table for word boundaries The fuzz differential test runs on Ubuntu 24.04 (glibc 2.39, Unicode 15.1) but Go's unicode package ships Unicode 15.0. Characters assigned in Unicode 15.1 (e.g. U+2FFC-U+2FFF Ideographic Description Characters, U+31EF, CJK Extension I) are printable in glibc but unknown to Go's unicode.IsGraphic, causing word count mismatches. Replace the over-broad glibcIswprint (which treated ALL non-control, non-noncharacter codepoints as printable -- wrong for U+89249) with the original IsGraphic || Co check plus a targeted unicode151Print range table covering the specific Unicode 15.1 additions. This table can be removed once Go upgrades to Unicode 15.1+ (Go 1.27). Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 64 ++++++++++++++++------------------- tests/allowed_symbols_test.go | 4 +++ 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 17fd5a90..ad70dcbb 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,14 +343,16 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else if glibcIswprint(ch) { + } else if unicode.IsGraphic(ch) || unicode.Is(unicode.Co, ch) || unicode.Is(unicode151Print, ch) { // Printable characters start or continue a word, - // matching GNU wc 8.32 which gates word counting - // on iswprint() in C.UTF-8 locale. We use - // glibcIswprint to match glibc's permissive - // definition: everything except controls (Cc), - // surrogates, and Unicode noncharacters is - // considered printable. + // matching GNU wc which gates word counting on + // iswprint() in C.UTF-8 locale. IsGraphic covers + // letters, marks, numbers, punctuation, and + // symbols from Unicode 15.0; Co adds private-use + // characters; unicode151Print adds characters + // assigned in Unicode 15.1 (e.g. new Ideographic + // Description Characters) that Go's tables don't + // yet include (Go ships Unicode 15.0). if !inWord { c.words++ inWord = true @@ -358,10 +360,9 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { lineLen += int64(runeWidth(ch)) } else { // Non-printable, non-whitespace, non-control chars - // (Unicode noncharacters like U+FDD0..U+FDEF, - // U+xFFFE, U+xFFFF) are transparent to word - // counting — they neither start nor end words, - // matching GNU wc behaviour. + // (e.g. unassigned Cn, format Cf not caught above) + // are transparent to word counting — they neither + // start nor end words, matching GNU wc behaviour. lineLen += int64(runeWidth(ch)) } } @@ -405,30 +406,23 @@ func fieldWidth(total counts, opts options) int { return w } -// glibcIswprint reports whether r is considered printable by glibc's -// iswprint() in the C.UTF-8 locale. This is much more permissive than -// Go's unicode.IsPrint or unicode.IsGraphic: glibc treats everything as -// printable EXCEPT control characters (Cc), surrogates (U+D800..U+DFFF), -// and Unicode noncharacters (U+FDD0..U+FDEF, U+xFFFE, U+xFFFF). -// In particular, unassigned codepoints (Cn) that are not noncharacters -// ARE considered printable. -func glibcIswprint(r rune) bool { - // Control characters are not printable. - if unicode.Is(unicode.Cc, r) { - return false - } - // Surrogates are not valid runes in Go, but guard anyway. - if r >= 0xD800 && r <= 0xDFFF { - return false - } - // Unicode noncharacters are not printable. - if r >= 0xFDD0 && r <= 0xFDEF { - return false - } - if r&0xFFFF == 0xFFFE || r&0xFFFF == 0xFFFF { - return false - } - return true +// unicode151Print covers characters assigned in Unicode 15.1 that are +// printable (graphic) but absent from Go's unicode package (Unicode 15.0). +// CI runs GNU wc linked against glibc ≥ 2.39 (Ubuntu 24.04) which uses +// Unicode 15.1+ character data, so these codepoints must be treated as +// word characters to match GNU wc output. +// +// This table can be removed once Go's unicode package is updated to +// Unicode 15.1 or later (tracked in https://github.com/golang/go/issues/65141, +// expected in Go 1.27). +var unicode151Print = &unicode.RangeTable{ + R16: []unicode.Range16{ + {0x2FFC, 0x2FFF, 1}, // Ideographic Description Characters (4 new IDCs) + {0x31EF, 0x31EF, 1}, // Ideographic Description Character OVERLAID + }, + R32: []unicode.Range32{ + {0x2EBF0, 0x2EE5D, 1}, // CJK Unified Ideographs Extension I + }, } // runeWidth returns the display width of a rune following wcwidth(3) rules: diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index 1ae7114f..819abfc0 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -160,8 +160,12 @@ var builtinAllowedSymbols = []string{ "unicode.Cc", // unicode.Cf — format character category range table; pure data, no I/O. "unicode.Cf", + // unicode.Co — private use area category range table; pure data, no I/O. + "unicode.Co", // unicode.Is — checks if rune belongs to a range table; pure function, no I/O. "unicode.Is", + // unicode.IsGraphic — checks if rune is a graphic character; pure function, no I/O. + "unicode.IsGraphic", // unicode.Zs — Unicode space separator category range table; pure data, no I/O. "unicode.Zs", // unicode.Me — enclosing mark category range table; pure data, no I/O. From 8e957c9a5655e12a476e5c3be849665a2328caae Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 18:34:00 +0100 Subject: [PATCH 27/28] [iter 10] Add Cf (format characters) to wc -w word character check The fuzz differential test found that U+06DD (ARABIC END OF AYAH, a Cf format character) is counted as a word character by GNU wc on Ubuntu 24.04 because glibc's iswprint() in C.UTF-8 returns true for all Cf characters. Go's unicode.IsGraphic excludes Cf, so add an explicit unicode.Cf check to match GNU wc behaviour. Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index ad70dcbb..6f79747f 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -343,16 +343,18 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { // matching GNU wc behaviour under C.UTF-8 locale. lineLen++ inWord = false - } else if unicode.IsGraphic(ch) || unicode.Is(unicode.Co, ch) || unicode.Is(unicode151Print, ch) { + } else if unicode.IsGraphic(ch) || unicode.Is(unicode.Co, ch) || unicode.Is(unicode.Cf, ch) || unicode.Is(unicode151Print, ch) { // Printable characters start or continue a word, // matching GNU wc which gates word counting on // iswprint() in C.UTF-8 locale. IsGraphic covers // letters, marks, numbers, punctuation, and - // symbols from Unicode 15.0; Co adds private-use - // characters; unicode151Print adds characters - // assigned in Unicode 15.1 (e.g. new Ideographic - // Description Characters) that Go's tables don't - // yet include (Go ships Unicode 15.0). + // symbols; Co adds private-use characters; Cf adds + // format characters (e.g. U+06DD ARABIC END OF + // AYAH, U+200B ZERO WIDTH SPACE) which glibc's + // iswprint considers printable; unicode151Print + // adds characters assigned in Unicode 15.1 that + // Go's tables don't yet include (Go ships + // Unicode 15.0). if !inWord { c.words++ inWord = true @@ -360,9 +362,9 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { lineLen += int64(runeWidth(ch)) } else { // Non-printable, non-whitespace, non-control chars - // (e.g. unassigned Cn, format Cf not caught above) - // are transparent to word counting — they neither - // start nor end words, matching GNU wc behaviour. + // (e.g. unassigned Cn codepoints) are transparent + // to word counting — they neither start nor end + // words, matching GNU wc behaviour. lineLen += int64(runeWidth(ch)) } } From 761faba0e49369ae627deff29b1233cc110fd5e6 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Fri, 13 Mar 2026 20:00:03 +0100 Subject: [PATCH 28/28] [iter 1] Address PR review comments: numCols helper, Cn width fix, skip comment - Extract numCols() helper method on options to reduce verbosity (P3 self-review) - Fix unassigned (Cn) codepoints contributing width to -L max-line-length; GNU wc treats them as non-printable (wcwidth=-1, width 0) - Add inline comment explaining skip_assert_against_bash in dir_single_col_width7.yaml (P3 self-review) Co-Authored-By: Claude Opus 4.6 --- interp/builtins/wc/wc.go | 45 ++++++++++--------- .../cmd/wc/errors/dir_single_col_width7.yaml | 2 +- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/interp/builtins/wc/wc.go b/interp/builtins/wc/wc.go index 6f79747f..bb37215f 100644 --- a/interp/builtins/wc/wc.go +++ b/interp/builtins/wc/wc.go @@ -89,6 +89,27 @@ type options struct { showMaxLineLen bool } +// numCols returns the number of output columns that will be printed. +func (o options) numCols() int { + n := 0 + if o.showLines { + n++ + } + if o.showWords { + n++ + } + if o.showChars { + n++ + } + if o.showBytes { + n++ + } + if o.showMaxLineLen { + n++ + } + return n +} + func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { help := fs.Bool("help", false, "print usage and exit") lines := fs.BoolP("lines", "l", false, "print the newline counts") @@ -192,23 +213,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { // with a single column (e.g. wc -l dir file). When only a single // column is active with a single file, the width is determined // solely by the count values. - numCols := 0 - if opts.showLines { - numCols++ - } - if opts.showWords { - numCols++ - } - if opts.showChars { - numCols++ - } - if opts.showBytes { - numCols++ - } - if opts.showMaxLineLen { - numCols++ - } - if hasNonRegular && (numCols >= 2 || len(files) > 1) && width < nonRegularMinWidth { + if hasNonRegular && (opts.numCols() >= 2 || len(files) > 1) && width < nonRegularMinWidth { width = nonRegularMinWidth } @@ -363,9 +368,9 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) { } else { // Non-printable, non-whitespace, non-control chars // (e.g. unassigned Cn codepoints) are transparent - // to word counting — they neither start nor end - // words, matching GNU wc behaviour. - lineLen += int64(runeWidth(ch)) + // to both word counting and line length — they + // neither start nor end words, and GNU wc treats + // them as non-printable (wcwidth=-1, width 0). } } } diff --git a/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml b/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml index 5a1fe6d0..4ed13f46 100644 --- a/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml +++ b/tests/scenarios/cmd/wc/errors/dir_single_col_width7.yaml @@ -1,7 +1,7 @@ # GNU wc applies width-7 padding for non-regular files (directories) even # with a single column flag, when multiple files produce a total line. description: wc -l with directory and file uses width-7 padding. -skip_assert_against_bash: true +skip_assert_against_bash: true # stderr format differs from GNU wc (PortableErr normalization) setup: files: - path: f.txt