Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
d35e984
remove wc
AlexandreYang Mar 12, 2026
63e05a7
Revert "remove wc"
AlexandreYang Mar 12, 2026
5d8527e
[iter 1] Fix \v and \f display width for wc -L, rename shadowed variable
AlexandreYang Mar 12, 2026
d711e5a
[iter 2] Fix control characters incorrectly counted as word characters
AlexandreYang Mar 12, 2026
92849a4
[iter 3] Fix \r/\f max preservation and stdin min width in wc
AlexandreYang Mar 12, 2026
78af0ea
[iter 4] Add -L flag to wc documentation in SHELL_FEATURES.md
AlexandreYang Mar 12, 2026
101d222
[iter 1] Fix -h flag, files0_from skip, and directory count line
AlexandreYang Mar 12, 2026
4436c58
[iter 1] Add missing windows build tag to wc_isdir_windows.go
AlexandreYang Mar 12, 2026
5e27085
[iter 1] Fix empty filename handling in wc and add syscall to allowlist
AlexandreYang Mar 12, 2026
3f233ff
[iter 1] Match GNU wc error message for empty filename argument
AlexandreYang Mar 12, 2026
64863db
[iter 1] Remove unused os.ErrNotExist and os.PathError from builtinAl…
AlexandreYang Mar 12, 2026
9e648d3
[iter 2] Fix directory zero-count width and add \v word-separator tests
AlexandreYang Mar 12, 2026
a3fe205
[iter 4] Fix width-7 padding for non-regular input with explicit mult…
AlexandreYang Mar 12, 2026
35d90c3
Merge origin/main into alex/review_wc3
AlexandreYang Mar 13, 2026
d33214b
[iter 1] Fix width-7 padding for single-column multi-file with non-re…
AlexandreYang Mar 13, 2026
cd23817
[iter 1] Fix CI: move skip_assert_against_bash to top level and remov…
AlexandreYang Mar 13, 2026
b1666fd
[iter 2] Fix P3 findings: alphabetical grouping and magic number
AlexandreYang Mar 13, 2026
89747ea
[iter 3] Fix wc -w miscounting unassigned Unicode codepoints (Cn)
AlexandreYang Mar 13, 2026
a485936
[iter 4] Add skip_assert_against_bash for empty filename wc test
AlexandreYang Mar 13, 2026
680a742
[iter 5] Fix fuzz corpus type mismatch in FuzzWcDifferentialWords
AlexandreYang Mar 13, 2026
9232dc9
[iter 6] Fix wc -w counting unassigned Unicode code points as words
AlexandreYang Mar 13, 2026
38cc6ec
[iter 7] Fix wc -w to count non-graphic characters as word constituents
AlexandreYang Mar 13, 2026
104f04a
[iter 8] Remove unused unicode.IsGraphic from allowed symbols list
AlexandreYang Mar 13, 2026
67de037
[iter 9] Fix wc -w to not count unassigned codepoints as words
AlexandreYang Mar 13, 2026
18bc6b1
[iter 9] Add unicode.IsGraphic and unicode.Co back to allowed symbols
AlexandreYang Mar 13, 2026
00556df
[iter 10] Fix wc -w by matching glibc iswprint() exactly for word bou…
AlexandreYang Mar 13, 2026
978f0be
Merge remote-tracking branch 'origin/main' into alex/review_wc3
thieman Mar 13, 2026
c379b92
[iter 10] Fix wc -w with Unicode 15.1 supplement table for word bound…
AlexandreYang Mar 13, 2026
8e957c9
[iter 10] Add Cf (format characters) to wc -w word character check
AlexandreYang Mar 13, 2026
761faba
[iter 1] Address PR review comments: numCols helper, Cn width fix, sk…
AlexandreYang Mar 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion SHELL_FEATURES.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Blocked features are rejected before execution with exit code 2.
- ✅ `tr [-cdsCt] SET1 [SET2]` — translate, squeeze, and/or delete characters from stdin
- ✅ `true` — return exit code 0
- ✅ `uniq [OPTION]... [INPUT]` — report or omit repeated lines
- ✅ `wc [-l] [-w] [-c] [-m] [FILE]...` — count lines, words, bytes, or characters in files
- ✅ `wc [-l] [-w] [-c] [-m] [-L] [FILE]...` — count lines, words, bytes, characters, or max line length
- ❌ All other commands — return exit code 127 with `<cmd>: not found` unless an ExecHandler is configured

## Variables
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
go test fuzz v1
[]byte("\xe2\xbf\xbf")
133 changes: 110 additions & 23 deletions interp/builtins/wc/wc.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
// -L, --max-line-length
// Print the length of the longest line.
//
// -h, --help
// --help
// Print this usage message to stdout and exit 0.
//
// Output columns always appear in a fixed order: lines, words, chars,
Expand All @@ -57,6 +57,7 @@ package wc

import (
"context"
"errors"
"io"
"os"
"strconv"
Expand All @@ -69,8 +70,8 @@ import (
// Cmd is the wc builtin command descriptor.
var Cmd = builtins.Command{Name: "wc", MakeFlags: registerFlags}

const chunkSize = 32 * 1024 // 32 KiB read buffer
const stdinMinWidth = 7 // GNU wc minimum column width for stdin
const chunkSize = 32 * 1024 // 32 KiB read buffer
const nonRegularMinWidth = 7 // GNU wc minimum column width for non-regular files

type counts struct {
lines int64
Expand All @@ -88,8 +89,29 @@ type options struct {
showMaxLineLen bool
}

// numCols reports how many count columns the selected options will print:
// one for each of lines, words, chars, bytes, and max line length that is
// enabled.
func (o options) numCols() int {
	enabled := [...]bool{
		o.showLines,
		o.showWords,
		o.showChars,
		o.showBytes,
		o.showMaxLineLen,
	}
	cols := 0
	for _, on := range enabled {
		if on {
			cols++
		}
	}
	return cols
}

func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc {
help := fs.BoolP("help", "h", false, "print usage and exit")
help := fs.Bool("help", false, "print usage and exit")
lines := fs.BoolP("lines", "l", false, "print the newline counts")
words := fs.BoolP("words", "w", false, "print the word counts")
bytesFlag := fs.BoolP("bytes", "c", false, "print the byte counts")
Expand Down Expand Up @@ -147,6 +169,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc {
c counts
}
results := make([]fileResult, 0, len(files))
hasNonRegular := hasStdin // stdin (pipe) is non-regular

for _, file := range files {
if ctx.Err() != nil {
Expand All @@ -158,11 +181,18 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc {
if file == "-" {
name = "standard input"
}
callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err))
if name == "" {
callCtx.Errf("wc: %s\n", callCtx.PortableErr(err))
} else {
callCtx.Errf("wc: %s: %s\n", name, callCtx.PortableErr(err))
}
failed = true
if c == (counts{}) {
// GNU wc prints a zero count line for directories but not
// for missing files or other open errors.
if !isErrIsDir(err) {
continue
}
hasNonRegular = true
}
results = append(results, fileResult{name: file, c: c})
total.lines += c.lines
Expand All @@ -175,8 +205,16 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc {
}

width := fieldWidth(total, opts)
if hasStdin && width < stdinMinWidth {
width = stdinMinWidth
// GNU wc uses a minimum column width of 7 for non-regular files
// (stdin pipes, directories, devices, etc.) when two or more
// columns are printed — whether in default mode or with explicit
// multi-column flags (e.g. wc -lw). GNU also applies this minimum
// when multiple files are processed (a total line is printed), even
// with a single column (e.g. wc -l dir file). When only a single
// column is active with a single file, the width is determined
// solely by the count values.
if hasNonRegular && (opts.numCols() >= 2 || len(files) > 1) && width < nonRegularMinWidth {
width = nonRegularMinWidth
}

for _, fr := range results {
Expand All @@ -199,6 +237,9 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc {
}

func countFile(ctx context.Context, callCtx *builtins.CallContext, path string) (counts, error) {
if path == "" {
return counts{}, errors.New("invalid zero-length file name")
}
var rc io.ReadCloser
if path == "-" {
if callCtx.Stdin == nil {
Expand Down Expand Up @@ -262,47 +303,74 @@ func countReader(ctx context.Context, r io.Reader) (counts, error) {
c.bytes -= int64(carryN)

for i := 0; i < len(chunk); {
r, size := utf8.DecodeRune(chunk[i:])
ch, size := utf8.DecodeRune(chunk[i:])
i += size
Comment thread
AlexandreYang marked this conversation as resolved.
// Invalid UTF-8 byte: not a character in C.UTF-8 locale.
// Skip entirely — no char count, no word effect.
if r == utf8.RuneError && size == 1 {
if ch == utf8.RuneError && size == 1 {
continue
}
c.chars++
if r == '\n' {
if ch == '\n' {
c.lines++
if lineLen > c.maxLineLen {
c.maxLineLen = lineLen
}
lineLen = 0
inWord = false
} else if r == '\r' {
} else if ch == '\r' {
if lineLen > c.maxLineLen {
c.maxLineLen = lineLen
}
Comment thread
AlexandreYang marked this conversation as resolved.
lineLen = 0
Comment thread
AlexandreYang marked this conversation as resolved.
inWord = false
} else if r == '\t' {
} else if ch == '\t' {
lineLen = (lineLen/8 + 1) * 8
inWord = false
} else if r == ' ' || r == '\v' || r == '\f' {
} else if ch == ' ' {
lineLen++
Comment thread
AlexandreYang marked this conversation as resolved.
inWord = false
Comment thread
AlexandreYang marked this conversation as resolved.
Comment thread
AlexandreYang marked this conversation as resolved.
Comment thread
AlexandreYang marked this conversation as resolved.
Comment thread
AlexandreYang marked this conversation as resolved.
Comment thread
AlexandreYang marked this conversation as resolved.
Comment thread
AlexandreYang marked this conversation as resolved.
Comment thread
AlexandreYang marked this conversation as resolved.
} else if unicode.IsControl(r) {
// Non-whitespace control chars (C0, DEL, C1) are transparent:
// they do not start or end words, matching GNU wc in POSIX locale.
} else if unicode.Is(unicode.Zs, r) {
} else if ch == '\f' {
if lineLen > c.maxLineLen {
c.maxLineLen = lineLen
}
lineLen = 0
Comment thread
AlexandreYang marked this conversation as resolved.
inWord = false
} else if ch == '\v' {
// vertical tab: zero display width, but breaks words
inWord = false
} else if unicode.Is(unicode.Cc, ch) {
// Control characters are transparent to word counting:
// they don't start or end words, matching GNU wc.
lineLen += int64(runeWidth(ch))
} else if unicode.Is(unicode.Zs, ch) {
// Unicode space separators (NBSP, thin space, etc.) end words,
// matching GNU wc behaviour under C.UTF-8 locale.
lineLen++
inWord = false
} else if !unicode.IsGraphic(r) && !unicode.Is(unicode.Cf, r) && !unicode.Is(unicode.Co, r) {
// Cn (unassigned codepoints): transparent like control chars --
// they do not start or end words, matching GNU wc under C.UTF-8.
} else {
} else if unicode.IsGraphic(ch) || unicode.Is(unicode.Co, ch) || unicode.Is(unicode.Cf, ch) || unicode.Is(unicode151Print, ch) {
// Printable characters start or continue a word,
// matching GNU wc which gates word counting on
// iswprint() in C.UTF-8 locale. IsGraphic covers
// letters, marks, numbers, punctuation, and
// symbols; Co adds private-use characters; Cf adds
// format characters (e.g. U+06DD ARABIC END OF
// AYAH, U+200B ZERO WIDTH SPACE) which glibc's
// iswprint considers printable; unicode151Print
// adds characters assigned in Unicode 15.1 that
// Go's tables don't yet include (Go ships
// Unicode 15.0).
if !inWord {
c.words++
inWord = true
}
lineLen += int64(runeWidth(r))
lineLen += int64(runeWidth(ch))
} else {
// Non-printable, non-whitespace, non-control chars
// (e.g. unassigned Cn codepoints) are transparent
// to both word counting and line length — they
// neither start nor end words, and GNU wc treats
// them as non-printable (wcwidth=-1, width 0).
}
Comment thread
AlexandreYang marked this conversation as resolved.
}
}
Expand Down Expand Up @@ -345,6 +413,25 @@ func fieldWidth(total counts, opts options) int {
return w
}

// unicode151Print covers characters assigned in Unicode 15.1 that are
// printable (graphic) but absent from Go's unicode package (Unicode 15.0).
// CI runs GNU wc linked against glibc ≥ 2.39 (Ubuntu 24.04) which uses
// Unicode 15.1+ character data, so these codepoints must be treated as
// word characters to match GNU wc output.
//
// The table is consulted with unicode.Is alongside unicode.IsGraphic when
// deciding whether a rune starts or continues a word. Every R16 entry lies
// above U+00FF, so the zero-value LatinOffset is correct.
//
// This table can be removed once Go's unicode package is updated to
// Unicode 15.1 or later (tracked in https://github.com/golang/go/issues/65141,
// expected in Go 1.27).
var unicode151Print = &unicode.RangeTable{
R16: []unicode.Range16{
{0x2FFC, 0x2FFF, 1}, // U+2FFC..U+2FFF: Ideographic Description Characters (4 new IDCs)
{0x31EF, 0x31EF, 1}, // U+31EF: Ideographic Description Character SUBTRACTION
},
R32: []unicode.Range32{
{0x2EBF0, 0x2EE5D, 1}, // U+2EBF0..U+2EE5D: CJK Unified Ideographs Extension I
},
}

// runeWidth returns the display width of a rune following wcwidth(3) rules:
// 0 for controls, combining marks, and format chars; 2 for East Asian
// Wide/Fullwidth; 1 for everything else.
Expand Down
102 changes: 99 additions & 3 deletions interp/builtins/wc/wc_gnu_compat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,21 +148,117 @@ func TestGNUCompatCharsMultibyte(t *testing.T) {
assert.Equal(t, "5 file.txt\n", stdout)
}

// TestGNUCompatControlCharIsWord — control byte \x01 does not count as a word.
// TestGNUCompatControlCharIsNotWord — control byte \x01 is transparent to word counting.
//
// GNU wc in POSIX locale treats C0 control characters as transparent:
// they neither start nor end words. Only printable chars form words.
//
// GNU command (Debian/Ubuntu POSIX locale): printf '\x01\n' | wc -w
// GNU command: printf '\x01\n' | gwc -w
// Expected: "0\n"
func TestGNUCompatControlCharIsWord(t *testing.T) {
func TestGNUCompatControlCharIsNotWord(t *testing.T) {
dir := t.TempDir()
writeFile(t, dir, "file.txt", "\x01\n")
stdout, _, code := cmdRun(t, "wc -w file.txt", dir)
assert.Equal(t, 0, code)
assert.Equal(t, "0 file.txt\n", stdout)
}
Comment thread
AlexandreYang marked this conversation as resolved.

// TestGNUCompatMaxLineLenVerticalTab checks -L on a line containing \v,
// which contributes zero display width.
//
// GNU command: printf 'a\vb\n' | wc -L
// Expected: "2\n" — only a(1) and b(1) add to the line length.
func TestGNUCompatMaxLineLenVerticalTab(t *testing.T) {
	tmp := t.TempDir()
	writeFile(t, tmp, "file.txt", "a\vb\n")

	out, _, exitCode := cmdRun(t, "wc -L file.txt", tmp)

	assert.Equal(t, 0, exitCode)
	assert.Equal(t, "2 file.txt\n", out)
}

// TestGNUCompatMaxLineLenFormFeed checks -L on a line containing \f, which
// resets the current line position.
//
// GNU command: printf 'abc\fdef\n' | wc -L
// Expected: "3\n" — each side of the \f is 3 wide, so the maximum is 3.
func TestGNUCompatMaxLineLenFormFeed(t *testing.T) {
	tmp := t.TempDir()
	writeFile(t, tmp, "file.txt", "abc\fdef\n")

	out, _, exitCode := cmdRun(t, "wc -L file.txt", tmp)

	assert.Equal(t, 0, exitCode)
	assert.Equal(t, "3 file.txt\n", out)
}

// TestGNUCompatMaxLineLenCRAsymmetric checks -L when the text before a \r is
// longer than the text after it.
//
// GNU command: printf 'abcdef\rxy\n' | wc -L
// Expected: "6\n" — \r resets the position but the earlier maximum of 6 is
// kept, so max(6, 2) = 6.
func TestGNUCompatMaxLineLenCRAsymmetric(t *testing.T) {
	tmp := t.TempDir()
	writeFile(t, tmp, "file.txt", "abcdef\rxy\n")

	out, _, exitCode := cmdRun(t, "wc -L file.txt", tmp)

	assert.Equal(t, 0, exitCode)
	assert.Equal(t, "6 file.txt\n", out)
}

// TestGNUCompatMaxLineLenFFAsymmetric checks -L when the text before a \f is
// longer than the text after it.
//
// GNU command: printf 'abcdef\fxy\n' | wc -L
// Expected: "6\n" — \f resets the position but the earlier maximum of 6 is
// kept, so max(6, 2) = 6.
func TestGNUCompatMaxLineLenFFAsymmetric(t *testing.T) {
	tmp := t.TempDir()
	writeFile(t, tmp, "file.txt", "abcdef\fxy\n")

	out, _, exitCode := cmdRun(t, "wc -L file.txt", tmp)

	assert.Equal(t, 0, exitCode)
	assert.Equal(t, "6 file.txt\n", out)
}

// TestGNUCompatDirectoryDefaultWidth checks that a directory operand gets
// width-7 column padding in default (lines, words, bytes) mode.
//
// GNU command: mkdir /tmp/d && wc /tmp/d
// Expected: "      0       0       0 .\n" — three zero counts, each
// right-aligned in a 7-character column, because a directory is a
// non-regular file. The command also reports an error on stderr and
// exits 1.
func TestGNUCompatDirectoryDefaultWidth(t *testing.T) {
	dir := t.TempDir()
	stdout, stderr, code := cmdRun(t, "wc .", dir)
	assert.Equal(t, 1, code)
	assert.Contains(t, stderr, "wc:")
	// Each count is padded to 7 columns and columns are separated by one space.
	assert.Equal(t, "      0       0       0 .\n", stdout)
}

// TestGNUCompatDirectoryExplicitFlag checks that a directory operand with a
// single explicit column flag is printed at width 1, not the width-7 minimum.
//
// GNU command: mkdir /tmp/d && wc -l /tmp/d
// Expected: "0 .\n" (width 1, explicit flag); an error goes to stderr and the
// exit code is 1.
func TestGNUCompatDirectoryExplicitFlag(t *testing.T) {
	tmp := t.TempDir()

	out, errOut, exitCode := cmdRun(t, "wc -l .", tmp)

	assert.Equal(t, 1, exitCode)
	assert.Contains(t, errOut, "wc:")
	assert.Equal(t, "0 .\n", out)
}

// TestGNUCompatVerticalTabWordsBreak checks that \v acts as a word
// delimiter for wc -w.
//
// GNU command: printf 'a\vb\n' | wc -w
// Expected: "2\n" — the \v splits "a" and "b" into separate words.
func TestGNUCompatVerticalTabWordsBreak(t *testing.T) {
	tmp := t.TempDir()
	writeFile(t, tmp, "file.txt", "a\vb\n")

	out, _, exitCode := cmdRun(t, "wc -w file.txt", tmp)

	assert.Equal(t, 0, exitCode)
	assert.Equal(t, "2 file.txt\n", out)
}

// TestGNUCompatVerticalTabThreeWords checks that two \v characters split the
// input into three words for wc -w.
//
// GNU command: printf 'a\vb\vc\n' | wc -w
// Expected: "3\n"
func TestGNUCompatVerticalTabThreeWords(t *testing.T) {
	tmp := t.TempDir()
	writeFile(t, tmp, "file.txt", "a\vb\vc\n")

	out, _, exitCode := cmdRun(t, "wc -w file.txt", tmp)

	assert.Equal(t, 0, exitCode)
	assert.Equal(t, "3 file.txt\n", out)
}

// TestGNUCompatRejectedFlag — unknown flag exits 1.
//
// GNU command: gwc --follow
Expand Down
18 changes: 18 additions & 0 deletions interp/builtins/wc/wc_isdir_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2026-present Datadog, Inc.

//go:build !windows

package wc

import (
"errors"
"syscall"
)

// isErrIsDir reports whether err, or any error it wraps, is an
// "is a directory" error (syscall.EISDIR). A nil err yields false.
func isErrIsDir(err error) bool {
	if err == nil {
		return false
	}
	var target error = syscall.EISDIR
	return errors.Is(err, target)
}
Loading
Loading