From a6ffbb71e1f968035e199197345edddc6d9d4d47 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 00:32:23 +0100 Subject: [PATCH 01/30] Implement sed builtin command with full script parser and safety hardening Co-Authored-By: Claude Opus 4.6 --- SHELL_FEATURES.md | 1 + interp/builtins/sed/sed.go | 1411 +++++++++++++++++ .../builtins/tests/sed/sed_hardening_test.go | 235 +++ interp/builtins/tests/sed/sed_pentest_test.go | 366 +++++ interp/builtins/tests/sed/sed_test.go | 607 +++++++ interp/register_builtins.go | 2 + tests/allowed_symbols_test.go | 4 + .../scenarios/cmd/sed/address/last_line.yaml | 13 + tests/scenarios/cmd/sed/address/line.yaml | 13 + tests/scenarios/cmd/sed/address/negation.yaml | 13 + tests/scenarios/cmd/sed/address/range.yaml | 13 + tests/scenarios/cmd/sed/address/regex.yaml | 13 + tests/scenarios/cmd/sed/address/step.yaml | 14 + tests/scenarios/cmd/sed/branch/basic.yaml | 13 + .../scenarios/cmd/sed/branch/conditional.yaml | 13 + tests/scenarios/cmd/sed/branch/label.yaml | 13 + tests/scenarios/cmd/sed/delete/basic.yaml | 13 + tests/scenarios/cmd/sed/delete/range.yaml | 13 + tests/scenarios/cmd/sed/edge/empty_file.yaml | 13 + .../cmd/sed/edge/no_trailing_newline.yaml | 13 + tests/scenarios/cmd/sed/edge/single_line.yaml | 13 + .../cmd/sed/errors/blocked_execute.yaml | 9 + .../cmd/sed/errors/blocked_inplace.yaml | 9 + .../cmd/sed/errors/blocked_read.yaml | 9 + .../cmd/sed/errors/blocked_write.yaml | 9 + .../cmd/sed/errors/invalid_regex.yaml | 8 + .../cmd/sed/errors/missing_file.yaml | 13 + tests/scenarios/cmd/sed/errors/no_script.yaml | 8 + tests/scenarios/cmd/sed/hold/append.yaml | 13 + tests/scenarios/cmd/sed/hold/copy.yaml | 13 + tests/scenarios/cmd/sed/hold/exchange.yaml | 13 + tests/scenarios/cmd/sed/multiple/multi_e.yaml | 13 + .../scenarios/cmd/sed/multiple/semicolon.yaml | 13 + tests/scenarios/cmd/sed/next/append_next.yaml | 13 + tests/scenarios/cmd/sed/next/basic.yaml | 13 + .../scenarios/cmd/sed/print/line_number.yaml | 13 
+ tests/scenarios/cmd/sed/print/suppress.yaml | 13 + .../scenarios/cmd/sed/print/unambiguous.yaml | 13 + tests/scenarios/cmd/sed/quit/basic.yaml | 13 + tests/scenarios/cmd/sed/quit/noprint.yaml | 13 + tests/scenarios/cmd/sed/stdin/dash.yaml | 8 + tests/scenarios/cmd/sed/stdin/pipe.yaml | 8 + .../cmd/sed/substitute/ampersand.yaml | 13 + .../cmd/sed/substitute/backreference.yaml | 13 + tests/scenarios/cmd/sed/substitute/basic.yaml | 13 + .../cmd/sed/substitute/case_insensitive.yaml | 13 + .../cmd/sed/substitute/delimiter.yaml | 13 + .../cmd/sed/substitute/empty_match.yaml | 13 + .../scenarios/cmd/sed/substitute/global.yaml | 13 + tests/scenarios/cmd/sed/substitute/nth.yaml | 13 + tests/scenarios/cmd/sed/text/append.yaml | 13 + tests/scenarios/cmd/sed/text/change.yaml | 13 + tests/scenarios/cmd/sed/text/insert.yaml | 13 + .../cmd/sed/transliterate/basic.yaml | 13 + .../cmd/unknown_cmd/common_progs/sed.yaml | 10 - 55 files changed, 3202 insertions(+), 10 deletions(-) create mode 100644 interp/builtins/sed/sed.go create mode 100644 interp/builtins/tests/sed/sed_hardening_test.go create mode 100644 interp/builtins/tests/sed/sed_pentest_test.go create mode 100644 interp/builtins/tests/sed/sed_test.go create mode 100644 tests/scenarios/cmd/sed/address/last_line.yaml create mode 100644 tests/scenarios/cmd/sed/address/line.yaml create mode 100644 tests/scenarios/cmd/sed/address/negation.yaml create mode 100644 tests/scenarios/cmd/sed/address/range.yaml create mode 100644 tests/scenarios/cmd/sed/address/regex.yaml create mode 100644 tests/scenarios/cmd/sed/address/step.yaml create mode 100644 tests/scenarios/cmd/sed/branch/basic.yaml create mode 100644 tests/scenarios/cmd/sed/branch/conditional.yaml create mode 100644 tests/scenarios/cmd/sed/branch/label.yaml create mode 100644 tests/scenarios/cmd/sed/delete/basic.yaml create mode 100644 tests/scenarios/cmd/sed/delete/range.yaml create mode 100644 tests/scenarios/cmd/sed/edge/empty_file.yaml create mode 100644 
tests/scenarios/cmd/sed/edge/no_trailing_newline.yaml create mode 100644 tests/scenarios/cmd/sed/edge/single_line.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_execute.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_inplace.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_read.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_write.yaml create mode 100644 tests/scenarios/cmd/sed/errors/invalid_regex.yaml create mode 100644 tests/scenarios/cmd/sed/errors/missing_file.yaml create mode 100644 tests/scenarios/cmd/sed/errors/no_script.yaml create mode 100644 tests/scenarios/cmd/sed/hold/append.yaml create mode 100644 tests/scenarios/cmd/sed/hold/copy.yaml create mode 100644 tests/scenarios/cmd/sed/hold/exchange.yaml create mode 100644 tests/scenarios/cmd/sed/multiple/multi_e.yaml create mode 100644 tests/scenarios/cmd/sed/multiple/semicolon.yaml create mode 100644 tests/scenarios/cmd/sed/next/append_next.yaml create mode 100644 tests/scenarios/cmd/sed/next/basic.yaml create mode 100644 tests/scenarios/cmd/sed/print/line_number.yaml create mode 100644 tests/scenarios/cmd/sed/print/suppress.yaml create mode 100644 tests/scenarios/cmd/sed/print/unambiguous.yaml create mode 100644 tests/scenarios/cmd/sed/quit/basic.yaml create mode 100644 tests/scenarios/cmd/sed/quit/noprint.yaml create mode 100644 tests/scenarios/cmd/sed/stdin/dash.yaml create mode 100644 tests/scenarios/cmd/sed/stdin/pipe.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/ampersand.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/backreference.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/basic.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/case_insensitive.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/delimiter.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/empty_match.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/global.yaml create mode 100644 
tests/scenarios/cmd/sed/substitute/nth.yaml create mode 100644 tests/scenarios/cmd/sed/text/append.yaml create mode 100644 tests/scenarios/cmd/sed/text/change.yaml create mode 100644 tests/scenarios/cmd/sed/text/insert.yaml create mode 100644 tests/scenarios/cmd/sed/transliterate/basic.yaml delete mode 100644 tests/scenarios/cmd/unknown_cmd/common_progs/sed.yaml diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index 40746388..0097dc95 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -15,6 +15,7 @@ Blocked features are rejected before execution with exit code 2. - ✅ `grep [-EFGivclLnHhoqsxw] [-e PATTERN] [-m NUM] [-A NUM] [-B NUM] [-C NUM] PATTERN [FILE]...` — print lines that match patterns; uses RE2 regex engine (linear-time, no backtracking) - ✅ `head [-n N|-c N] [-q|-v] [-z] [FILE]...` — output the first part of files (default: first 10 lines) - ✅ `ls [-1aAdFhlpRrSt] [FILE]...` — list directory contents +- ✅ `sed [-n] [-e SCRIPT] [-E|-r] [SCRIPT] [FILE]...` — stream editor for filtering and transforming text; uses RE2 regex engine; `-i`/`-f` rejected; `e`/`w`/`W`/`r`/`R` commands blocked - ✅ `strings [-a] [-n MIN] [-t o|d|x] [-o] [-f] [-s SEP] [FILE]...` — print printable character sequences in files (default min length 4); offsets via `-t`/`-o`; filename prefix via `-f`; custom separator via `-s` - ✅ `tail [-n N|-c N] [-q|-v] [-z] [FILE]...` — output the last part of files (default: last 10 lines); supports `+N` offset mode; `-f`/`--follow` is rejected - ✅ `true` — return exit code 0 diff --git a/interp/builtins/sed/sed.go b/interp/builtins/sed/sed.go new file mode 100644 index 00000000..c95a4acf --- /dev/null +++ b/interp/builtins/sed/sed.go @@ -0,0 +1,1411 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +// Package sed implements the sed builtin command. +// +// sed — stream editor for filtering and transforming text +// +// Usage: sed [OPTION]... [script] [FILE]... +// +// sed [OPTION]... -e script [-e script]... [FILE]... +// +// sed reads input files (or standard input if no files are given, or when +// FILE is -), applies editing commands from the script, and writes the +// result to standard output. +// +// Accepted flags: +// +// -n, --quiet, --silent +// Suppress automatic printing of pattern space. Only lines +// explicitly printed via the p command are output. +// +// -e script, --expression=script +// Add the script commands to the set of commands to execute. +// Multiple -e options are allowed; they are concatenated in order. +// +// -E, --regexp-extended +// Use extended regular expressions (ERE) rather than basic (BRE). +// +// -r +// GNU alias for -E (extended regular expressions). +// +// -h, --help +// Print this usage message to stdout and exit 0. +// +// Supported sed commands: +// +// s/regex/replacement/[flags] Substitute matches of regex with replacement. +// Flags: g (global), p (print), i/I (case-insensitive), +// N (replace Nth match). +// p Print the current pattern space. +// d Delete pattern space, start next cycle. +// q [code] Quit with optional exit code (prints pattern space first). +// Q [code] Quit with optional exit code (does not print). +// y/src/dst/ Transliterate characters from src to dst. +// a\text / a text Append text after the current line. +// i\text / i text Insert text before the current line. +// c\text / c text Replace line(s) with text. +// = Print the current line number. +// l Print pattern space unambiguously. +// n Read next input line into pattern space. +// N Append next input line to pattern space. +// h Copy pattern space to hold space. +// H Append pattern space to hold space. +// g Copy hold space to pattern space. +// G Append hold space to pattern space. +// x Exchange pattern and hold spaces. 
+// b [label] Branch to label (or end of script). +// : label Define a label for branching. +// t [label] Branch to label if s/// made a substitution. +// T [label] Branch to label if s/// did NOT make a substitution. +// {...} Group commands. +// !command Negate the address (apply to non-matching lines). +// +// Addressing: +// +// N Line number (1-based). +// $ Last line. +// /regex/ Lines matching regex. +// addr1,addr2 Range of lines. +// first~step Every step-th line starting from first (GNU extension). +// +// Rejected commands (blocked for safety): +// +// e Execute pattern space as shell command (blocked: command execution). +// w file Write pattern space to file (blocked: file write). +// W file Write first line to file (blocked: file write). +// r file Read file contents (blocked: unsandboxed file read). +// R file Read one line from file (blocked: unsandboxed file read). +// +// Rejected flags: +// +// -i, --in-place Edit files in place (blocked: file write). +// -f, --file Read script from file (not implemented). +// -s, --separate Treat files as separate streams (not implemented). +// -z, --null-data NUL-separated input (not implemented). +// +// Exit codes: +// +// 0 Success (or custom code via q/Q command). +// 1 Invalid script syntax, missing file, or other error. +// +// Memory safety: +// +// Input is processed line-by-line via a buffered scanner with a per-line +// cap of 1 MiB (MaxLineBytes). Pattern space and hold space are each +// bounded to MaxSpaceBytes (1 MiB). Branch loops are capped at +// MaxBranchIterations (10 000) per input line to prevent infinite loops. +// Non-regular-file inputs are subject to a MaxTotalReadBytes (256 MiB) +// limit to guard against infinite sources. +// +// Regex safety: +// +// All regular expressions use Go's regexp package, which implements RE2 +// (guaranteed linear-time matching, no backtracking). This prevents ReDoS +// attacks. BRE patterns are converted to ERE syntax before compilation. 
package sed

import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"regexp"
	"strconv"
	"strings"

	"github.com/DataDog/rshell/interp/builtins"
)

// Cmd is the sed builtin command descriptor.
var Cmd = builtins.Command{Name: "sed", MakeFlags: registerFlags}

// MaxLineBytes is the per-line buffer cap for the line scanner.
const MaxLineBytes = 1 << 20 // 1 MiB

// MaxSpaceBytes is the maximum size for pattern space and hold space.
const MaxSpaceBytes = 1 << 20 // 1 MiB

// MaxBranchIterations is the maximum number of branch iterations per
// input line to prevent infinite loops.
const MaxBranchIterations = 10_000

// MaxTotalReadBytes is the maximum total bytes consumed from a single
// non-regular-file input source.
const MaxTotalReadBytes = 256 << 20 // 256 MiB

// expressionSlice collects multiple -e values. It is registered with the
// flag set through fs.VarP, so every occurrence of -e/--expression results
// in one Set call.
type expressionSlice []string

// String renders the collected expressions joined by newlines.
func (e *expressionSlice) String() string { return strings.Join(*e, "\n") }

// Set appends one more script fragment; it never fails.
func (e *expressionSlice) Set(val string) error {
	*e = append(*e, val)
	return nil
}

// Type reports the value type name for the flag ("string").
// NOTE(review): presumably used by the flag library when printing help — confirm.
func (e *expressionSlice) Type() string { return "string" }

// registerFlags sets up sed flags and returns the handler.
//
// The returned handler parses the script (from -e values, or from the first
// positional argument when no -e was given), runs it over every input file
// ("-" or no file means standard input) and maps the outcome to an exit
// code: the q/Q code when the script quits, 1 on a script error or when any
// input file failed, 0 otherwise.
func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc {
	help := fs.BoolP("help", "h", false, "print usage and exit")
	quiet := fs.BoolP("quiet", "n", false, "suppress automatic printing of pattern space")
	fs.Lookup("quiet").NoOptDefVal = "true"
	// --silent is an alias for --quiet.
	silent := fs.Bool("silent", false, "alias for --quiet")
	fs.Lookup("silent").NoOptDefVal = "true"

	var expressions expressionSlice
	fs.VarP(&expressions, "expression", "e", "add script commands")

	// -E and -r are registered as two separate flags and OR-ed together in
	// the handler; the -r variant is hidden from the generated help.
	extendedE := fs.BoolP("regexp-extended", "E", false, "use extended regular expressions")
	extendedR := fs.BoolP("regexp-extended-r", "r", false, "use extended regular expressions (GNU alias for -E)")
	fs.Lookup("regexp-extended-r").Hidden = true

	return func(ctx context.Context, callCtx *builtins.CallContext, args []string) builtins.Result {
		if *help {
			callCtx.Out("Usage: sed [OPTION]... [script] [FILE]...\n")
			callCtx.Out("Stream editor for filtering and transforming text.\n")
			callCtx.Out("With no FILE, or when FILE is -, read standard input.\n\n")
			fs.SetOutput(callCtx.Stdout)
			fs.PrintDefaults()
			return builtins.Result{}
		}

		suppressPrint := *quiet || *silent
		useERE := *extendedE || *extendedR

		// Determine script and files.
		var scriptParts []string
		var files []string

		if len(expressions) > 0 {
			// With -e, every positional argument is an input file.
			scriptParts = []string(expressions)
			files = args
		} else if len(args) > 0 {
			// Without -e, the first positional argument is the script.
			scriptParts = []string{args[0]}
			files = args[1:]
		} else {
			callCtx.Errf("sed: no script command has been specified\n")
			return builtins.Result{Code: 1}
		}

		// Parse the sed script. Fragments are joined with newlines, so each
		// -e value starts on its own script line.
		prog, err := parseScript(strings.Join(scriptParts, "\n"), useERE)
		if err != nil {
			callCtx.Errf("sed: %s\n", err)
			return builtins.Result{Code: 1}
		}

		if len(files) == 0 {
			files = []string{"-"}
		}

		// Create the execution engine. One engine is shared by all input
		// files, so its state (line counter, hold space, ...) carries over
		// from one file to the next.
		eng := &engine{
			callCtx:       callCtx,
			prog:          prog,
			suppressPrint: suppressPrint,
		}

		var failed bool
		for _, file := range files {
			if ctx.Err() != nil {
				break
			}
			if err := eng.processFile(ctx, callCtx, file); err != nil {
				var qe *quitError
				if errors.As(err, &qe) {
					// q/Q command: stop immediately and exit with the
					// requested code. Any "failed" state recorded for an
					// earlier file is not reflected in that code.
					return builtins.Result{Code: qe.code}
				}
				name := file
				if file == "-" {
					name = "standard input"
				}
				// Report the failure and keep going with the next file.
				callCtx.Errf("sed: %s: %s\n", name, callCtx.PortableErr(err))
				failed = true
			}
		}

		if failed {
			return builtins.Result{Code: 1}
		}
		return builtins.Result{}
	}
}

// --- Error types ---

// quitError signals a q or Q command with an exit code. It travels up the
// call chain as an error and is recognised with errors.As in the handler.
type quitError struct {
	code uint8
}

// Error implements the error interface.
func (e *quitError) Error() string {
	return fmt.Sprintf("quit with code %d", e.code)
}

// --- Address types ---

// addrType distinguishes different address kinds.
type addrType int

const (
	addrNone   addrType = iota
	addrLine            // specific line number
	addrLast            // $ (last line)
	addrRegexp          // /regex/
	addrStep            // first~step (GNU extension)
)

// address represents a sed address (line number, regex, or $).
// Only the fields belonging to the given kind are meaningful.
type address struct {
	kind  addrType
	line  int64          // for addrLine
	re    *regexp.Regexp // for addrRegexp
	first int64          // for addrStep
	step  int64          // for addrStep
}

// --- Command types ---

// cmdType identifies the sed command.
type cmdType int

const (
	cmdSubstitute      cmdType = iota // s
	cmdPrint                          // p
	cmdDelete                         // d
	cmdQuit                           // q
	cmdQuitNoprint                    // Q
	cmdTransliterate                  // y
	cmdAppend                         // a
	cmdInsert                         // i
	cmdChange                         // c
	cmdLineNum                        // =
	cmdPrintUnambig                   // l
	cmdNext                           // n
	cmdNextAppend                     // N
	cmdHoldCopy                       // h
	cmdHoldAppend                     // H
	cmdGetCopy                        // g
	cmdGetAppend                      // G
	cmdExchange                       // x
	cmdBranch                         // b
	cmdLabel                          // :
	cmdBranchIfSub                    // t
	cmdBranchIfNoSub                  // T
	cmdPrintFirstLine                 // P: print up to first embedded newline
	cmdDeleteFirstLine                // D: delete up to first embedded newline, restart cycle
	cmdGroup                          // { ... }
	cmdNoop                           // NOTE(review): not produced by the parser code visible here — confirm usage
)

// sedCmd represents a single parsed sed command.
+type sedCmd struct { + addr1 *address + addr2 *address + negated bool + inRange bool // stateful: tracks whether we're inside a two-address range + kind cmdType + + // For s command: + subRe *regexp.Regexp + subReplacement string + subGlobal bool + subPrint bool + subNth int + + // For y command: + transFrom []rune + transTo []rune + + // For a, i, c commands: + text string + + // For q, Q commands: + quitCode uint8 + + // For b, t, T commands: + label string + + // For { ... } grouping: + children []*sedCmd +} + +// --- Parser --- + +// parser holds state during sed script parsing. +type parser struct { + input string + pos int + useERE bool +} + +func parseScript(script string, useERE bool) ([]*sedCmd, error) { + p := &parser{input: script, useERE: useERE} + cmds, err := p.parseCommands(false) + if err != nil { + return nil, err + } + return cmds, nil +} + +func (p *parser) parseCommands(inGroup bool) ([]*sedCmd, error) { + var cmds []*sedCmd + for p.pos < len(p.input) { + p.skipWhitespaceAndSemicolons() + if p.pos >= len(p.input) { + break + } + ch := p.input[p.pos] + if ch == '}' { + if inGroup { + p.pos++ // consume '}' + return cmds, nil + } + return nil, errors.New("unexpected '}'") + } + if ch == '#' { + // Comment — skip to end of line. 
+ for p.pos < len(p.input) && p.input[p.pos] != '\n' { + p.pos++ + } + continue + } + cmd, err := p.parseOneCommand() + if err != nil { + return nil, err + } + if cmd != nil { + cmds = append(cmds, cmd) + } + } + if inGroup { + return nil, errors.New("unterminated '{'") + } + return cmds, nil +} + +func (p *parser) skipWhitespaceAndSemicolons() { + for p.pos < len(p.input) { + ch := p.input[p.pos] + if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == ';' { + p.pos++ + } else { + break + } + } +} + +func (p *parser) skipSpaces() { + for p.pos < len(p.input) && (p.input[p.pos] == ' ' || p.input[p.pos] == '\t') { + p.pos++ + } +} + +func (p *parser) parseOneCommand() (*sedCmd, error) { + cmd := &sedCmd{} + + // Parse first address. + addr1, err := p.parseAddress() + if err != nil { + return nil, err + } + cmd.addr1 = addr1 + + // Check for comma (address range). + if cmd.addr1 != nil && p.pos < len(p.input) && p.input[p.pos] == ',' { + p.pos++ // consume ',' + p.skipSpaces() + addr2, err := p.parseAddress() + if err != nil { + return nil, err + } + if addr2 == nil { + return nil, errors.New("expected address after ','") + } + cmd.addr2 = addr2 + } + + p.skipSpaces() + + // Check for negation. + if p.pos < len(p.input) && p.input[p.pos] == '!' 
{ + cmd.negated = true + p.pos++ + p.skipSpaces() + } + + if p.pos >= len(p.input) { + return nil, errors.New("missing command") + } + + ch := p.input[p.pos] + p.pos++ + + switch ch { + case 's': + return p.parseSubstitute(cmd) + case 'y': + return p.parseTransliterate(cmd) + case 'p': + cmd.kind = cmdPrint + case 'P': + cmd.kind = cmdPrintFirstLine + case 'd': + cmd.kind = cmdDelete + case 'D': + cmd.kind = cmdDeleteFirstLine + case 'q': + cmd.kind = cmdQuit + cmd.quitCode = p.parseOptionalExitCode() + case 'Q': + cmd.kind = cmdQuitNoprint + cmd.quitCode = p.parseOptionalExitCode() + case 'a': + cmd.kind = cmdAppend + cmd.text = p.parseTextArg() + case 'i': + cmd.kind = cmdInsert + cmd.text = p.parseTextArg() + case 'c': + cmd.kind = cmdChange + cmd.text = p.parseTextArg() + case '=': + cmd.kind = cmdLineNum + case 'l': + cmd.kind = cmdPrintUnambig + case 'n': + cmd.kind = cmdNext + case 'N': + cmd.kind = cmdNextAppend + case 'h': + cmd.kind = cmdHoldCopy + case 'H': + cmd.kind = cmdHoldAppend + case 'g': + cmd.kind = cmdGetCopy + case 'G': + cmd.kind = cmdGetAppend + case 'x': + cmd.kind = cmdExchange + case 'b': + cmd.kind = cmdBranch + cmd.label = p.parseLabelArg() + case 't': + cmd.kind = cmdBranchIfSub + cmd.label = p.parseLabelArg() + case 'T': + cmd.kind = cmdBranchIfNoSub + cmd.label = p.parseLabelArg() + case ':': + cmd.kind = cmdLabel + cmd.label = p.parseLabelArg() + if cmd.label == "" { + return nil, errors.New("missing label name for ':'") + } + case '{': + children, err := p.parseCommands(true) + if err != nil { + return nil, err + } + cmd.kind = cmdGroup + cmd.children = children + case 'e': + return nil, errors.New("'e' command is blocked: command execution is not allowed") + case 'w': + return nil, errors.New("'w' command is blocked: file writing is not allowed") + case 'W': + return nil, errors.New("'W' command is blocked: file writing is not allowed") + case 'r': + return nil, errors.New("'r' command is blocked: unsandboxed file reading is not 
allowed") + case 'R': + return nil, errors.New("'R' command is blocked: unsandboxed file reading is not allowed") + default: + return nil, errors.New("unknown command: '" + string(ch) + "'") + } + + return cmd, nil +} + +func (p *parser) parseOptionalExitCode() uint8 { + p.skipSpaces() + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + if start == p.pos { + return 0 + } + n, err := strconv.Atoi(p.input[start:p.pos]) + if err != nil || n < 0 || n > 255 { + return 0 + } + return uint8(n) +} + +func (p *parser) parseTextArg() string { + // GNU sed allows: a\text, a text, or a\text + if p.pos < len(p.input) && p.input[p.pos] == '\\' { + p.pos++ + if p.pos < len(p.input) && p.input[p.pos] == '\n' { + p.pos++ // consume newline after backslash + } + } else { + p.skipSpaces() + } + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] != '\n' && p.input[p.pos] != ';' { + p.pos++ + } + return p.input[start:p.pos] +} + +func (p *parser) parseLabelArg() string { + p.skipSpaces() + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] != ' ' && p.input[p.pos] != '\t' && + p.input[p.pos] != '\n' && p.input[p.pos] != ';' && p.input[p.pos] != '}' { + p.pos++ + } + return p.input[start:p.pos] +} + +func (p *parser) parseAddress() (*address, error) { + if p.pos >= len(p.input) { + return nil, nil + } + + ch := p.input[p.pos] + + // Line number. + if ch >= '0' && ch <= '9' { + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + // Check for first~step syntax. 
+ if p.pos < len(p.input) && p.input[p.pos] == '~' { + first, err := strconv.ParseInt(p.input[start:p.pos], 10, 64) + if err != nil { + return nil, errors.New("invalid address: " + p.input[start:p.pos]) + } + p.pos++ // consume '~' + stepStart := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + step, err := strconv.ParseInt(p.input[stepStart:p.pos], 10, 64) + if err != nil || step <= 0 { + return nil, errors.New("invalid step in address") + } + return &address{kind: addrStep, first: first, step: step}, nil + } + n, err := strconv.ParseInt(p.input[start:p.pos], 10, 64) + if err != nil { + return nil, errors.New("invalid line number: " + p.input[start:p.pos]) + } + return &address{kind: addrLine, line: n}, nil + } + + // Last line. + if ch == '$' { + p.pos++ + return &address{kind: addrLast}, nil + } + + // Regex address. + if ch == '/' || ch == '\\' { + var delim byte + if ch == '\\' { + p.pos++ // consume '\' + if p.pos >= len(p.input) { + return nil, errors.New("expected delimiter after '\\'") + } + delim = p.input[p.pos] + } else { + delim = '/' + } + p.pos++ // consume delimiter + pattern, err := p.readUntilDelimiter(delim) + if err != nil { + return nil, err + } + re, err := p.compileRegex(pattern) + if err != nil { + return nil, err + } + return &address{kind: addrRegexp, re: re}, nil + } + + return nil, nil +} + +func (p *parser) readUntilDelimiter(delim byte) (string, error) { + var sb strings.Builder + for p.pos < len(p.input) { + ch := p.input[p.pos] + if ch == '\\' && p.pos+1 < len(p.input) { + next := p.input[p.pos+1] + if next == delim { + sb.WriteByte(delim) + p.pos += 2 + continue + } + sb.WriteByte('\\') + sb.WriteByte(next) + p.pos += 2 + continue + } + if ch == delim { + p.pos++ // consume closing delimiter + return sb.String(), nil + } + sb.WriteByte(ch) + p.pos++ + } + return "", errors.New("unterminated address regex") +} + +func (p *parser) parseSubstitute(cmd *sedCmd) (*sedCmd, error) { + if 
p.pos >= len(p.input) { + return nil, errors.New("missing delimiter for 's' command") + } + delim := p.input[p.pos] + if delim == '\\' || delim == '\n' { + return nil, errors.New("invalid delimiter for 's' command: '" + string(delim) + "'") + } + p.pos++ // consume delimiter + + // Read pattern. + pattern, err := p.readSubstPart(delim) + if err != nil { + return nil, errors.New("unterminated 's' command: " + err.Error()) + } + + // Read replacement. + replacement, err := p.readSubstPart(delim) + if err != nil { + return nil, errors.New("unterminated 's' command: " + err.Error()) + } + + // Read flags. + cmd.kind = cmdSubstitute + cmd.subReplacement = replacement + caseInsensitive := false + + for p.pos < len(p.input) { + ch := p.input[p.pos] + switch ch { + case 'g': + cmd.subGlobal = true + p.pos++ + case 'p': + cmd.subPrint = true + p.pos++ + case 'i', 'I': + caseInsensitive = true + p.pos++ + case 'w': + return nil, errors.New("'w' flag in 's' command is blocked: file writing is not allowed") + case 'e': + return nil, errors.New("'e' flag in 's' command is blocked: command execution is not allowed") + default: + if ch >= '1' && ch <= '9' { + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + n, err := strconv.Atoi(p.input[start:p.pos]) + if err != nil || n <= 0 { + return nil, errors.New("invalid substitution occurrence number") + } + cmd.subNth = n + continue + } + // Any other character ends the flag list. + goto flagsDone + } + } +flagsDone: + + re, err := p.compileRegex(pattern) + if err != nil { + return nil, err + } + // Apply case-insensitive flag after BRE-to-ERE conversion so (?i) isn't mangled. 
+ if caseInsensitive { + re, err = regexp.Compile("(?i)" + re.String()) + if err != nil { + return nil, errors.New("invalid regex with case-insensitive flag: " + err.Error()) + } + } + cmd.subRe = re + return cmd, nil +} + +func (p *parser) readSubstPart(delim byte) (string, error) { + var sb strings.Builder + for p.pos < len(p.input) { + ch := p.input[p.pos] + if ch == '\\' && p.pos+1 < len(p.input) { + next := p.input[p.pos+1] + if next == delim { + sb.WriteByte(delim) + p.pos += 2 + continue + } + if next == 'n' { + sb.WriteByte('\n') + p.pos += 2 + continue + } + if next == 't' { + sb.WriteByte('\t') + p.pos += 2 + continue + } + sb.WriteByte('\\') + sb.WriteByte(next) + p.pos += 2 + continue + } + if ch == delim { + p.pos++ // consume closing delimiter + return sb.String(), nil + } + sb.WriteByte(ch) + p.pos++ + } + return sb.String(), nil +} + +func (p *parser) parseTransliterate(cmd *sedCmd) (*sedCmd, error) { + if p.pos >= len(p.input) { + return nil, errors.New("missing delimiter for 'y' command") + } + delim := p.input[p.pos] + p.pos++ + + srcStr, err := p.readSubstPart(delim) + if err != nil { + return nil, err + } + dstStr, err := p.readSubstPart(delim) + if err != nil { + return nil, err + } + + src := []rune(srcStr) + dst := []rune(dstStr) + if len(src) != len(dst) { + return nil, errors.New("'y' command: source and destination must have the same length") + } + + cmd.kind = cmdTransliterate + cmd.transFrom = src + cmd.transTo = dst + return cmd, nil +} + +// compileRegex compiles a regex pattern, converting BRE to ERE if needed. +func (p *parser) compileRegex(pattern string) (*regexp.Regexp, error) { + if !p.useERE { + pattern = breToERE(pattern) + } + re, err := regexp.Compile(pattern) + if err != nil { + return nil, errors.New("invalid regex: " + err.Error()) + } + return re, nil +} + +// breToERE converts a basic regular expression to an extended one. +// In BRE: \( \) \{ \} \+ \? are special; ( ) { } + ? are literal. +// In ERE: ( ) { } + ? 
are special; \( \) etc. are literal. +func breToERE(pattern string) string { + var sb strings.Builder + sb.Grow(len(pattern)) + i := 0 + for i < len(pattern) { + if pattern[i] == '\\' && i+1 < len(pattern) { + next := pattern[i+1] + switch next { + case '(', ')', '{', '}', '+', '?', '|': + // BRE escaped special → ERE unescaped special. + sb.WriteByte(next) + i += 2 + default: + // Includes backreferences (\1-\9) which RE2 doesn't support + // but are passed through unchanged. + sb.WriteByte('\\') + sb.WriteByte(next) + i += 2 + } + } else { + ch := pattern[i] + switch ch { + case '(', ')', '{', '}', '+', '?', '|': + // In BRE these are literal; escape them for ERE. + sb.WriteByte('\\') + sb.WriteByte(ch) + default: + sb.WriteByte(ch) + } + i++ + } + } + return sb.String() +} + +// --- Execution Engine --- + +// engine holds the state for executing a sed script. +type engine struct { + callCtx *builtins.CallContext + prog []*sedCmd + suppressPrint bool + lineNum int64 + lastLine bool + patternSpace string + holdSpace string + appendQueue []string // text queued by 'a' command, flushed after auto-print + subMade bool // set when s/// succeeds (cleared on new input line) + totalRead int64 + isRegularFile bool +} + +// lineReader wraps a scanner with one-line look-ahead so we can determine +// whether the current line is the last one, while still allowing n/N commands +// to consume lines from the same scanner. 
+type lineReader struct { + sc *bufio.Scanner + nextLine string + hasNext bool + totalRead int64 + isRegularFile bool +} + +func newLineReader(sc *bufio.Scanner, isRegular bool) *lineReader { + lr := &lineReader{sc: sc, isRegularFile: isRegular} + lr.advance() // prime the look-ahead + return lr +} + +func (lr *lineReader) advance() bool { + if lr.sc.Scan() { + lr.nextLine = lr.sc.Text() + lr.totalRead += int64(len(lr.sc.Bytes())) + lr.hasNext = true + return true + } + lr.hasNext = false + return false +} + +func (lr *lineReader) readLine() (string, bool) { + if !lr.hasNext { + return "", false + } + line := lr.nextLine + lr.advance() + return line, true +} + +func (lr *lineReader) isLast() bool { + return !lr.hasNext +} + +func (lr *lineReader) checkLimit() error { + if !lr.isRegularFile && lr.totalRead > MaxTotalReadBytes { + return errors.New("input too large: read limit exceeded") + } + return nil +} + +func (eng *engine) processFile(ctx context.Context, callCtx *builtins.CallContext, file string) error { + var rc io.ReadCloser + if file == "-" { + if callCtx.Stdin == nil { + return nil + } + eng.isRegularFile = isRegularFile(callCtx.Stdin) + rc = io.NopCloser(callCtx.Stdin) + } else { + f, err := callCtx.OpenFile(ctx, file, os.O_RDONLY, 0) + if err != nil { + return err + } + defer f.Close() + eng.isRegularFile = isRegularFile(f) + rc = f + } + + sc := bufio.NewScanner(rc) + buf := make([]byte, 4096) + sc.Buffer(buf, MaxLineBytes) + + lr := newLineReader(sc, eng.isRegularFile) + + for { + if ctx.Err() != nil { + return ctx.Err() + } + + line, ok := lr.readLine() + if !ok { + break + } + if err := lr.checkLimit(); err != nil { + return err + } + + eng.lineNum++ + eng.patternSpace = line + eng.lastLine = lr.isLast() + + err := eng.runCycle(ctx, lr) + if err != nil { + return err + } + } + + if err := sc.Err(); err != nil { + return err + } + return nil +} + +// runCycle executes the script for the current input line. 
+func (eng *engine) runCycle(ctx context.Context, lr *lineReader) error { + eng.subMade = false + eng.appendQueue = eng.appendQueue[:0] + action, err := eng.execCommandsFrom(ctx, 0, lr, 0) + if err != nil { + return err + } + if action != actionDelete && !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + // Flush queued 'a' text after auto-print (even if auto-print was suppressed or deleted). + for _, text := range eng.appendQueue { + eng.callCtx.Outf("%s\n", text) + } + return nil +} + +// actionType signals how to proceed after executing a command. +type actionType int + +const ( + actionContinue actionType = iota + actionDelete // d/D command: skip auto-print, start next cycle +) + +// execCommandsFrom executes commands starting from index startIdx in the given +// command list. For branching, it always searches the full eng.prog for labels +// and restarts from there to handle backward branches correctly. +func (eng *engine) execCommandsFrom(ctx context.Context, startIdx int, lr *lineReader, depth int) (actionType, error) { + return eng.execCmds(ctx, eng.prog, startIdx, lr, depth) +} + +func (eng *engine) execCmds(ctx context.Context, cmds []*sedCmd, startIdx int, lr *lineReader, depth int) (actionType, error) { + if depth > MaxBranchIterations { + return actionContinue, errors.New("branch loop limit exceeded") + } + + for i := startIdx; i < len(cmds); i++ { + if ctx.Err() != nil { + return actionContinue, ctx.Err() + } + + cmd := cmds[i] + + if cmd.kind == cmdLabel { + continue + } + + if !eng.addressMatch(cmd) { + continue + } + + switch cmd.kind { + case cmdSubstitute: + if err := eng.execSubstitute(cmd); err != nil { + return actionContinue, err + } + + case cmdPrint: + eng.callCtx.Outf("%s\n", eng.patternSpace) + + case cmdDelete: + return actionDelete, nil + + case cmdPrintFirstLine: + if idx := strings.IndexByte(eng.patternSpace, '\n'); idx >= 0 { + eng.callCtx.Outf("%s\n", eng.patternSpace[:idx]) + } else { + 
eng.callCtx.Outf("%s\n", eng.patternSpace) + } + + case cmdDeleteFirstLine: + if idx := strings.IndexByte(eng.patternSpace, '\n'); idx >= 0 { + eng.patternSpace = eng.patternSpace[idx+1:] + // Restart the cycle with the remaining pattern space. + eng.subMade = false + eng.appendQueue = eng.appendQueue[:0] + return eng.execCommandsFrom(ctx, 0, lr, depth+1) + } + return actionDelete, nil + + case cmdQuit: + if !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + return actionContinue, &quitError{code: cmd.quitCode} + + case cmdQuitNoprint: + return actionContinue, &quitError{code: cmd.quitCode} + + case cmdTransliterate: + eng.patternSpace = eng.transliterate(eng.patternSpace, cmd.transFrom, cmd.transTo) + + case cmdAppend: + eng.appendQueue = append(eng.appendQueue, cmd.text) + + case cmdInsert: + eng.callCtx.Outf("%s\n", cmd.text) + + case cmdChange: + eng.callCtx.Outf("%s\n", cmd.text) + return actionDelete, nil + + case cmdLineNum: + eng.callCtx.Outf("%d\n", eng.lineNum) + + case cmdPrintUnambig: + eng.printUnambiguous() + + case cmdNext: + if !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + for _, text := range eng.appendQueue { + eng.callCtx.Outf("%s\n", text) + } + eng.appendQueue = eng.appendQueue[:0] + line, ok := lr.readLine() + if ok { + if err := lr.checkLimit(); err != nil { + return actionContinue, err + } + eng.lineNum++ + eng.patternSpace = line + eng.lastLine = lr.isLast() + } else { + eng.lastLine = true + return actionContinue, nil + } + + case cmdNextAppend: + line, ok := lr.readLine() + if ok { + if err := lr.checkLimit(); err != nil { + return actionContinue, err + } + eng.lineNum++ + if len(eng.patternSpace)+1+len(line) > MaxSpaceBytes { + return actionContinue, errors.New("pattern space exceeded size limit") + } + eng.patternSpace += "\n" + line + eng.lastLine = lr.isLast() + } else { + if !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + return actionDelete, nil + } + + case 
cmdHoldCopy: + eng.holdSpace = eng.patternSpace + + case cmdHoldAppend: + if len(eng.holdSpace)+1+len(eng.patternSpace) > MaxSpaceBytes { + return actionContinue, errors.New("hold space exceeded size limit") + } + eng.holdSpace += "\n" + eng.patternSpace + + case cmdGetCopy: + eng.patternSpace = eng.holdSpace + + case cmdGetAppend: + if len(eng.patternSpace)+1+len(eng.holdSpace) > MaxSpaceBytes { + return actionContinue, errors.New("pattern space exceeded size limit") + } + eng.patternSpace += "\n" + eng.holdSpace + + case cmdExchange: + eng.patternSpace, eng.holdSpace = eng.holdSpace, eng.patternSpace + + case cmdBranch: + return eng.branchTo(ctx, cmd.label, lr, depth) + + case cmdBranchIfSub: + if eng.subMade { + eng.subMade = false + return eng.branchTo(ctx, cmd.label, lr, depth) + } + + case cmdBranchIfNoSub: + if !eng.subMade { + return eng.branchTo(ctx, cmd.label, lr, depth) + } + + case cmdGroup: + action, err := eng.execCmds(ctx, cmd.children, 0, lr, depth) + if err != nil || action != actionContinue { + return action, err + } + + case cmdNoop, cmdLabel: + // Do nothing. + } + } + + return actionContinue, nil +} + +func findLabel(cmds []*sedCmd, label string) int { + for i, cmd := range cmds { + if cmd.kind == cmdLabel && cmd.label == label { + return i + } + if cmd.kind == cmdGroup { + // Labels inside groups are visible from the top level in GNU sed. + if idx := findLabel(cmd.children, label); idx >= 0 { + // Return the group's index since we can't index into children from here. + return i + } + } + } + return -1 +} + +// branchTo resolves a label and continues execution from the command after it. +// An empty label branches to end of script (returns actionContinue). 
+func (eng *engine) branchTo(ctx context.Context, label string, lr *lineReader, depth int) (actionType, error) { + if label == "" { + return actionContinue, nil + } + target := findLabel(eng.prog, label) + if target < 0 { + return actionContinue, errors.New("undefined label '" + label + "'") + } + return eng.execCmds(ctx, eng.prog, target+1, lr, depth+1) +} + +// addressMatch checks whether the current line matches the command's address. +func (eng *engine) addressMatch(cmd *sedCmd) bool { + match := eng.rawAddressMatch(cmd) + if cmd.negated { + return !match + } + return match +} + +func (eng *engine) rawAddressMatch(cmd *sedCmd) bool { + if cmd.addr1 == nil { + return true // no address means match all + } + + if cmd.addr2 == nil { + // Single address. + return eng.matchAddr(cmd.addr1) + } + + // Two-address range: match from addr1 to addr2 inclusive. + // We use a simple approach: check if current line is >= addr1 and <= addr2. + // For regex addresses, this is more complex. We use a stateful approach + // via the command itself to track whether we're inside the range. + return eng.matchRange(cmd) +} + +func (eng *engine) matchAddr(addr *address) bool { + switch addr.kind { + case addrLine: + return eng.lineNum == addr.line + case addrLast: + return eng.lastLine + case addrRegexp: + return addr.re.MatchString(eng.patternSpace) + case addrStep: + if addr.first == 0 { + return eng.lineNum%addr.step == 0 + } + return eng.lineNum >= addr.first && (eng.lineNum-addr.first)%addr.step == 0 + } + return false +} + +func (eng *engine) matchRange(cmd *sedCmd) bool { + if cmd.inRange { + // We're inside the range. Check if addr2 closes it. + if eng.matchAddr(cmd.addr2) { + cmd.inRange = false + return true // addr2 line is still part of the range + } + return true + } + // Not in range — check if addr1 opens it. + if eng.matchAddr(cmd.addr1) { + // Check if addr2 also matches on the same line (degenerate range). 
+ if eng.matchAddr(cmd.addr2) { + return true // one-line range, don't enter inRange state + } + cmd.inRange = true + return true + } + return false +} + +func (eng *engine) execSubstitute(cmd *sedCmd) error { + var result string + if cmd.subGlobal { + result = cmd.subRe.ReplaceAllString(eng.patternSpace, expandReplacement(cmd.subReplacement)) + } else if cmd.subNth > 0 { + count := 0 + result = cmd.subRe.ReplaceAllStringFunc(eng.patternSpace, func(match string) string { + count++ + if count == cmd.subNth { + return cmd.subRe.ReplaceAllString(match, expandReplacement(cmd.subReplacement)) + } + return match + }) + } else { + loc := cmd.subRe.FindStringIndex(eng.patternSpace) + if loc != nil { + matched := eng.patternSpace[loc[0]:loc[1]] + replacement := cmd.subRe.ReplaceAllString(matched, expandReplacement(cmd.subReplacement)) + result = eng.patternSpace[:loc[0]] + replacement + eng.patternSpace[loc[1]:] + } else { + return nil + } + } + if result != eng.patternSpace { + if len(result) > MaxSpaceBytes { + return errors.New("pattern space exceeded size limit") + } + eng.subMade = true + eng.patternSpace = result + if cmd.subPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + } + return nil +} + +// expandReplacement converts sed replacement syntax to Go regexp replacement. +// In sed, & means the whole match. In Go regexp, that's ${0} or $0. +// Sed uses \1-\9 for groups, Go uses $1-$9. 
+func expandReplacement(repl string) string { + var sb strings.Builder + sb.Grow(len(repl)) + for i := 0; i < len(repl); i++ { + ch := repl[i] + if ch == '&' { + sb.WriteString("${0}") + } else if ch == '\\' && i+1 < len(repl) { + next := repl[i+1] + if next >= '1' && next <= '9' { + sb.WriteByte('$') + sb.WriteByte(next) + i++ + } else if next == '&' { + sb.WriteByte('&') + i++ + } else if next == '\\' { + sb.WriteByte('\\') + i++ + } else if next == 'n' { + sb.WriteByte('\n') + i++ + } else if next == 't' { + sb.WriteByte('\t') + i++ + } else { + sb.WriteByte('\\') + sb.WriteByte(next) + i++ + } + } else { + sb.WriteByte(ch) + } + } + return sb.String() +} + +func (eng *engine) transliterate(s string, from, to []rune) string { + runes := []rune(s) + for i, r := range runes { + for j, fr := range from { + if r == fr { + runes[i] = to[j] + break + } + } + } + return string(runes) +} + +func (eng *engine) printUnambiguous() { + // l command: print pattern space showing non-printing characters. + var sb strings.Builder + col := 0 + for _, r := range eng.patternSpace { + var s string + switch { + case r == '\\': + s = "\\\\" + case r == '\a': + s = "\\a" + case r == '\b': + s = "\\b" + case r == '\f': + s = "\\f" + case r == '\r': + s = "\\r" + case r == '\t': + s = "\\t" + case r == '\n': + s = "\\n" + case r < 32 || r == 127: + s = fmt.Sprintf("\\%03o", r) + default: + s = string(r) + } + if col+len(s) >= 70 { + sb.WriteString("\\\n") + col = 0 + } + sb.WriteString(s) + col += len(s) + } + sb.WriteByte('$') + sb.WriteByte('\n') + eng.callCtx.Out(sb.String()) +} + +// isRegularFile checks whether an io.Reader is backed by a regular file. 
+func isRegularFile(r any) bool { + type stater interface{ Stat() (os.FileInfo, error) } + sf, ok := r.(stater) + if !ok { + return false + } + fi, err := sf.Stat() + return err == nil && fi.Mode().IsRegular() +} diff --git a/interp/builtins/tests/sed/sed_hardening_test.go b/interp/builtins/tests/sed/sed_hardening_test.go new file mode 100644 index 00000000..d152a2a4 --- /dev/null +++ b/interp/builtins/tests/sed/sed_hardening_test.go @@ -0,0 +1,235 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package sed_test + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/DataDog/rshell/interp" + "github.com/stretchr/testify/assert" +) + +// --- Memory Safety & Resource Limits --- + +func TestHardenLongLine(t *testing.T) { + dir := setupDir(t, map[string]string{ + "long.txt": strings.Repeat("x", 512*1024) + "\n", + }) + stdout, _, code := cmdRun(t, `sed 's/x/y/' long.txt`, dir) + assert.Equal(t, 0, code) + // First 'x' replaced with 'y', rest unchanged. + assert.True(t, strings.HasPrefix(stdout, "y")) +} + +func TestHardenPatternSpaceLimit(t *testing.T) { + // Use N command to accumulate lines until pattern space limit is hit. + var sb strings.Builder + for i := 0; i < 2000; i++ { + sb.WriteString(strings.Repeat("a", 600)) + sb.WriteByte('\n') + } + dir := setupDir(t, map[string]string{ + "big.txt": sb.String(), + }) + _, stderr, code := cmdRun(t, `sed ':a;N;ba' big.txt`, dir) + // Should fail with pattern space limit error. 
+ assert.Equal(t, 1, code) + assert.Contains(t, stderr, "pattern space exceeded size limit") +} + +func TestHardenHoldSpaceLimit(t *testing.T) { + var sb strings.Builder + for i := 0; i < 2000; i++ { + sb.WriteString(strings.Repeat("b", 600)) + sb.WriteByte('\n') + } + dir := setupDir(t, map[string]string{ + "big.txt": sb.String(), + }) + _, stderr, code := cmdRun(t, `sed 'H' big.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "hold space exceeded size limit") +} + +func TestHardenBranchLoopLimit(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "test\n", + }) + _, stderr, code := cmdRun(t, `sed ':loop;b loop' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "branch loop limit exceeded") +} + +// --- Context Cancellation --- + +func TestHardenContextCancellation(t *testing.T) { + // Create a large file that would take a while to process. + dir := setupDir(t, map[string]string{ + "big.txt": strings.Repeat("line\n", 100000), + }) + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + _, _, code := runScriptCtx(ctx, t, `sed 's/line/LINE/g' big.txt`, dir) + // Should either complete or be cancelled — both are acceptable. 
+ _ = code +} + +// --- Blocked Commands --- + +func TestHardenBlockedExecuteCommand(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'e' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestHardenBlockedWriteCommand(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'w /tmp/evil' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestHardenBlockedReadCommand(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'r /etc/passwd' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestHardenBlockedBigRCommand(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'R /etc/passwd' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestHardenBlockedBigWCommand(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'W /tmp/evil' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestHardenBlockedSubstituteWriteFlag(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 's/t/T/w /tmp/evil' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestHardenBlockedSubstituteExecuteFlag(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 's/t/T/e' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +// --- Input Validation --- + +func TestHardenInvalidRegex(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 's/[invalid/x/' f.txt`, dir) + assert.Equal(t, 
1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestHardenEmptyScript(t *testing.T) { + _, stderr, code := cmdRun(t, `sed '' /dev/null`, "") + // Empty script is valid — matches all lines with no commands. + _ = stderr + _ = code +} + +func TestHardenNoScript(t *testing.T) { + _, stderr, code := cmdRun(t, `sed`, "") + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestHardenUnterminatedSubstitution(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 's/foo' f.txt`, dir) + // Unterminated s command — the parser may accept the last delimiter as optional. + // Just make sure it doesn't crash. + _ = stderr + _ = code +} + +func TestHardenUnterminatedGroup(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed '{p' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "unterminated") +} + +func TestHardenUnmatchedCloseBrace(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed '}' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "unexpected '}'") +} + +// --- Multiple Files --- + +func TestHardenMultipleFiles(t *testing.T) { + dir := setupDir(t, map[string]string{ + "a.txt": "alpha\n", + "b.txt": "beta\n", + "c.txt": "gamma\n", + }) + stdout, _, code := cmdRun(t, `sed 's/a/A/g' a.txt b.txt c.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "AlphA\nbetA\ngAmmA\n", stdout) +} + +func TestHardenMissingFileContinues(t *testing.T) { + dir := setupDir(t, map[string]string{ + "a.txt": "alpha\n", + }) + stdout, stderr, code := cmdRun(t, `sed 's/a/A/' a.txt nonexistent.txt`, dir) + assert.Equal(t, 1, code) + assert.Equal(t, "Alpha\n", stdout) + assert.Contains(t, stderr, "nonexistent.txt") +} + +// --- Regex Safety --- + +func TestHardenRegexComplexPattern(t *testing.T) { + // RE2 guarantees linear time, so complex patterns should not 
cause ReDoS. + dir := setupDir(t, map[string]string{ + "f.txt": strings.Repeat("a", 100) + "\n", + }) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, code := runScriptCtx(ctx, t, `sed -E 's/(a+)+b/x/' f.txt`, dir, interp.AllowedPaths([]string{dir})) + // Should complete without timeout (RE2 handles this in linear time). + assert.Equal(t, 0, code) +} + +// --- Y command edge cases --- + +func TestHardenTransliterateMismatch(t *testing.T) { + dir := setupDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'y/abc/de/' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "same length") +} + +// --- Comments --- + +func TestHardenComments(t *testing.T) { + dir := setupDir(t, map[string]string{ + "f.txt": "hello\n", + }) + stdout, _, code := cmdRun(t, `sed '#this is a comment +s/hello/world/' f.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "world\n", stdout) +} diff --git a/interp/builtins/tests/sed/sed_pentest_test.go b/interp/builtins/tests/sed/sed_pentest_test.go new file mode 100644 index 00000000..b09ad0b0 --- /dev/null +++ b/interp/builtins/tests/sed/sed_pentest_test.go @@ -0,0 +1,366 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +package sed_test + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/DataDog/rshell/interp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const pentestTimeout = 10 * time.Second + +func pentestDir(t *testing.T, files map[string]string) string { + t.Helper() + dir := t.TempDir() + for name, content := range files { + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) + } + return dir +} + +// --- Flag and argument injection --- + +func TestPentestUnknownFlags(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + for _, flag := range []string{"-f", "--follow", "--no-such-flag", "-z", "-s"} { + t.Run(flag, func(t *testing.T) { + _, stderr, code := cmdRun(t, "sed "+flag+" 's/a/b/' f.txt", dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") + }) + } +} + +func TestPentestRejectedInPlaceFlag(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "hello\n"}) + _, stderr, code := cmdRun(t, `sed -i 's/hello/bye/' f.txt`, dir) + assert.NotEqual(t, 0, code) + assert.Contains(t, stderr, "sed:") + // Verify file was NOT modified. + data, err := os.ReadFile(filepath.Join(dir, "f.txt")) + require.NoError(t, err) + assert.Equal(t, "hello\n", string(data)) +} + +func TestPentestDoubleDashEndOfFlags(t *testing.T) { + dir := pentestDir(t, map[string]string{"-n": "hello\n"}) + // -- should allow flag-like filenames. + stdout, _, code := runScript(t, `sed 's/hello/bye/' -- -n`, dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) + assert.Equal(t, "bye\n", stdout) +} + +func TestPentestFlagViaWordExpansion(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + // Flag injection via variable. 
+ _, stderr, code := cmdRun(t, `flag="-f"; sed $flag 's/a/b/' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +// --- Blocked commands --- + +func TestPentestAllBlockedCommands(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + blocked := []struct { + name, script string + }{ + {"e-command", `sed 'e' f.txt`}, + {"w-command", `sed 'w /tmp/evil' f.txt`}, + {"W-command", `sed 'W /tmp/evil' f.txt`}, + {"r-command", `sed 'r /etc/passwd' f.txt`}, + {"R-command", `sed 'R /etc/passwd' f.txt`}, + {"s-w-flag", `sed 's/t/T/w /tmp/evil' f.txt`}, + {"s-e-flag", `sed 's/t/T/e' f.txt`}, + } + for _, tc := range blocked { + t.Run(tc.name, func(t *testing.T) { + _, stderr, code := cmdRun(t, tc.script, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") + }) + } +} + +// --- Memory / resource exhaustion --- + +func TestPentestLongLineNearLimit(t *testing.T) { + // Line of MaxLineBytes - 1 bytes should succeed. + const limit = 1 << 20 // 1 MiB + dir := pentestDir(t, map[string]string{ + "near.txt": strings.Repeat("x", limit-1) + "\n", + }) + stdout, _, code := cmdRun(t, `sed 's/x/y/' near.txt`, dir) + assert.Equal(t, 0, code) + assert.True(t, strings.HasPrefix(stdout, "y")) +} + +func TestPentestPatternSpaceExhaustion(t *testing.T) { + // Try to exceed MaxSpaceBytes via N command. 
+ var sb strings.Builder + for i := 0; i < 2000; i++ { + sb.WriteString(strings.Repeat("a", 600)) + sb.WriteByte('\n') + } + dir := pentestDir(t, map[string]string{"big.txt": sb.String()}) + _, stderr, code := cmdRun(t, `sed ':a;N;ba' big.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "pattern space exceeded size limit") +} + +func TestPentestHoldSpaceExhaustion(t *testing.T) { + var sb strings.Builder + for i := 0; i < 2000; i++ { + sb.WriteString(strings.Repeat("b", 600)) + sb.WriteByte('\n') + } + dir := pentestDir(t, map[string]string{"big.txt": sb.String()}) + _, stderr, code := cmdRun(t, `sed 'H' big.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "hold space exceeded size limit") +} + +func TestPentestBranchLoopLimit(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + ctx, cancel := context.WithTimeout(context.Background(), pentestTimeout) + defer cancel() + _, stderr, code := runScriptCtx(ctx, t, `sed ':loop;b loop' f.txt`, dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "branch loop limit exceeded") +} + +func TestPentestSubstitutionGrowsPatternSpace(t *testing.T) { + // s///g with a replacement much larger than the match can grow pattern space. + dir := pentestDir(t, map[string]string{ + "f.txt": strings.Repeat("x", 100000) + "\n", + }) + // Replace each 'x' with 20 chars — would grow to 2MB, exceeding MaxSpaceBytes (1MB). + _, stderr, code := cmdRun(t, `sed 's/x/xxxxxxxxxxxxxxxxxxxx/g' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "pattern space exceeded size limit") +} + +func TestPentestSmallFileWithLargeNthSub(t *testing.T) { + // Substitution with very high N should not cause issues. + dir := pentestDir(t, map[string]string{"f.txt": "aaa\n"}) + stdout, _, code := cmdRun(t, `sed 's/a/X/999' f.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "aaa\n", stdout) // No 999th occurrence, no change. 
+} + +// --- Special files --- + +func TestPentestDevNull(t *testing.T) { + stdout, _, code := runScript(t, `sed 's/a/b/' /dev/null`, "", interp.AllowedPaths([]string{"/dev"})) + assert.Equal(t, 0, code) + assert.Equal(t, "", stdout) +} + +func TestPentestContextCancelledDuringProcessing(t *testing.T) { + dir := pentestDir(t, map[string]string{ + "big.txt": strings.Repeat("line\n", 100000), + }) + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + _, _, code := runScriptCtx(ctx, t, `sed 's/line/LINE/g' big.txt`, dir, interp.AllowedPaths([]string{dir})) + // Should either complete or be cancelled — both are acceptable. + _ = code +} + +// --- Path and filename edge cases --- + +func TestPentestNonExistentFile(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, `sed 's/a/b/' nonexistent.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "nonexistent.txt") +} + +func TestPentestDirectoryAsFile(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.Mkdir(filepath.Join(dir, "subdir"), 0755)) + _, stderr, code := cmdRun(t, `sed 's/a/b/' subdir`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestPentestEmptyStringFilename(t *testing.T) { + _, stderr, code := cmdRun(t, `sed 's/a/b/' ""`, "") + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestPentestMultipleStdinArgs(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "hello\n"}) + // Multiple - args: stdin should only be consumed once. + stdout, _, code := cmdRun(t, `echo hello | sed 's/hello/bye/' - -`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "bye\n", stdout) +} + +func TestPentestManyFiles(t *testing.T) { + // Verify no FD leak with many files. 
+ files := make(map[string]string) + var names []string + for i := 0; i < 50; i++ { + name := "f" + strings.Repeat("0", 3-len(strings.TrimLeft(string(rune('0'+i%10)), ""))) + ".txt" + name = filepath.Base(name) + actualName := "file_" + string(rune('a'+i%26)) + string(rune('a'+i/26)) + ".txt" + files[actualName] = "line\n" + names = append(names, actualName) + } + dir := pentestDir(t, files) + script := "sed 's/line/LINE/' " + strings.Join(names, " ") + stdout, _, code := cmdRun(t, script, dir) + assert.Equal(t, 0, code) + assert.Equal(t, strings.Repeat("LINE\n", len(names)), stdout) +} + +// --- Input validation --- + +func TestPentestInvalidRegexPatterns(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + patterns := []string{ + `sed 's/[/x/' f.txt`, // Unterminated character class + `sed '/[/d' f.txt`, // Invalid address regex + `sed -E 's/(/x/' f.txt`, // Unmatched paren in ERE + } + for _, script := range patterns { + t.Run(script, func(t *testing.T) { + _, stderr, code := cmdRun(t, script, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") + }) + } +} + +func TestPentestEmptyScript(t *testing.T) { + // Empty script is valid in GNU sed. + _, _, code := cmdRun(t, `sed '' /dev/null`, "") + _ = code // just don't crash +} + +func TestPentestNoScriptNoFiles(t *testing.T) { + _, stderr, code := cmdRun(t, `sed`, "") + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestPentestUnterminatedSubstitution(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + _, _, code := cmdRun(t, `sed 's/foo' f.txt`, dir) + // Just don't crash. + _ = code +} + +func TestPentestUnterminatedGroup(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed '{p' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "unterminated") +} + +func TestPentestDeeplyNestedGroups(t *testing.T) { + // Many nested groups should not crash. 
+ dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + script := strings.Repeat("{", 50) + "p" + strings.Repeat("}", 50) + stdout, _, code := cmdRun(t, `sed '`+script+`' f.txt`, dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "test") +} + +// --- Regex safety (ReDoS) --- + +func TestPentestReDoSPattern(t *testing.T) { + // RE2 should handle this in linear time. + dir := pentestDir(t, map[string]string{ + "f.txt": strings.Repeat("a", 100) + "\n", + }) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, code := runScriptCtx(ctx, t, `sed -E 's/(a+)+b/x/' f.txt`, dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) +} + +// --- P and D commands --- + +func TestPentestPCommandMultiline(t *testing.T) { + dir := pentestDir(t, map[string]string{ + "f.txt": "line1\nline2\n", + }) + stdout, _, code := cmdRun(t, `sed -n 'N;P' f.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\n", stdout) +} + +func TestPentestDCommandMultiline(t *testing.T) { + dir := pentestDir(t, map[string]string{ + "f.txt": "a\nb\nc\n", + }) + // D deletes first line of pattern space and restarts cycle. + // N appends next line, D removes the first. 
+ stdout, _, code := cmdRun(t, `sed 'N;P;D' f.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "a\nb\nc\n", stdout) +} + +// --- Quit commands with exit codes --- + +func TestPentestQuitWithCode(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "line1\nline2\nline3\n"}) + _, _, code := cmdRun(t, `sed 'q 42' f.txt`, dir) + assert.Equal(t, 42, code) +} + +func TestPentestQuitNoPrintWithCode(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "line1\nline2\nline3\n"}) + stdout, _, code := cmdRun(t, `sed 'Q 7' f.txt`, dir) + assert.Equal(t, 7, code) + assert.Equal(t, "", stdout) +} + +// --- Transliterate edge cases --- + +func TestPentestTransliterateMismatch(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + _, stderr, code := cmdRun(t, `sed 'y/abc/de/' f.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "same length") +} + +func TestPentestTransliterateEmpty(t *testing.T) { + dir := pentestDir(t, map[string]string{"f.txt": "test\n"}) + stdout, _, code := cmdRun(t, `sed 'y///' f.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "test\n", stdout) // No transliteration, identity. 
+} + +// --- Symlinks --- + +func TestPentestSymlinkToRegularFile(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dir, "real.txt"), []byte("hello\n"), 0644)) + require.NoError(t, os.Symlink("real.txt", filepath.Join(dir, "link.txt"))) + stdout, _, code := cmdRun(t, `sed 's/hello/bye/' link.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "bye\n", stdout) +} + +func TestPentestDanglingSymlink(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.Symlink(filepath.Join(dir, "nonexistent"), filepath.Join(dir, "dangling.txt"))) + _, stderr, code := cmdRun(t, `sed 's/a/b/' dangling.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} diff --git a/interp/builtins/tests/sed/sed_test.go b/interp/builtins/tests/sed/sed_test.go new file mode 100644 index 00000000..ef4e3d14 --- /dev/null +++ b/interp/builtins/tests/sed/sed_test.go @@ -0,0 +1,607 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package sed_test + +import ( + "bytes" + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/DataDog/rshell/interp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "mvdan.cc/sh/v3/syntax" +) + +func runScript(t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { + t.Helper() + return runScriptCtx(context.Background(), t, script, dir, opts...) 
+} + +func runScriptCtx(ctx context.Context, t *testing.T, script, dir string, opts ...interp.RunnerOption) (string, string, int) { + t.Helper() + parser := syntax.NewParser() + prog, err := parser.Parse(strings.NewReader(script), "") + require.NoError(t, err) + var outBuf, errBuf bytes.Buffer + allOpts := append([]interp.RunnerOption{interp.StdIO(nil, &outBuf, &errBuf)}, opts...) + runner, err := interp.New(allOpts...) + require.NoError(t, err) + defer runner.Close() + if dir != "" { + runner.Dir = dir + } + err = runner.Run(ctx, prog) + exitCode := 0 + if err != nil { + var es interp.ExitStatus + if errors.As(err, &es) { + exitCode = int(es) + } else if ctx.Err() == nil { + t.Fatalf("unexpected error: %v", err) + } + } + return outBuf.String(), errBuf.String(), exitCode +} + +func cmdRun(t *testing.T, script, dir string) (stdout, stderr string, exitCode int) { + t.Helper() + return runScript(t, script, dir, interp.AllowedPaths([]string{dir})) +} + +func writeFile(t *testing.T, dir, name, content string) { + t.Helper() + err := os.WriteFile(filepath.Join(dir, name), []byte(content), 0644) + require.NoError(t, err) +} + +func setupDir(t *testing.T, files map[string]string) string { + t.Helper() + dir := t.TempDir() + for name, content := range files { + writeFile(t, dir, name, content) + } + return dir +} + +// --- Basic Substitution --- + +func TestSubstituteBasic(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello world\n", + }) + stdout, _, code := cmdRun(t, `sed 's/world/earth/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hello earth\n", stdout) +} + +func TestSubstituteGlobal(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "aaa bbb aaa\n", + }) + stdout, _, code := cmdRun(t, `sed 's/aaa/zzz/g' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "zzz bbb zzz\n", stdout) +} + +func TestSubstituteNth(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "ab ab ab ab\n", 
+ }) + stdout, _, code := cmdRun(t, `sed 's/ab/XY/2' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "ab XY ab ab\n", stdout) +} + +func TestSubstituteCaseInsensitive(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "Hello HELLO hello\n", + }) + stdout, _, code := cmdRun(t, `sed 's/hello/bye/i' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "bye HELLO hello\n", stdout) +} + +func TestSubstituteAlternateDelimiter(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "/usr/local/bin\n", + }) + stdout, _, code := cmdRun(t, `sed 's|/usr/local|/opt|' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "/opt/bin\n", stdout) +} + +func TestSubstituteAmpersand(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + stdout, _, code := cmdRun(t, `sed 's/hello/[&]/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "[hello]\n", stdout) +} + +func TestSubstituteEmptyPattern(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + stdout, _, code := cmdRun(t, `sed 's/^/prefix: /' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "prefix: hello\n", stdout) +} + +func TestSubstituteWithPrint(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "aaa\nbbb\naaa\n", + }) + stdout, _, code := cmdRun(t, `sed -n 's/aaa/zzz/p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "zzz\nzzz\n", stdout) +} + +// --- Print and Output --- + +func TestPrint(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\n", + }) + stdout, _, code := cmdRun(t, `sed 'p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\nline1\nline2\nline2\n", stdout) +} + +func TestSuppressAutoPrint(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\n", + }) + stdout, _, code := cmdRun(t, `sed -n 'p' input.txt`, dir) + assert.Equal(t, 0, 
code) + assert.Equal(t, "line1\nline2\nline3\n", stdout) +} + +func TestLineNumber(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "aaa\nbbb\nccc\n", + }) + stdout, _, code := cmdRun(t, `sed '=' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1\naaa\n2\nbbb\n3\nccc\n", stdout) +} + +func TestPrintUnambiguous(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\tworld\n", + }) + stdout, _, code := cmdRun(t, `sed -n 'l' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hello\\tworld$\n", stdout) +} + +// --- Delete --- + +func TestDeleteBasic(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\n", + }) + stdout, _, code := cmdRun(t, `sed '2d' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\nline3\n", stdout) +} + +func TestDeleteRange(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\nline4\nline5\n", + }) + stdout, _, code := cmdRun(t, `sed '2,4d' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\nline5\n", stdout) +} + +// --- Addressing --- + +func TestAddressLine(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "first\nsecond\nthird\n", + }) + stdout, _, code := cmdRun(t, `sed '2s/second/SECOND/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "first\nSECOND\nthird\n", stdout) +} + +func TestAddressLastLine(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "first\nsecond\nthird\n", + }) + stdout, _, code := cmdRun(t, `sed '$s/third/THIRD/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "first\nsecond\nTHIRD\n", stdout) +} + +func TestAddressRegex(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "apple\nbanana\ncherry\n", + }) + stdout, _, code := cmdRun(t, `sed '/banana/d' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "apple\ncherry\n", stdout) +} + +func 
TestAddressRange(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "1\n2\n3\n4\n5\n", + }) + stdout, _, code := cmdRun(t, `sed -n '2,4p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "2\n3\n4\n", stdout) +} + +func TestAddressRegexRange(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "start\nmiddle1\nmiddle2\nend\nafter\n", + }) + stdout, _, code := cmdRun(t, `sed -n '/start/,/end/p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "start\nmiddle1\nmiddle2\nend\n", stdout) +} + +func TestAddressNegation(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "keep\ndelete\nkeep\n", + }) + stdout, _, code := cmdRun(t, `sed '/keep/!d' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "keep\nkeep\n", stdout) +} + +func TestAddressStep(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "1\n2\n3\n4\n5\n6\n", + }) + stdout, _, code := cmdRun(t, `sed -n '1~2p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "1\n3\n5\n", stdout) +} + +// --- Text Commands --- + +func TestAppend(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\n", + }) + stdout, _, code := cmdRun(t, `sed '1a\appended' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\nappended\nline2\n", stdout) +} + +func TestInsert(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\n", + }) + stdout, _, code := cmdRun(t, `sed '2i\inserted' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\ninserted\nline2\n", stdout) +} + +func TestChange(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\n", + }) + stdout, _, code := cmdRun(t, `sed '2c\changed' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\nchanged\nline3\n", stdout) +} + +// --- Hold Space --- + +func TestHoldCopy(t *testing.T) { + dir := setupDir(t, map[string]string{ 
+ "input.txt": "first\nsecond\n", + }) + // Copy first line to hold space, on second line replace pattern with hold + stdout, _, code := cmdRun(t, `sed -n '1h;2{g;p}' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "first\n", stdout) +} + +func TestHoldAppend(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "a\nb\nc\n", + }) + // Accumulate all lines in hold space, print at end + stdout, _, code := cmdRun(t, `sed -n 'H;${g;p}' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "\na\nb\nc\n", stdout) +} + +func TestExchange(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "pattern\n", + }) + // Exchange swaps pattern space (content) with hold space (initially empty) + stdout, _, code := cmdRun(t, `sed -n 'x;p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "\n", stdout) +} + +// --- Branching --- + +func TestBranch(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + // b with no label branches to end of script, skipping subsequent commands + stdout, _, code := cmdRun(t, `sed 'b;s/hello/bye/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hello\n", stdout) +} + +func TestBranchLabel(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + stdout, _, code := cmdRun(t, "sed 'b skip;s/hello/bye/;:skip' input.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hello\n", stdout) +} + +func TestBranchConditional(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "aXb\n", + }) + // t branches if substitution was made + stdout, _, code := cmdRun(t, `sed 's/X/Y/;t done;s/a/Z/;:done' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "aYb\n", stdout) +} + +func TestBranchConditionalNoSub(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\nworld\n", + }) + // T branches if NO substitution was made. 
+ // On "hello": s/hello/HI/ succeeds, T does not branch, s/HI/BYE/ runs → "BYE" + // On "world": s/hello/HI/ fails, T branches to done, s/HI/BYE/ skipped → "world" + stdout, _, code := cmdRun(t, `sed 's/hello/HI/;T done;s/HI/BYE/;:done' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "BYE\nworld\n", stdout) +} + +// --- Next Line --- + +func TestNext(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\nline4\n", + }) + // n prints current line (unless -n), reads next line into pattern space + stdout, _, code := cmdRun(t, `sed -n 'n;p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line2\nline4\n", stdout) +} + +func TestNextAppend(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\n", + }) + // N appends next line to pattern space with embedded newline + stdout, _, code := cmdRun(t, `sed -n 'N;p' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "line1") + assert.Contains(t, stdout, "line2") +} + +// --- Transliterate --- + +func TestTransliterate(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + stdout, _, code := cmdRun(t, `sed 'y/helo/HELO/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "HELLO\n", stdout) +} + +// --- Quit --- + +func TestQuit(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\n", + }) + // q prints current line then exits + stdout, _, code := cmdRun(t, `sed '2q' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\nline2\n", stdout) +} + +func TestQuitNoPrint(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "line1\nline2\nline3\n", + }) + // Q exits without printing current line + stdout, _, code := cmdRun(t, `sed '2Q' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "line1\n", stdout) +} + +// --- Multiple Expressions --- + +func TestMultipleExpressions(t *testing.T) { + 
dir := setupDir(t, map[string]string{ + "input.txt": "hello world\n", + }) + stdout, _, code := cmdRun(t, `sed -e 's/hello/hi/' -e 's/world/earth/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hi earth\n", stdout) +} + +func TestSemicolonSeparator(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello world\n", + }) + stdout, _, code := cmdRun(t, `sed 's/hello/hi/;s/world/earth/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hi earth\n", stdout) +} + +// --- Extended Regex --- + +func TestExtendedRegex(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "abc123def\n", + }) + stdout, _, code := cmdRun(t, `sed -E 's/[0-9]+/NUM/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "abcNUMdef\n", stdout) +} + +func TestExtendedRegexR(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "abc123def\n", + }) + stdout, _, code := cmdRun(t, `sed -r 's/[0-9]+/NUM/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "abcNUMdef\n", stdout) +} + +// --- Stdin --- + +func TestStdinPipe(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello world\n", + }) + stdout, _, code := cmdRun(t, `cat input.txt | sed 's/world/earth/'`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hello earth\n", stdout) +} + +func TestStdinDash(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello world\n", + }) + stdout, _, code := cmdRun(t, `cat input.txt | sed 's/world/earth/' -`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "hello earth\n", stdout) +} + +// --- Edge Cases --- + +func TestEmptyFile(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "", + }) + stdout, _, code := cmdRun(t, `sed 's/a/b/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stdout) +} + +func TestSingleLine(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "only line\n", + }) + stdout, _, code := 
cmdRun(t, `sed 's/only/single/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "single line\n", stdout) +} + +func TestNoTrailingNewline(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "no newline", + }) + stdout, _, code := cmdRun(t, `sed 's/no/with/' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "with newline\n", stdout) +} + +func TestMultipleFiles(t *testing.T) { + dir := setupDir(t, map[string]string{ + "a.txt": "alpha\n", + "b.txt": "beta\n", + }) + stdout, _, code := cmdRun(t, `sed 's/^/> /' a.txt b.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "> alpha\n> beta\n", stdout) +} + +// --- Error Cases --- + +func TestMissingFile(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, `sed 's/a/b/' nonexistent.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestNoScript(t *testing.T) { + dir := t.TempDir() + _, stderr, code := cmdRun(t, `sed`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestInvalidRegex(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + _, stderr, code := cmdRun(t, `sed 's/[invalid/replacement/' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "sed:") +} + +func TestBlockedWriteCommand(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + _, stderr, code := cmdRun(t, `sed 'w output.txt' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestBlockedExecuteCommand(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + _, stderr, code := cmdRun(t, `sed 'e' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestBlockedInPlaceFlag(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + _, stderr, code := cmdRun(t, `sed -i 's/hello/bye/' input.txt`, dir) + 
assert.NotEqual(t, 0, code) + assert.Contains(t, stderr, "sed:") +} + +func TestBlockedReadCommand(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + _, stderr, code := cmdRun(t, `sed 'r other.txt' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +func TestBlockedWriteFlag(t *testing.T) { + dir := setupDir(t, map[string]string{ + "input.txt": "hello\n", + }) + _, stderr, code := cmdRun(t, `sed 's/hello/bye/w output.txt' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "blocked") +} + +// --- Help --- + +func TestHelp(t *testing.T) { + dir := t.TempDir() + stdout, _, code := cmdRun(t, `sed --help`, dir) + assert.Equal(t, 0, code) + assert.Contains(t, stdout, "Usage:") +} diff --git a/interp/register_builtins.go b/interp/register_builtins.go index 8d7f50d5..7fbb05b5 100644 --- a/interp/register_builtins.go +++ b/interp/register_builtins.go @@ -19,6 +19,7 @@ import ( "github.com/DataDog/rshell/interp/builtins/grep" "github.com/DataDog/rshell/interp/builtins/head" "github.com/DataDog/rshell/interp/builtins/ls" + "github.com/DataDog/rshell/interp/builtins/sed" "github.com/DataDog/rshell/interp/builtins/strings_cmd" "github.com/DataDog/rshell/interp/builtins/tail" "github.com/DataDog/rshell/interp/builtins/testcmd" @@ -42,6 +43,7 @@ func registerBuiltins() { grep.Cmd, head.Cmd, ls.Cmd, + sed.Cmd, strings_cmd.Cmd, tail.Cmd, testcmd.Cmd, diff --git a/tests/allowed_symbols_test.go b/tests/allowed_symbols_test.go index e74d5d17..380b776c 100644 --- a/tests/allowed_symbols_test.go +++ b/tests/allowed_symbols_test.go @@ -34,10 +34,14 @@ import ( var builtinAllowedSymbols = []string{ // bufio.NewScanner — line-by-line input reading (e.g. head, cat); no write or exec capability. "bufio.NewScanner", + // bufio.Scanner — scanner type for buffered input reading; no write or exec capability. 
+ "bufio.Scanner", // bufio.SplitFunc — type for custom scanner split functions; pure type, no I/O. "bufio.SplitFunc", // context.Context — deadline/cancellation plumbing; pure interface, no side effects. "context.Context", + // errors.As — error type assertion; pure function, no I/O. + "errors.As", // errors.Is — error comparison; pure function, no I/O. "errors.Is", // errors.New — creates a simple error value; pure function, no I/O. diff --git a/tests/scenarios/cmd/sed/address/last_line.yaml b/tests/scenarios/cmd/sed/address/last_line.yaml new file mode 100644 index 00000000..2839e4d5 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/last_line.yaml @@ -0,0 +1,13 @@ +description: The $ address matches the last line of input. +setup: + files: + - path: input.txt + content: "first\nmiddle\nlast\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '$s/last/LAST/' input.txt +expect: + stdout: "first\nmiddle\nLAST\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/line.yaml b/tests/scenarios/cmd/sed/address/line.yaml new file mode 100644 index 00000000..bb405cc4 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/line.yaml @@ -0,0 +1,13 @@ +description: Specific line number address applies command to that line only. +setup: + files: + - path: input.txt + content: "one\ntwo\nthree\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2s/two/TWO/' input.txt +expect: + stdout: "one\nTWO\nthree\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/negation.yaml b/tests/scenarios/cmd/sed/address/negation.yaml new file mode 100644 index 00000000..eb8aa059 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/negation.yaml @@ -0,0 +1,13 @@ +description: The ! negation applies command to non-matching lines. 
+setup: + files: + - path: input.txt + content: "keep\ndelete\nkeep\ndelete\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/delete/!s/keep/KEEP/' input.txt +expect: + stdout: "KEEP\ndelete\nKEEP\ndelete\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/range.yaml b/tests/scenarios/cmd/sed/address/range.yaml new file mode 100644 index 00000000..704fbbe6 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/range.yaml @@ -0,0 +1,13 @@ +description: Address range applies command from first to second address inclusive. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\nline4\nline5\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2,4s/line/LINE/' input.txt +expect: + stdout: "line1\nLINE2\nLINE3\nLINE4\nline5\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/regex.yaml b/tests/scenarios/cmd/sed/address/regex.yaml new file mode 100644 index 00000000..bed2a5cc --- /dev/null +++ b/tests/scenarios/cmd/sed/address/regex.yaml @@ -0,0 +1,13 @@ +description: Regex address applies command to matching lines. +setup: + files: + - path: input.txt + content: "apple\nbanana\napricot\ncherry\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/^a/s/a/A/' input.txt +expect: + stdout: "Apple\nbanana\nApricot\ncherry\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/step.yaml b/tests/scenarios/cmd/sed/address/step.yaml new file mode 100644 index 00000000..5bbaf82d --- /dev/null +++ b/tests/scenarios/cmd/sed/address/step.yaml @@ -0,0 +1,14 @@ +description: The first~step address matches every step-th line starting from first. 
+skip_assert_against_bash: true +setup: + files: + - path: input.txt + content: "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '1~3p' input.txt +expect: + stdout: "1\n4\n7\n10\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/branch/basic.yaml b/tests/scenarios/cmd/sed/branch/basic.yaml new file mode 100644 index 00000000..c51d1fc2 --- /dev/null +++ b/tests/scenarios/cmd/sed/branch/basic.yaml @@ -0,0 +1,13 @@ +description: The b command with no label branches to end of script. +setup: + files: + - path: input.txt + content: "aaa\nbbb\nccc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/bbb/b; s/^/>> /' input.txt +expect: + stdout: ">> aaa\nbbb\n>> ccc\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/branch/conditional.yaml b/tests/scenarios/cmd/sed/branch/conditional.yaml new file mode 100644 index 00000000..cc077eeb --- /dev/null +++ b/tests/scenarios/cmd/sed/branch/conditional.yaml @@ -0,0 +1,13 @@ +description: The t command branches only if a substitution was made. +setup: + files: + - path: input.txt + content: "foo bar\nhello world\nfoo baz\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/FOO/; t done; s/hello/HELLO/; :done' input.txt +expect: + stdout: "FOO bar\nHELLO world\nFOO baz\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/branch/label.yaml b/tests/scenarios/cmd/sed/branch/label.yaml new file mode 100644 index 00000000..2218011e --- /dev/null +++ b/tests/scenarios/cmd/sed/branch/label.yaml @@ -0,0 +1,13 @@ +description: The b command branches to a named label defined with colon. 
+setup: + files: + - path: input.txt + content: "aaa\nbbb\nccc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '/bbb/b skip; p; :skip' input.txt +expect: + stdout: "aaa\nccc\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/delete/basic.yaml b/tests/scenarios/cmd/sed/delete/basic.yaml new file mode 100644 index 00000000..07be9ddd --- /dev/null +++ b/tests/scenarios/cmd/sed/delete/basic.yaml @@ -0,0 +1,13 @@ +description: The d command deletes lines matching an address. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2d' input.txt +expect: + stdout: "line1\nline3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/delete/range.yaml b/tests/scenarios/cmd/sed/delete/range.yaml new file mode 100644 index 00000000..3450eea3 --- /dev/null +++ b/tests/scenarios/cmd/sed/delete/range.yaml @@ -0,0 +1,13 @@ +description: The d command deletes a range of lines. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\nline4\nline5\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2,4d' input.txt +expect: + stdout: "line1\nline5\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/edge/empty_file.yaml b/tests/scenarios/cmd/sed/edge/empty_file.yaml new file mode 100644 index 00000000..67509b3b --- /dev/null +++ b/tests/scenarios/cmd/sed/edge/empty_file.yaml @@ -0,0 +1,13 @@ +description: Sed handles an empty input file without error. +setup: + files: + - path: empty.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/a/b/' empty.txt +expect: + stdout: "" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/edge/no_trailing_newline.yaml b/tests/scenarios/cmd/sed/edge/no_trailing_newline.yaml new file mode 100644 index 00000000..63143a36 --- /dev/null +++ b/tests/scenarios/cmd/sed/edge/no_trailing_newline.yaml @@ -0,0 +1,13 @@ +description: Sed processes a file without a trailing newline. 
+setup: + files: + - path: input.txt + content: "no newline at end" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/end/END/' input.txt +expect: + stdout: "no newline at END\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/edge/single_line.yaml b/tests/scenarios/cmd/sed/edge/single_line.yaml new file mode 100644 index 00000000..97b06b61 --- /dev/null +++ b/tests/scenarios/cmd/sed/edge/single_line.yaml @@ -0,0 +1,13 @@ +description: Sed processes a single line file correctly. +setup: + files: + - path: input.txt + content: "only line\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/only/the only/' input.txt +expect: + stdout: "the only line\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/errors/blocked_execute.yaml b/tests/scenarios/cmd/sed/errors/blocked_execute.yaml new file mode 100644 index 00000000..89678dcb --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_execute.yaml @@ -0,0 +1,9 @@ +description: The e command is blocked for safety. +skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 'e' +expect: + stdout: "" + stderr: "sed: 'e' command is blocked: command execution is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/blocked_inplace.yaml b/tests/scenarios/cmd/sed/errors/blocked_inplace.yaml new file mode 100644 index 00000000..8db911f8 --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_inplace.yaml @@ -0,0 +1,9 @@ +description: The -i flag is blocked for safety. +skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed -i 's/test/replaced/' +expect: + stdout: "" + stderr_contains: ["unknown shorthand flag"] + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/blocked_read.yaml b/tests/scenarios/cmd/sed/errors/blocked_read.yaml new file mode 100644 index 00000000..5b069997 --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_read.yaml @@ -0,0 +1,9 @@ +description: The r command is blocked for safety. 
+skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 'r somefile.txt' +expect: + stdout: "" + stderr: "sed: 'r' command is blocked: unsandboxed file reading is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/blocked_write.yaml b/tests/scenarios/cmd/sed/errors/blocked_write.yaml new file mode 100644 index 00000000..7336792c --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_write.yaml @@ -0,0 +1,9 @@ +description: The w flag of the s command is blocked for safety. +skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 's/test/replaced/w output.txt' +expect: + stdout: "" + stderr: "sed: 'w' flag in 's' command is blocked: file writing is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/invalid_regex.yaml b/tests/scenarios/cmd/sed/errors/invalid_regex.yaml new file mode 100644 index 00000000..9d2c8f2c --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/invalid_regex.yaml @@ -0,0 +1,8 @@ +description: Sed reports an error for an invalid regular expression. +input: + script: |+ + echo "test" | sed 's/[invalid/replace/' +expect: + stdout: "" + stderr_contains: ["sed:"] + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/missing_file.yaml b/tests/scenarios/cmd/sed/errors/missing_file.yaml new file mode 100644 index 00000000..c708b71a --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/missing_file.yaml @@ -0,0 +1,13 @@ +description: Sed reports an error for a non-existent input file. +setup: + files: + - path: dummy.txt + content: "" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/a/b/' nonexistent.txt +expect: + stdout: "" + stderr_contains: ["nonexistent.txt"] + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/no_script.yaml b/tests/scenarios/cmd/sed/errors/no_script.yaml new file mode 100644 index 00000000..de7ab0ce --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/no_script.yaml @@ -0,0 +1,8 @@ +description: Sed reports an error when no script is provided. 
+input: + script: |+ + sed +expect: + stdout: "" + stderr: "sed: no script command has been specified\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/hold/append.yaml b/tests/scenarios/cmd/sed/hold/append.yaml new file mode 100644 index 00000000..5e6af229 --- /dev/null +++ b/tests/scenarios/cmd/sed/hold/append.yaml @@ -0,0 +1,13 @@ +description: The H command appends each line to hold space and g copies hold space back to pattern space. +setup: + files: + - path: input.txt + content: "aaa\nbbb\nccc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n 'H; ${g;p}' input.txt +expect: + stdout: "\naaa\nbbb\nccc\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/hold/copy.yaml b/tests/scenarios/cmd/sed/hold/copy.yaml new file mode 100644 index 00000000..4e415ef5 --- /dev/null +++ b/tests/scenarios/cmd/sed/hold/copy.yaml @@ -0,0 +1,13 @@ +description: The h and g commands copy between pattern and hold space. +setup: + files: + - path: input.txt + content: "first\nsecond\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '1h; 2{g;p}' input.txt +expect: + stdout: "first\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/hold/exchange.yaml b/tests/scenarios/cmd/sed/hold/exchange.yaml new file mode 100644 index 00000000..248641de --- /dev/null +++ b/tests/scenarios/cmd/sed/hold/exchange.yaml @@ -0,0 +1,13 @@ +description: The x command exchanges pattern and hold space. +setup: + files: + - path: input.txt + content: "first\nsecond\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '1{h;d}; 2{x;p;x;p}' input.txt +expect: + stdout: "first\nsecond\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/multiple/multi_e.yaml b/tests/scenarios/cmd/sed/multiple/multi_e.yaml new file mode 100644 index 00000000..efebb9dc --- /dev/null +++ b/tests/scenarios/cmd/sed/multiple/multi_e.yaml @@ -0,0 +1,13 @@ +description: Multiple -e flags apply expressions in order. 
+setup: + files: + - path: input.txt + content: "hello world\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -e 's/hello/goodbye/' -e 's/world/earth/' input.txt +expect: + stdout: "goodbye earth\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/multiple/semicolon.yaml b/tests/scenarios/cmd/sed/multiple/semicolon.yaml new file mode 100644 index 00000000..f634e56f --- /dev/null +++ b/tests/scenarios/cmd/sed/multiple/semicolon.yaml @@ -0,0 +1,13 @@ +description: Semicolons separate multiple commands within a single script. +setup: + files: + - path: input.txt + content: "hello world\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/hello/goodbye/; s/world/earth/' input.txt +expect: + stdout: "goodbye earth\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/next/append_next.yaml b/tests/scenarios/cmd/sed/next/append_next.yaml new file mode 100644 index 00000000..7c22f4e3 --- /dev/null +++ b/tests/scenarios/cmd/sed/next/append_next.yaml @@ -0,0 +1,13 @@ +description: The N command appends next line to pattern space with embedded newline. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 'N; s/\n/ /' input.txt +expect: + stdout: "line1 line2\nline3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/next/basic.yaml b/tests/scenarios/cmd/sed/next/basic.yaml new file mode 100644 index 00000000..65411675 --- /dev/null +++ b/tests/scenarios/cmd/sed/next/basic.yaml @@ -0,0 +1,13 @@ +description: The n command prints current line and reads the next into pattern space. 
+setup: + files: + - path: input.txt + content: "line1\nline2\nline3\nline4\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 'n; s/line/LINE/' input.txt +expect: + stdout: "line1\nLINE2\nline3\nLINE4\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/print/line_number.yaml b/tests/scenarios/cmd/sed/print/line_number.yaml new file mode 100644 index 00000000..f1126371 --- /dev/null +++ b/tests/scenarios/cmd/sed/print/line_number.yaml @@ -0,0 +1,13 @@ +description: The = command prints the current line number. +setup: + files: + - path: input.txt + content: "aaa\nbbb\nccc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '=' input.txt +expect: + stdout: "1\n2\n3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/print/suppress.yaml b/tests/scenarios/cmd/sed/print/suppress.yaml new file mode 100644 index 00000000..59a0caf0 --- /dev/null +++ b/tests/scenarios/cmd/sed/print/suppress.yaml @@ -0,0 +1,13 @@ +description: The -n flag suppresses auto-print and p command prints explicitly. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '2p' input.txt +expect: + stdout: "line2\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/print/unambiguous.yaml b/tests/scenarios/cmd/sed/print/unambiguous.yaml new file mode 100644 index 00000000..07f7700b --- /dev/null +++ b/tests/scenarios/cmd/sed/print/unambiguous.yaml @@ -0,0 +1,13 @@ +description: The l command prints the pattern space unambiguously with $ at end. 
+setup: + files: + - path: input.txt + content: "hello\tworld\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n 'l' input.txt +expect: + stdout: "hello\\tworld$\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/quit/basic.yaml b/tests/scenarios/cmd/sed/quit/basic.yaml new file mode 100644 index 00000000..9dd998e1 --- /dev/null +++ b/tests/scenarios/cmd/sed/quit/basic.yaml @@ -0,0 +1,13 @@ +description: The q command prints the current line and quits. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2q' input.txt +expect: + stdout: "line1\nline2\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/quit/noprint.yaml b/tests/scenarios/cmd/sed/quit/noprint.yaml new file mode 100644 index 00000000..c555b23f --- /dev/null +++ b/tests/scenarios/cmd/sed/quit/noprint.yaml @@ -0,0 +1,13 @@ +description: The Q command quits without printing the current line. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2Q' input.txt +expect: + stdout: "line1\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/stdin/dash.yaml b/tests/scenarios/cmd/sed/stdin/dash.yaml new file mode 100644 index 00000000..6463e257 --- /dev/null +++ b/tests/scenarios/cmd/sed/stdin/dash.yaml @@ -0,0 +1,8 @@ +description: Sed reads from stdin when - is given as the file argument. +input: + script: |+ + echo "hello world" | sed 's/world/earth/' - +expect: + stdout: "hello earth\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/stdin/pipe.yaml b/tests/scenarios/cmd/sed/stdin/pipe.yaml new file mode 100644 index 00000000..cd4fd026 --- /dev/null +++ b/tests/scenarios/cmd/sed/stdin/pipe.yaml @@ -0,0 +1,8 @@ +description: Sed reads from piped stdin when no file is given. 
+input: + script: |+ + echo "hello world" | sed 's/world/earth/' +expect: + stdout: "hello earth\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/ampersand.yaml b/tests/scenarios/cmd/sed/substitute/ampersand.yaml new file mode 100644 index 00000000..727de0dd --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/ampersand.yaml @@ -0,0 +1,13 @@ +description: Ampersand in replacement refers to the matched text. +setup: + files: + - path: input.txt + content: "foo bar\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/[&]/' input.txt +expect: + stdout: "[foo] bar\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/backreference.yaml b/tests/scenarios/cmd/sed/substitute/backreference.yaml new file mode 100644 index 00000000..6a3ddbe6 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/backreference.yaml @@ -0,0 +1,13 @@ +description: Backreference in replacement using capture groups with -E flag. +setup: + files: + - path: input.txt + content: "hello world\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -E 's/(hello) (world)/\2 \1/' input.txt +expect: + stdout: "world hello\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/basic.yaml b/tests/scenarios/cmd/sed/substitute/basic.yaml new file mode 100644 index 00000000..533a7e68 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/basic.yaml @@ -0,0 +1,13 @@ +description: Basic sed substitution replaces first occurrence on each line. 
+setup: + files: + - path: input.txt + content: "hello world hello\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/hello/goodbye/' input.txt +expect: + stdout: "goodbye world hello\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/case_insensitive.yaml b/tests/scenarios/cmd/sed/substitute/case_insensitive.yaml new file mode 100644 index 00000000..1e2ad35e --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/case_insensitive.yaml @@ -0,0 +1,13 @@ +description: Case-insensitive substitution flag matches regardless of case. +setup: + files: + - path: input.txt + content: "Hello HELLO hello\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/hello/world/I' input.txt +expect: + stdout: "world HELLO hello\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/delimiter.yaml b/tests/scenarios/cmd/sed/substitute/delimiter.yaml new file mode 100644 index 00000000..bfbe9e6e --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/delimiter.yaml @@ -0,0 +1,13 @@ +description: Substitution with alternate delimiter character. +setup: + files: + - path: input.txt + content: "/usr/local/bin\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's|/usr/local|/opt|' input.txt +expect: + stdout: "/opt/bin\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/empty_match.yaml b/tests/scenarios/cmd/sed/substitute/empty_match.yaml new file mode 100644 index 00000000..281c4ee2 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/empty_match.yaml @@ -0,0 +1,13 @@ +description: Substitution matching beginning of line inserts a prefix. 
+setup: + files: + - path: input.txt + content: "line1\nline2\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/^/>> /' input.txt +expect: + stdout: ">> line1\n>> line2\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/global.yaml b/tests/scenarios/cmd/sed/substitute/global.yaml new file mode 100644 index 00000000..923aae99 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/global.yaml @@ -0,0 +1,13 @@ +description: Global substitution flag replaces all occurrences on each line. +setup: + files: + - path: input.txt + content: "aaa bbb aaa\naaa ccc aaa\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/aaa/XXX/g' input.txt +expect: + stdout: "XXX bbb XXX\nXXX ccc XXX\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/nth.yaml b/tests/scenarios/cmd/sed/substitute/nth.yaml new file mode 100644 index 00000000..9f62aba0 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/nth.yaml @@ -0,0 +1,13 @@ +description: Numeric flag replaces only the Nth occurrence. +setup: + files: + - path: input.txt + content: "one one one one\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/one/TWO/2' input.txt +expect: + stdout: "one TWO one one\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/text/append.yaml b/tests/scenarios/cmd/sed/text/append.yaml new file mode 100644 index 00000000..f8718ca0 --- /dev/null +++ b/tests/scenarios/cmd/sed/text/append.yaml @@ -0,0 +1,13 @@ +description: The a command appends text after the current line. 
+setup: + files: + - path: input.txt + content: "line1\nline2\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '1a appended' input.txt +expect: + stdout: "line1\nappended\nline2\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/text/change.yaml b/tests/scenarios/cmd/sed/text/change.yaml new file mode 100644 index 00000000..b2be4d02 --- /dev/null +++ b/tests/scenarios/cmd/sed/text/change.yaml @@ -0,0 +1,13 @@ +description: The c command replaces the current line with new text. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2c replaced' input.txt +expect: + stdout: "line1\nreplaced\nline3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/text/insert.yaml b/tests/scenarios/cmd/sed/text/insert.yaml new file mode 100644 index 00000000..75c270ae --- /dev/null +++ b/tests/scenarios/cmd/sed/text/insert.yaml @@ -0,0 +1,13 @@ +description: The i command inserts text before the current line. +setup: + files: + - path: input.txt + content: "line1\nline2\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2i inserted' input.txt +expect: + stdout: "line1\ninserted\nline2\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/transliterate/basic.yaml b/tests/scenarios/cmd/sed/transliterate/basic.yaml new file mode 100644 index 00000000..921f108f --- /dev/null +++ b/tests/scenarios/cmd/sed/transliterate/basic.yaml @@ -0,0 +1,13 @@ +description: The y command transliterates characters from source to destination set. 
+setup: + files: + - path: input.txt + content: "hello world\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 'y/helo/HELO/' input.txt +expect: + stdout: "HELLO wOrLd\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/unknown_cmd/common_progs/sed.yaml b/tests/scenarios/cmd/unknown_cmd/common_progs/sed.yaml deleted file mode 100644 index e4c28f58..00000000 --- a/tests/scenarios/cmd/unknown_cmd/common_progs/sed.yaml +++ /dev/null @@ -1,10 +0,0 @@ -skip_assert_against_bash: true -description: The sed command is not a builtin and is rejected as unknown. -input: - script: |+ - sed s/foo/bar/ file.txt -expect: - stdout: "" - stderr: |+ - sed: command not found - exit_code: 127 From 3500bd80a96b7f6b8fd2f86d17de6ade76e14de1 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 00:42:20 +0100 Subject: [PATCH 02/30] Split sed.go into types.go, parser.go, engine.go for readability Co-Authored-By: Claude Opus 4.6 --- interp/builtins/sed/engine.go | 558 ++++++++++++++++ interp/builtins/sed/parser.go | 537 +++++++++++++++ interp/builtins/sed/sed.go | 1179 --------------------------------- interp/builtins/sed/types.go | 120 ++++ 4 files changed, 1215 insertions(+), 1179 deletions(-) create mode 100644 interp/builtins/sed/engine.go create mode 100644 interp/builtins/sed/parser.go create mode 100644 interp/builtins/sed/types.go diff --git a/interp/builtins/sed/engine.go b/interp/builtins/sed/engine.go new file mode 100644 index 00000000..fed56e45 --- /dev/null +++ b/interp/builtins/sed/engine.go @@ -0,0 +1,558 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +package sed + +import ( + "bufio" + "context" + "errors" + "fmt" + "io" + "os" + "strings" + + "github.com/DataDog/rshell/interp/builtins" +) + +// engine holds the state for executing a sed script. +type engine struct { + callCtx *builtins.CallContext + prog []*sedCmd + suppressPrint bool + lineNum int64 + lastLine bool + patternSpace string + holdSpace string + appendQueue []string // text queued by 'a' command, flushed after auto-print + subMade bool // set when s/// succeeds (cleared on new input line) + totalRead int64 + isRegularFile bool +} + +// lineReader wraps a scanner with one-line look-ahead so we can determine +// whether the current line is the last one, while still allowing n/N commands +// to consume lines from the same scanner. +type lineReader struct { + sc *bufio.Scanner + nextLine string + hasNext bool + totalRead int64 + isRegularFile bool +} + +func newLineReader(sc *bufio.Scanner, isRegular bool) *lineReader { + lr := &lineReader{sc: sc, isRegularFile: isRegular} + lr.advance() // prime the look-ahead + return lr +} + +func (lr *lineReader) advance() bool { + if lr.sc.Scan() { + lr.nextLine = lr.sc.Text() + lr.totalRead += int64(len(lr.sc.Bytes())) + lr.hasNext = true + return true + } + lr.hasNext = false + return false +} + +func (lr *lineReader) readLine() (string, bool) { + if !lr.hasNext { + return "", false + } + line := lr.nextLine + lr.advance() + return line, true +} + +func (lr *lineReader) isLast() bool { + return !lr.hasNext +} + +func (lr *lineReader) checkLimit() error { + if !lr.isRegularFile && lr.totalRead > MaxTotalReadBytes { + return errors.New("input too large: read limit exceeded") + } + return nil +} + +// processFile reads a single file and runs the sed script on each line. 
+func (eng *engine) processFile(ctx context.Context, callCtx *builtins.CallContext, file string) error { + var rc io.ReadCloser + if file == "-" { + if callCtx.Stdin == nil { + return nil + } + eng.isRegularFile = isRegularFile(callCtx.Stdin) + rc = io.NopCloser(callCtx.Stdin) + } else { + f, err := callCtx.OpenFile(ctx, file, os.O_RDONLY, 0) + if err != nil { + return err + } + defer f.Close() + eng.isRegularFile = isRegularFile(f) + rc = f + } + + sc := bufio.NewScanner(rc) + buf := make([]byte, 4096) + sc.Buffer(buf, MaxLineBytes) + + lr := newLineReader(sc, eng.isRegularFile) + + for { + if ctx.Err() != nil { + return ctx.Err() + } + + line, ok := lr.readLine() + if !ok { + break + } + if err := lr.checkLimit(); err != nil { + return err + } + + eng.lineNum++ + eng.patternSpace = line + eng.lastLine = lr.isLast() + + err := eng.runCycle(ctx, lr) + if err != nil { + return err + } + } + + if err := sc.Err(); err != nil { + return err + } + return nil +} + +// runCycle executes the script for the current input line. +func (eng *engine) runCycle(ctx context.Context, lr *lineReader) error { + eng.subMade = false + eng.appendQueue = eng.appendQueue[:0] + action, err := eng.execCommandsFrom(ctx, 0, lr, 0) + if err != nil { + return err + } + if action != actionDelete && !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + // Flush queued 'a' text after auto-print (even if auto-print was suppressed or deleted). + for _, text := range eng.appendQueue { + eng.callCtx.Outf("%s\n", text) + } + return nil +} + +// execCommandsFrom executes commands starting from index startIdx in the given +// command list. For branching, it always searches the full eng.prog for labels +// and restarts from there to handle backward branches correctly. 
+func (eng *engine) execCommandsFrom(ctx context.Context, startIdx int, lr *lineReader, depth int) (actionType, error) { + return eng.execCmds(ctx, eng.prog, startIdx, lr, depth) +} + +func (eng *engine) execCmds(ctx context.Context, cmds []*sedCmd, startIdx int, lr *lineReader, depth int) (actionType, error) { + if depth > MaxBranchIterations { + return actionContinue, errors.New("branch loop limit exceeded") + } + + for i := startIdx; i < len(cmds); i++ { + if ctx.Err() != nil { + return actionContinue, ctx.Err() + } + + cmd := cmds[i] + + if cmd.kind == cmdLabel { + continue + } + + if !eng.addressMatch(cmd) { + continue + } + + switch cmd.kind { + case cmdSubstitute: + if err := eng.execSubstitute(cmd); err != nil { + return actionContinue, err + } + + case cmdPrint: + eng.callCtx.Outf("%s\n", eng.patternSpace) + + case cmdDelete: + return actionDelete, nil + + case cmdPrintFirstLine: + if idx := strings.IndexByte(eng.patternSpace, '\n'); idx >= 0 { + eng.callCtx.Outf("%s\n", eng.patternSpace[:idx]) + } else { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + + case cmdDeleteFirstLine: + if idx := strings.IndexByte(eng.patternSpace, '\n'); idx >= 0 { + eng.patternSpace = eng.patternSpace[idx+1:] + // Restart the cycle with the remaining pattern space. 
+ eng.subMade = false + eng.appendQueue = eng.appendQueue[:0] + return eng.execCommandsFrom(ctx, 0, lr, depth+1) + } + return actionDelete, nil + + case cmdQuit: + if !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + return actionContinue, &quitError{code: cmd.quitCode} + + case cmdQuitNoprint: + return actionContinue, &quitError{code: cmd.quitCode} + + case cmdTransliterate: + eng.patternSpace = eng.transliterate(eng.patternSpace, cmd.transFrom, cmd.transTo) + + case cmdAppend: + eng.appendQueue = append(eng.appendQueue, cmd.text) + + case cmdInsert: + eng.callCtx.Outf("%s\n", cmd.text) + + case cmdChange: + eng.callCtx.Outf("%s\n", cmd.text) + return actionDelete, nil + + case cmdLineNum: + eng.callCtx.Outf("%d\n", eng.lineNum) + + case cmdPrintUnambig: + eng.printUnambiguous() + + case cmdNext: + if !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + for _, text := range eng.appendQueue { + eng.callCtx.Outf("%s\n", text) + } + eng.appendQueue = eng.appendQueue[:0] + line, ok := lr.readLine() + if ok { + if err := lr.checkLimit(); err != nil { + return actionContinue, err + } + eng.lineNum++ + eng.patternSpace = line + eng.lastLine = lr.isLast() + } else { + eng.lastLine = true + return actionContinue, nil + } + + case cmdNextAppend: + line, ok := lr.readLine() + if ok { + if err := lr.checkLimit(); err != nil { + return actionContinue, err + } + eng.lineNum++ + if len(eng.patternSpace)+1+len(line) > MaxSpaceBytes { + return actionContinue, errors.New("pattern space exceeded size limit") + } + eng.patternSpace += "\n" + line + eng.lastLine = lr.isLast() + } else { + if !eng.suppressPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + return actionDelete, nil + } + + case cmdHoldCopy: + eng.holdSpace = eng.patternSpace + + case cmdHoldAppend: + if len(eng.holdSpace)+1+len(eng.patternSpace) > MaxSpaceBytes { + return actionContinue, errors.New("hold space exceeded size limit") + } + eng.holdSpace += "\n" + 
eng.patternSpace + + case cmdGetCopy: + eng.patternSpace = eng.holdSpace + + case cmdGetAppend: + if len(eng.patternSpace)+1+len(eng.holdSpace) > MaxSpaceBytes { + return actionContinue, errors.New("pattern space exceeded size limit") + } + eng.patternSpace += "\n" + eng.holdSpace + + case cmdExchange: + eng.patternSpace, eng.holdSpace = eng.holdSpace, eng.patternSpace + + case cmdBranch: + return eng.branchTo(ctx, cmd.label, lr, depth) + + case cmdBranchIfSub: + if eng.subMade { + eng.subMade = false + return eng.branchTo(ctx, cmd.label, lr, depth) + } + + case cmdBranchIfNoSub: + if !eng.subMade { + return eng.branchTo(ctx, cmd.label, lr, depth) + } + + case cmdGroup: + action, err := eng.execCmds(ctx, cmd.children, 0, lr, depth) + if err != nil || action != actionContinue { + return action, err + } + + case cmdNoop, cmdLabel: + // Do nothing. + } + } + + return actionContinue, nil +} + +func findLabel(cmds []*sedCmd, label string) int { + for i, cmd := range cmds { + if cmd.kind == cmdLabel && cmd.label == label { + return i + } + if cmd.kind == cmdGroup { + // Labels inside groups are visible from the top level in GNU sed. + if idx := findLabel(cmd.children, label); idx >= 0 { + // Return the group's index since we can't index into children from here. + return i + } + } + } + return -1 +} + +// branchTo resolves a label and continues execution from the command after it. +// An empty label branches to end of script (returns actionContinue). +func (eng *engine) branchTo(ctx context.Context, label string, lr *lineReader, depth int) (actionType, error) { + if label == "" { + return actionContinue, nil + } + target := findLabel(eng.prog, label) + if target < 0 { + return actionContinue, errors.New("undefined label '" + label + "'") + } + return eng.execCmds(ctx, eng.prog, target+1, lr, depth+1) +} + +// --- Address matching --- + +// addressMatch checks whether the current line matches the command's address. 
+func (eng *engine) addressMatch(cmd *sedCmd) bool { + match := eng.rawAddressMatch(cmd) + if cmd.negated { + return !match + } + return match +} + +func (eng *engine) rawAddressMatch(cmd *sedCmd) bool { + if cmd.addr1 == nil { + return true // no address means match all + } + + if cmd.addr2 == nil { + // Single address. + return eng.matchAddr(cmd.addr1) + } + + // Two-address range: match from addr1 to addr2 inclusive. + return eng.matchRange(cmd) +} + +func (eng *engine) matchAddr(addr *address) bool { + switch addr.kind { + case addrLine: + return eng.lineNum == addr.line + case addrLast: + return eng.lastLine + case addrRegexp: + return addr.re.MatchString(eng.patternSpace) + case addrStep: + if addr.first == 0 { + return eng.lineNum%addr.step == 0 + } + return eng.lineNum >= addr.first && (eng.lineNum-addr.first)%addr.step == 0 + } + return false +} + +func (eng *engine) matchRange(cmd *sedCmd) bool { + if cmd.inRange { + // We're inside the range. Check if addr2 closes it. + if eng.matchAddr(cmd.addr2) { + cmd.inRange = false + return true // addr2 line is still part of the range + } + return true + } + // Not in range — check if addr1 opens it. + if eng.matchAddr(cmd.addr1) { + // Check if addr2 also matches on the same line (degenerate range). 
+ if eng.matchAddr(cmd.addr2) { + return true // one-line range, don't enter inRange state + } + cmd.inRange = true + return true + } + return false +} + +// --- Command implementations --- + +func (eng *engine) execSubstitute(cmd *sedCmd) error { + var result string + if cmd.subGlobal { + result = cmd.subRe.ReplaceAllString(eng.patternSpace, expandReplacement(cmd.subReplacement)) + } else if cmd.subNth > 0 { + count := 0 + result = cmd.subRe.ReplaceAllStringFunc(eng.patternSpace, func(match string) string { + count++ + if count == cmd.subNth { + return cmd.subRe.ReplaceAllString(match, expandReplacement(cmd.subReplacement)) + } + return match + }) + } else { + loc := cmd.subRe.FindStringIndex(eng.patternSpace) + if loc != nil { + matched := eng.patternSpace[loc[0]:loc[1]] + replacement := cmd.subRe.ReplaceAllString(matched, expandReplacement(cmd.subReplacement)) + result = eng.patternSpace[:loc[0]] + replacement + eng.patternSpace[loc[1]:] + } else { + return nil + } + } + if result != eng.patternSpace { + if len(result) > MaxSpaceBytes { + return errors.New("pattern space exceeded size limit") + } + eng.subMade = true + eng.patternSpace = result + if cmd.subPrint { + eng.callCtx.Outf("%s\n", eng.patternSpace) + } + } + return nil +} + +// expandReplacement converts sed replacement syntax to Go regexp replacement. +// In sed, & means the whole match. In Go regexp, that's ${0} or $0. +// Sed uses \1-\9 for groups, Go uses $1-$9. 
+func expandReplacement(repl string) string { + var sb strings.Builder + sb.Grow(len(repl)) + for i := 0; i < len(repl); i++ { + ch := repl[i] + if ch == '&' { + sb.WriteString("${0}") + } else if ch == '\\' && i+1 < len(repl) { + next := repl[i+1] + if next >= '1' && next <= '9' { + sb.WriteByte('$') + sb.WriteByte(next) + i++ + } else if next == '&' { + sb.WriteByte('&') + i++ + } else if next == '\\' { + sb.WriteByte('\\') + i++ + } else if next == 'n' { + sb.WriteByte('\n') + i++ + } else if next == 't' { + sb.WriteByte('\t') + i++ + } else { + sb.WriteByte('\\') + sb.WriteByte(next) + i++ + } + } else { + sb.WriteByte(ch) + } + } + return sb.String() +} + +func (eng *engine) transliterate(s string, from, to []rune) string { + runes := []rune(s) + for i, r := range runes { + for j, fr := range from { + if r == fr { + runes[i] = to[j] + break + } + } + } + return string(runes) +} + +func (eng *engine) printUnambiguous() { + // l command: print pattern space showing non-printing characters. + var sb strings.Builder + col := 0 + for _, r := range eng.patternSpace { + var s string + switch { + case r == '\\': + s = "\\\\" + case r == '\a': + s = "\\a" + case r == '\b': + s = "\\b" + case r == '\f': + s = "\\f" + case r == '\r': + s = "\\r" + case r == '\t': + s = "\\t" + case r == '\n': + s = "\\n" + case r < 32 || r == 127: + s = fmt.Sprintf("\\%03o", r) + default: + s = string(r) + } + if col+len(s) >= 70 { + sb.WriteString("\\\n") + col = 0 + } + sb.WriteString(s) + col += len(s) + } + sb.WriteByte('$') + sb.WriteByte('\n') + eng.callCtx.Out(sb.String()) +} + +// isRegularFile checks whether an io.Reader is backed by a regular file. 
+func isRegularFile(r any) bool { + type stater interface{ Stat() (os.FileInfo, error) } + sf, ok := r.(stater) + if !ok { + return false + } + fi, err := sf.Stat() + return err == nil && fi.Mode().IsRegular() +} diff --git a/interp/builtins/sed/parser.go b/interp/builtins/sed/parser.go new file mode 100644 index 00000000..ac568351 --- /dev/null +++ b/interp/builtins/sed/parser.go @@ -0,0 +1,537 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package sed + +import ( + "errors" + "regexp" + "strconv" + "strings" +) + +// parser holds state during sed script parsing. +type parser struct { + input string + pos int + useERE bool +} + +func parseScript(script string, useERE bool) ([]*sedCmd, error) { + p := &parser{input: script, useERE: useERE} + cmds, err := p.parseCommands(false) + if err != nil { + return nil, err + } + return cmds, nil +} + +func (p *parser) parseCommands(inGroup bool) ([]*sedCmd, error) { + var cmds []*sedCmd + for p.pos < len(p.input) { + p.skipWhitespaceAndSemicolons() + if p.pos >= len(p.input) { + break + } + ch := p.input[p.pos] + if ch == '}' { + if inGroup { + p.pos++ // consume '}' + return cmds, nil + } + return nil, errors.New("unexpected '}'") + } + if ch == '#' { + // Comment — skip to end of line. 
+ for p.pos < len(p.input) && p.input[p.pos] != '\n' { + p.pos++ + } + continue + } + cmd, err := p.parseOneCommand() + if err != nil { + return nil, err + } + if cmd != nil { + cmds = append(cmds, cmd) + } + } + if inGroup { + return nil, errors.New("unterminated '{'") + } + return cmds, nil +} + +func (p *parser) skipWhitespaceAndSemicolons() { + for p.pos < len(p.input) { + ch := p.input[p.pos] + if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == ';' { + p.pos++ + } else { + break + } + } +} + +func (p *parser) skipSpaces() { + for p.pos < len(p.input) && (p.input[p.pos] == ' ' || p.input[p.pos] == '\t') { + p.pos++ + } +} + +func (p *parser) parseOneCommand() (*sedCmd, error) { + cmd := &sedCmd{} + + // Parse first address. + addr1, err := p.parseAddress() + if err != nil { + return nil, err + } + cmd.addr1 = addr1 + + // Check for comma (address range). + if cmd.addr1 != nil && p.pos < len(p.input) && p.input[p.pos] == ',' { + p.pos++ // consume ',' + p.skipSpaces() + addr2, err := p.parseAddress() + if err != nil { + return nil, err + } + if addr2 == nil { + return nil, errors.New("expected address after ','") + } + cmd.addr2 = addr2 + } + + p.skipSpaces() + + // Check for negation. + if p.pos < len(p.input) && p.input[p.pos] == '!' 
{ + cmd.negated = true + p.pos++ + p.skipSpaces() + } + + if p.pos >= len(p.input) { + return nil, errors.New("missing command") + } + + ch := p.input[p.pos] + p.pos++ + + switch ch { + case 's': + return p.parseSubstitute(cmd) + case 'y': + return p.parseTransliterate(cmd) + case 'p': + cmd.kind = cmdPrint + case 'P': + cmd.kind = cmdPrintFirstLine + case 'd': + cmd.kind = cmdDelete + case 'D': + cmd.kind = cmdDeleteFirstLine + case 'q': + cmd.kind = cmdQuit + cmd.quitCode = p.parseOptionalExitCode() + case 'Q': + cmd.kind = cmdQuitNoprint + cmd.quitCode = p.parseOptionalExitCode() + case 'a': + cmd.kind = cmdAppend + cmd.text = p.parseTextArg() + case 'i': + cmd.kind = cmdInsert + cmd.text = p.parseTextArg() + case 'c': + cmd.kind = cmdChange + cmd.text = p.parseTextArg() + case '=': + cmd.kind = cmdLineNum + case 'l': + cmd.kind = cmdPrintUnambig + case 'n': + cmd.kind = cmdNext + case 'N': + cmd.kind = cmdNextAppend + case 'h': + cmd.kind = cmdHoldCopy + case 'H': + cmd.kind = cmdHoldAppend + case 'g': + cmd.kind = cmdGetCopy + case 'G': + cmd.kind = cmdGetAppend + case 'x': + cmd.kind = cmdExchange + case 'b': + cmd.kind = cmdBranch + cmd.label = p.parseLabelArg() + case 't': + cmd.kind = cmdBranchIfSub + cmd.label = p.parseLabelArg() + case 'T': + cmd.kind = cmdBranchIfNoSub + cmd.label = p.parseLabelArg() + case ':': + cmd.kind = cmdLabel + cmd.label = p.parseLabelArg() + if cmd.label == "" { + return nil, errors.New("missing label name for ':'") + } + case '{': + children, err := p.parseCommands(true) + if err != nil { + return nil, err + } + cmd.kind = cmdGroup + cmd.children = children + case 'e': + return nil, errors.New("'e' command is blocked: command execution is not allowed") + case 'w': + return nil, errors.New("'w' command is blocked: file writing is not allowed") + case 'W': + return nil, errors.New("'W' command is blocked: file writing is not allowed") + case 'r': + return nil, errors.New("'r' command is blocked: unsandboxed file reading is not 
allowed") + case 'R': + return nil, errors.New("'R' command is blocked: unsandboxed file reading is not allowed") + default: + return nil, errors.New("unknown command: '" + string(ch) + "'") + } + + return cmd, nil +} + +func (p *parser) parseOptionalExitCode() uint8 { + p.skipSpaces() + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + if start == p.pos { + return 0 + } + n, err := strconv.Atoi(p.input[start:p.pos]) + if err != nil || n < 0 || n > 255 { + return 0 + } + return uint8(n) +} + +func (p *parser) parseTextArg() string { + // GNU sed allows: a\text, a text, or a\text + if p.pos < len(p.input) && p.input[p.pos] == '\\' { + p.pos++ + if p.pos < len(p.input) && p.input[p.pos] == '\n' { + p.pos++ // consume newline after backslash + } + } else { + p.skipSpaces() + } + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] != '\n' && p.input[p.pos] != ';' { + p.pos++ + } + return p.input[start:p.pos] +} + +func (p *parser) parseLabelArg() string { + p.skipSpaces() + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] != ' ' && p.input[p.pos] != '\t' && + p.input[p.pos] != '\n' && p.input[p.pos] != ';' && p.input[p.pos] != '}' { + p.pos++ + } + return p.input[start:p.pos] +} + +func (p *parser) parseAddress() (*address, error) { + if p.pos >= len(p.input) { + return nil, nil + } + + ch := p.input[p.pos] + + // Line number. + if ch >= '0' && ch <= '9' { + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + // Check for first~step syntax. 
+ if p.pos < len(p.input) && p.input[p.pos] == '~' { + first, err := strconv.ParseInt(p.input[start:p.pos], 10, 64) + if err != nil { + return nil, errors.New("invalid address: " + p.input[start:p.pos]) + } + p.pos++ // consume '~' + stepStart := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + step, err := strconv.ParseInt(p.input[stepStart:p.pos], 10, 64) + if err != nil || step <= 0 { + return nil, errors.New("invalid step in address") + } + return &address{kind: addrStep, first: first, step: step}, nil + } + n, err := strconv.ParseInt(p.input[start:p.pos], 10, 64) + if err != nil { + return nil, errors.New("invalid line number: " + p.input[start:p.pos]) + } + return &address{kind: addrLine, line: n}, nil + } + + // Last line. + if ch == '$' { + p.pos++ + return &address{kind: addrLast}, nil + } + + // Regex address. + if ch == '/' || ch == '\\' { + var delim byte + if ch == '\\' { + p.pos++ // consume '\' + if p.pos >= len(p.input) { + return nil, errors.New("expected delimiter after '\\'") + } + delim = p.input[p.pos] + } else { + delim = '/' + } + p.pos++ // consume delimiter + pattern, err := p.readUntilDelimiter(delim) + if err != nil { + return nil, err + } + re, err := p.compileRegex(pattern) + if err != nil { + return nil, err + } + return &address{kind: addrRegexp, re: re}, nil + } + + return nil, nil +} + +func (p *parser) readUntilDelimiter(delim byte) (string, error) { + var sb strings.Builder + for p.pos < len(p.input) { + ch := p.input[p.pos] + if ch == '\\' && p.pos+1 < len(p.input) { + next := p.input[p.pos+1] + if next == delim { + sb.WriteByte(delim) + p.pos += 2 + continue + } + sb.WriteByte('\\') + sb.WriteByte(next) + p.pos += 2 + continue + } + if ch == delim { + p.pos++ // consume closing delimiter + return sb.String(), nil + } + sb.WriteByte(ch) + p.pos++ + } + return "", errors.New("unterminated address regex") +} + +func (p *parser) parseSubstitute(cmd *sedCmd) (*sedCmd, error) { + if 
p.pos >= len(p.input) { + return nil, errors.New("missing delimiter for 's' command") + } + delim := p.input[p.pos] + if delim == '\\' || delim == '\n' { + return nil, errors.New("invalid delimiter for 's' command: '" + string(delim) + "'") + } + p.pos++ // consume delimiter + + // Read pattern. + pattern, err := p.readSubstPart(delim) + if err != nil { + return nil, errors.New("unterminated 's' command: " + err.Error()) + } + + // Read replacement. + replacement, err := p.readSubstPart(delim) + if err != nil { + return nil, errors.New("unterminated 's' command: " + err.Error()) + } + + // Read flags. + cmd.kind = cmdSubstitute + cmd.subReplacement = replacement + caseInsensitive := false + + for p.pos < len(p.input) { + ch := p.input[p.pos] + switch ch { + case 'g': + cmd.subGlobal = true + p.pos++ + case 'p': + cmd.subPrint = true + p.pos++ + case 'i', 'I': + caseInsensitive = true + p.pos++ + case 'w': + return nil, errors.New("'w' flag in 's' command is blocked: file writing is not allowed") + case 'e': + return nil, errors.New("'e' flag in 's' command is blocked: command execution is not allowed") + default: + if ch >= '1' && ch <= '9' { + start := p.pos + for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { + p.pos++ + } + n, err := strconv.Atoi(p.input[start:p.pos]) + if err != nil || n <= 0 { + return nil, errors.New("invalid substitution occurrence number") + } + cmd.subNth = n + continue + } + // Any other character ends the flag list. + goto flagsDone + } + } +flagsDone: + + re, err := p.compileRegex(pattern) + if err != nil { + return nil, err + } + // Apply case-insensitive flag after BRE-to-ERE conversion so (?i) isn't mangled. 
+ if caseInsensitive { + re, err = regexp.Compile("(?i)" + re.String()) + if err != nil { + return nil, errors.New("invalid regex with case-insensitive flag: " + err.Error()) + } + } + cmd.subRe = re + return cmd, nil +} + +func (p *parser) readSubstPart(delim byte) (string, error) { + var sb strings.Builder + for p.pos < len(p.input) { + ch := p.input[p.pos] + if ch == '\\' && p.pos+1 < len(p.input) { + next := p.input[p.pos+1] + if next == delim { + sb.WriteByte(delim) + p.pos += 2 + continue + } + if next == 'n' { + sb.WriteByte('\n') + p.pos += 2 + continue + } + if next == 't' { + sb.WriteByte('\t') + p.pos += 2 + continue + } + sb.WriteByte('\\') + sb.WriteByte(next) + p.pos += 2 + continue + } + if ch == delim { + p.pos++ // consume closing delimiter + return sb.String(), nil + } + sb.WriteByte(ch) + p.pos++ + } + return sb.String(), nil +} + +func (p *parser) parseTransliterate(cmd *sedCmd) (*sedCmd, error) { + if p.pos >= len(p.input) { + return nil, errors.New("missing delimiter for 'y' command") + } + delim := p.input[p.pos] + p.pos++ + + srcStr, err := p.readSubstPart(delim) + if err != nil { + return nil, err + } + dstStr, err := p.readSubstPart(delim) + if err != nil { + return nil, err + } + + src := []rune(srcStr) + dst := []rune(dstStr) + if len(src) != len(dst) { + return nil, errors.New("'y' command: source and destination must have the same length") + } + + cmd.kind = cmdTransliterate + cmd.transFrom = src + cmd.transTo = dst + return cmd, nil +} + +// compileRegex compiles a regex pattern, converting BRE to ERE if needed. +func (p *parser) compileRegex(pattern string) (*regexp.Regexp, error) { + if !p.useERE { + pattern = breToERE(pattern) + } + re, err := regexp.Compile(pattern) + if err != nil { + return nil, errors.New("invalid regex: " + err.Error()) + } + return re, nil +} + +// breToERE converts a basic regular expression to an extended one. +// In BRE: \( \) \{ \} \+ \? are special; ( ) { } + ? are literal. +// In ERE: ( ) { } + ? 
are special; \( \) etc. are literal. +func breToERE(pattern string) string { + var sb strings.Builder + sb.Grow(len(pattern)) + i := 0 + for i < len(pattern) { + if pattern[i] == '\\' && i+1 < len(pattern) { + next := pattern[i+1] + switch next { + case '(', ')', '{', '}', '+', '?', '|': + // BRE escaped special → ERE unescaped special. + sb.WriteByte(next) + i += 2 + default: + // Includes backreferences (\1-\9) which RE2 doesn't support + // but are passed through unchanged. + sb.WriteByte('\\') + sb.WriteByte(next) + i += 2 + } + } else { + ch := pattern[i] + switch ch { + case '(', ')', '{', '}', '+', '?', '|': + // In BRE these are literal; escape them for ERE. + sb.WriteByte('\\') + sb.WriteByte(ch) + default: + sb.WriteByte(ch) + } + i++ + } + } + return sb.String() +} diff --git a/interp/builtins/sed/sed.go b/interp/builtins/sed/sed.go index c95a4acf..72d3966f 100644 --- a/interp/builtins/sed/sed.go +++ b/interp/builtins/sed/sed.go @@ -108,14 +108,8 @@ package sed import ( - "bufio" "context" "errors" - "fmt" - "io" - "os" - "regexp" - "strconv" "strings" "github.com/DataDog/rshell/interp/builtins" @@ -236,1176 +230,3 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { return builtins.Result{} } } - -// --- Error types --- - -// quitError signals a q or Q command with an exit code. -type quitError struct { - code uint8 -} - -func (e *quitError) Error() string { - return fmt.Sprintf("quit with code %d", e.code) -} - -// --- Address types --- - -// addrType distinguishes different address kinds. -type addrType int - -const ( - addrNone addrType = iota - addrLine // specific line number - addrLast // $ (last line) - addrRegexp // /regex/ - addrStep // first~step (GNU extension) -) - -// address represents a sed address (line number, regex, or $). 
-type address struct { - kind addrType - line int64 // for addrLine - re *regexp.Regexp // for addrRegexp - first int64 // for addrStep - step int64 // for addrStep -} - -// --- Command types --- - -// cmdType identifies the sed command. -type cmdType int - -const ( - cmdSubstitute cmdType = iota - cmdPrint - cmdDelete - cmdQuit - cmdQuitNoprint - cmdTransliterate - cmdAppend - cmdInsert - cmdChange - cmdLineNum - cmdPrintUnambig - cmdNext - cmdNextAppend - cmdHoldCopy - cmdHoldAppend - cmdGetCopy - cmdGetAppend - cmdExchange - cmdBranch - cmdLabel - cmdBranchIfSub - cmdBranchIfNoSub - cmdPrintFirstLine // P: print up to first embedded newline - cmdDeleteFirstLine // D: delete up to first embedded newline, restart cycle - cmdGroup - cmdNoop -) - -// sedCmd represents a single parsed sed command. -type sedCmd struct { - addr1 *address - addr2 *address - negated bool - inRange bool // stateful: tracks whether we're inside a two-address range - kind cmdType - - // For s command: - subRe *regexp.Regexp - subReplacement string - subGlobal bool - subPrint bool - subNth int - - // For y command: - transFrom []rune - transTo []rune - - // For a, i, c commands: - text string - - // For q, Q commands: - quitCode uint8 - - // For b, t, T commands: - label string - - // For { ... } grouping: - children []*sedCmd -} - -// --- Parser --- - -// parser holds state during sed script parsing. 
-type parser struct { - input string - pos int - useERE bool -} - -func parseScript(script string, useERE bool) ([]*sedCmd, error) { - p := &parser{input: script, useERE: useERE} - cmds, err := p.parseCommands(false) - if err != nil { - return nil, err - } - return cmds, nil -} - -func (p *parser) parseCommands(inGroup bool) ([]*sedCmd, error) { - var cmds []*sedCmd - for p.pos < len(p.input) { - p.skipWhitespaceAndSemicolons() - if p.pos >= len(p.input) { - break - } - ch := p.input[p.pos] - if ch == '}' { - if inGroup { - p.pos++ // consume '}' - return cmds, nil - } - return nil, errors.New("unexpected '}'") - } - if ch == '#' { - // Comment — skip to end of line. - for p.pos < len(p.input) && p.input[p.pos] != '\n' { - p.pos++ - } - continue - } - cmd, err := p.parseOneCommand() - if err != nil { - return nil, err - } - if cmd != nil { - cmds = append(cmds, cmd) - } - } - if inGroup { - return nil, errors.New("unterminated '{'") - } - return cmds, nil -} - -func (p *parser) skipWhitespaceAndSemicolons() { - for p.pos < len(p.input) { - ch := p.input[p.pos] - if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == ';' { - p.pos++ - } else { - break - } - } -} - -func (p *parser) skipSpaces() { - for p.pos < len(p.input) && (p.input[p.pos] == ' ' || p.input[p.pos] == '\t') { - p.pos++ - } -} - -func (p *parser) parseOneCommand() (*sedCmd, error) { - cmd := &sedCmd{} - - // Parse first address. - addr1, err := p.parseAddress() - if err != nil { - return nil, err - } - cmd.addr1 = addr1 - - // Check for comma (address range). - if cmd.addr1 != nil && p.pos < len(p.input) && p.input[p.pos] == ',' { - p.pos++ // consume ',' - p.skipSpaces() - addr2, err := p.parseAddress() - if err != nil { - return nil, err - } - if addr2 == nil { - return nil, errors.New("expected address after ','") - } - cmd.addr2 = addr2 - } - - p.skipSpaces() - - // Check for negation. - if p.pos < len(p.input) && p.input[p.pos] == '!' 
{ - cmd.negated = true - p.pos++ - p.skipSpaces() - } - - if p.pos >= len(p.input) { - return nil, errors.New("missing command") - } - - ch := p.input[p.pos] - p.pos++ - - switch ch { - case 's': - return p.parseSubstitute(cmd) - case 'y': - return p.parseTransliterate(cmd) - case 'p': - cmd.kind = cmdPrint - case 'P': - cmd.kind = cmdPrintFirstLine - case 'd': - cmd.kind = cmdDelete - case 'D': - cmd.kind = cmdDeleteFirstLine - case 'q': - cmd.kind = cmdQuit - cmd.quitCode = p.parseOptionalExitCode() - case 'Q': - cmd.kind = cmdQuitNoprint - cmd.quitCode = p.parseOptionalExitCode() - case 'a': - cmd.kind = cmdAppend - cmd.text = p.parseTextArg() - case 'i': - cmd.kind = cmdInsert - cmd.text = p.parseTextArg() - case 'c': - cmd.kind = cmdChange - cmd.text = p.parseTextArg() - case '=': - cmd.kind = cmdLineNum - case 'l': - cmd.kind = cmdPrintUnambig - case 'n': - cmd.kind = cmdNext - case 'N': - cmd.kind = cmdNextAppend - case 'h': - cmd.kind = cmdHoldCopy - case 'H': - cmd.kind = cmdHoldAppend - case 'g': - cmd.kind = cmdGetCopy - case 'G': - cmd.kind = cmdGetAppend - case 'x': - cmd.kind = cmdExchange - case 'b': - cmd.kind = cmdBranch - cmd.label = p.parseLabelArg() - case 't': - cmd.kind = cmdBranchIfSub - cmd.label = p.parseLabelArg() - case 'T': - cmd.kind = cmdBranchIfNoSub - cmd.label = p.parseLabelArg() - case ':': - cmd.kind = cmdLabel - cmd.label = p.parseLabelArg() - if cmd.label == "" { - return nil, errors.New("missing label name for ':'") - } - case '{': - children, err := p.parseCommands(true) - if err != nil { - return nil, err - } - cmd.kind = cmdGroup - cmd.children = children - case 'e': - return nil, errors.New("'e' command is blocked: command execution is not allowed") - case 'w': - return nil, errors.New("'w' command is blocked: file writing is not allowed") - case 'W': - return nil, errors.New("'W' command is blocked: file writing is not allowed") - case 'r': - return nil, errors.New("'r' command is blocked: unsandboxed file reading is not 
allowed") - case 'R': - return nil, errors.New("'R' command is blocked: unsandboxed file reading is not allowed") - default: - return nil, errors.New("unknown command: '" + string(ch) + "'") - } - - return cmd, nil -} - -func (p *parser) parseOptionalExitCode() uint8 { - p.skipSpaces() - start := p.pos - for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { - p.pos++ - } - if start == p.pos { - return 0 - } - n, err := strconv.Atoi(p.input[start:p.pos]) - if err != nil || n < 0 || n > 255 { - return 0 - } - return uint8(n) -} - -func (p *parser) parseTextArg() string { - // GNU sed allows: a\text, a text, or a\text - if p.pos < len(p.input) && p.input[p.pos] == '\\' { - p.pos++ - if p.pos < len(p.input) && p.input[p.pos] == '\n' { - p.pos++ // consume newline after backslash - } - } else { - p.skipSpaces() - } - start := p.pos - for p.pos < len(p.input) && p.input[p.pos] != '\n' && p.input[p.pos] != ';' { - p.pos++ - } - return p.input[start:p.pos] -} - -func (p *parser) parseLabelArg() string { - p.skipSpaces() - start := p.pos - for p.pos < len(p.input) && p.input[p.pos] != ' ' && p.input[p.pos] != '\t' && - p.input[p.pos] != '\n' && p.input[p.pos] != ';' && p.input[p.pos] != '}' { - p.pos++ - } - return p.input[start:p.pos] -} - -func (p *parser) parseAddress() (*address, error) { - if p.pos >= len(p.input) { - return nil, nil - } - - ch := p.input[p.pos] - - // Line number. - if ch >= '0' && ch <= '9' { - start := p.pos - for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { - p.pos++ - } - // Check for first~step syntax. 
- if p.pos < len(p.input) && p.input[p.pos] == '~' { - first, err := strconv.ParseInt(p.input[start:p.pos], 10, 64) - if err != nil { - return nil, errors.New("invalid address: " + p.input[start:p.pos]) - } - p.pos++ // consume '~' - stepStart := p.pos - for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { - p.pos++ - } - step, err := strconv.ParseInt(p.input[stepStart:p.pos], 10, 64) - if err != nil || step <= 0 { - return nil, errors.New("invalid step in address") - } - return &address{kind: addrStep, first: first, step: step}, nil - } - n, err := strconv.ParseInt(p.input[start:p.pos], 10, 64) - if err != nil { - return nil, errors.New("invalid line number: " + p.input[start:p.pos]) - } - return &address{kind: addrLine, line: n}, nil - } - - // Last line. - if ch == '$' { - p.pos++ - return &address{kind: addrLast}, nil - } - - // Regex address. - if ch == '/' || ch == '\\' { - var delim byte - if ch == '\\' { - p.pos++ // consume '\' - if p.pos >= len(p.input) { - return nil, errors.New("expected delimiter after '\\'") - } - delim = p.input[p.pos] - } else { - delim = '/' - } - p.pos++ // consume delimiter - pattern, err := p.readUntilDelimiter(delim) - if err != nil { - return nil, err - } - re, err := p.compileRegex(pattern) - if err != nil { - return nil, err - } - return &address{kind: addrRegexp, re: re}, nil - } - - return nil, nil -} - -func (p *parser) readUntilDelimiter(delim byte) (string, error) { - var sb strings.Builder - for p.pos < len(p.input) { - ch := p.input[p.pos] - if ch == '\\' && p.pos+1 < len(p.input) { - next := p.input[p.pos+1] - if next == delim { - sb.WriteByte(delim) - p.pos += 2 - continue - } - sb.WriteByte('\\') - sb.WriteByte(next) - p.pos += 2 - continue - } - if ch == delim { - p.pos++ // consume closing delimiter - return sb.String(), nil - } - sb.WriteByte(ch) - p.pos++ - } - return "", errors.New("unterminated address regex") -} - -func (p *parser) parseSubstitute(cmd *sedCmd) (*sedCmd, error) { - if 
p.pos >= len(p.input) { - return nil, errors.New("missing delimiter for 's' command") - } - delim := p.input[p.pos] - if delim == '\\' || delim == '\n' { - return nil, errors.New("invalid delimiter for 's' command: '" + string(delim) + "'") - } - p.pos++ // consume delimiter - - // Read pattern. - pattern, err := p.readSubstPart(delim) - if err != nil { - return nil, errors.New("unterminated 's' command: " + err.Error()) - } - - // Read replacement. - replacement, err := p.readSubstPart(delim) - if err != nil { - return nil, errors.New("unterminated 's' command: " + err.Error()) - } - - // Read flags. - cmd.kind = cmdSubstitute - cmd.subReplacement = replacement - caseInsensitive := false - - for p.pos < len(p.input) { - ch := p.input[p.pos] - switch ch { - case 'g': - cmd.subGlobal = true - p.pos++ - case 'p': - cmd.subPrint = true - p.pos++ - case 'i', 'I': - caseInsensitive = true - p.pos++ - case 'w': - return nil, errors.New("'w' flag in 's' command is blocked: file writing is not allowed") - case 'e': - return nil, errors.New("'e' flag in 's' command is blocked: command execution is not allowed") - default: - if ch >= '1' && ch <= '9' { - start := p.pos - for p.pos < len(p.input) && p.input[p.pos] >= '0' && p.input[p.pos] <= '9' { - p.pos++ - } - n, err := strconv.Atoi(p.input[start:p.pos]) - if err != nil || n <= 0 { - return nil, errors.New("invalid substitution occurrence number") - } - cmd.subNth = n - continue - } - // Any other character ends the flag list. - goto flagsDone - } - } -flagsDone: - - re, err := p.compileRegex(pattern) - if err != nil { - return nil, err - } - // Apply case-insensitive flag after BRE-to-ERE conversion so (?i) isn't mangled. 
- if caseInsensitive { - re, err = regexp.Compile("(?i)" + re.String()) - if err != nil { - return nil, errors.New("invalid regex with case-insensitive flag: " + err.Error()) - } - } - cmd.subRe = re - return cmd, nil -} - -func (p *parser) readSubstPart(delim byte) (string, error) { - var sb strings.Builder - for p.pos < len(p.input) { - ch := p.input[p.pos] - if ch == '\\' && p.pos+1 < len(p.input) { - next := p.input[p.pos+1] - if next == delim { - sb.WriteByte(delim) - p.pos += 2 - continue - } - if next == 'n' { - sb.WriteByte('\n') - p.pos += 2 - continue - } - if next == 't' { - sb.WriteByte('\t') - p.pos += 2 - continue - } - sb.WriteByte('\\') - sb.WriteByte(next) - p.pos += 2 - continue - } - if ch == delim { - p.pos++ // consume closing delimiter - return sb.String(), nil - } - sb.WriteByte(ch) - p.pos++ - } - return sb.String(), nil -} - -func (p *parser) parseTransliterate(cmd *sedCmd) (*sedCmd, error) { - if p.pos >= len(p.input) { - return nil, errors.New("missing delimiter for 'y' command") - } - delim := p.input[p.pos] - p.pos++ - - srcStr, err := p.readSubstPart(delim) - if err != nil { - return nil, err - } - dstStr, err := p.readSubstPart(delim) - if err != nil { - return nil, err - } - - src := []rune(srcStr) - dst := []rune(dstStr) - if len(src) != len(dst) { - return nil, errors.New("'y' command: source and destination must have the same length") - } - - cmd.kind = cmdTransliterate - cmd.transFrom = src - cmd.transTo = dst - return cmd, nil -} - -// compileRegex compiles a regex pattern, converting BRE to ERE if needed. -func (p *parser) compileRegex(pattern string) (*regexp.Regexp, error) { - if !p.useERE { - pattern = breToERE(pattern) - } - re, err := regexp.Compile(pattern) - if err != nil { - return nil, errors.New("invalid regex: " + err.Error()) - } - return re, nil -} - -// breToERE converts a basic regular expression to an extended one. -// In BRE: \( \) \{ \} \+ \? are special; ( ) { } + ? are literal. -// In ERE: ( ) { } + ? 
are special; \( \) etc. are literal. -func breToERE(pattern string) string { - var sb strings.Builder - sb.Grow(len(pattern)) - i := 0 - for i < len(pattern) { - if pattern[i] == '\\' && i+1 < len(pattern) { - next := pattern[i+1] - switch next { - case '(', ')', '{', '}', '+', '?', '|': - // BRE escaped special → ERE unescaped special. - sb.WriteByte(next) - i += 2 - default: - // Includes backreferences (\1-\9) which RE2 doesn't support - // but are passed through unchanged. - sb.WriteByte('\\') - sb.WriteByte(next) - i += 2 - } - } else { - ch := pattern[i] - switch ch { - case '(', ')', '{', '}', '+', '?', '|': - // In BRE these are literal; escape them for ERE. - sb.WriteByte('\\') - sb.WriteByte(ch) - default: - sb.WriteByte(ch) - } - i++ - } - } - return sb.String() -} - -// --- Execution Engine --- - -// engine holds the state for executing a sed script. -type engine struct { - callCtx *builtins.CallContext - prog []*sedCmd - suppressPrint bool - lineNum int64 - lastLine bool - patternSpace string - holdSpace string - appendQueue []string // text queued by 'a' command, flushed after auto-print - subMade bool // set when s/// succeeds (cleared on new input line) - totalRead int64 - isRegularFile bool -} - -// lineReader wraps a scanner with one-line look-ahead so we can determine -// whether the current line is the last one, while still allowing n/N commands -// to consume lines from the same scanner. 
-type lineReader struct { - sc *bufio.Scanner - nextLine string - hasNext bool - totalRead int64 - isRegularFile bool -} - -func newLineReader(sc *bufio.Scanner, isRegular bool) *lineReader { - lr := &lineReader{sc: sc, isRegularFile: isRegular} - lr.advance() // prime the look-ahead - return lr -} - -func (lr *lineReader) advance() bool { - if lr.sc.Scan() { - lr.nextLine = lr.sc.Text() - lr.totalRead += int64(len(lr.sc.Bytes())) - lr.hasNext = true - return true - } - lr.hasNext = false - return false -} - -func (lr *lineReader) readLine() (string, bool) { - if !lr.hasNext { - return "", false - } - line := lr.nextLine - lr.advance() - return line, true -} - -func (lr *lineReader) isLast() bool { - return !lr.hasNext -} - -func (lr *lineReader) checkLimit() error { - if !lr.isRegularFile && lr.totalRead > MaxTotalReadBytes { - return errors.New("input too large: read limit exceeded") - } - return nil -} - -func (eng *engine) processFile(ctx context.Context, callCtx *builtins.CallContext, file string) error { - var rc io.ReadCloser - if file == "-" { - if callCtx.Stdin == nil { - return nil - } - eng.isRegularFile = isRegularFile(callCtx.Stdin) - rc = io.NopCloser(callCtx.Stdin) - } else { - f, err := callCtx.OpenFile(ctx, file, os.O_RDONLY, 0) - if err != nil { - return err - } - defer f.Close() - eng.isRegularFile = isRegularFile(f) - rc = f - } - - sc := bufio.NewScanner(rc) - buf := make([]byte, 4096) - sc.Buffer(buf, MaxLineBytes) - - lr := newLineReader(sc, eng.isRegularFile) - - for { - if ctx.Err() != nil { - return ctx.Err() - } - - line, ok := lr.readLine() - if !ok { - break - } - if err := lr.checkLimit(); err != nil { - return err - } - - eng.lineNum++ - eng.patternSpace = line - eng.lastLine = lr.isLast() - - err := eng.runCycle(ctx, lr) - if err != nil { - return err - } - } - - if err := sc.Err(); err != nil { - return err - } - return nil -} - -// runCycle executes the script for the current input line. 
-func (eng *engine) runCycle(ctx context.Context, lr *lineReader) error { - eng.subMade = false - eng.appendQueue = eng.appendQueue[:0] - action, err := eng.execCommandsFrom(ctx, 0, lr, 0) - if err != nil { - return err - } - if action != actionDelete && !eng.suppressPrint { - eng.callCtx.Outf("%s\n", eng.patternSpace) - } - // Flush queued 'a' text after auto-print (even if auto-print was suppressed or deleted). - for _, text := range eng.appendQueue { - eng.callCtx.Outf("%s\n", text) - } - return nil -} - -// actionType signals how to proceed after executing a command. -type actionType int - -const ( - actionContinue actionType = iota - actionDelete // d/D command: skip auto-print, start next cycle -) - -// execCommandsFrom executes commands starting from index startIdx in the given -// command list. For branching, it always searches the full eng.prog for labels -// and restarts from there to handle backward branches correctly. -func (eng *engine) execCommandsFrom(ctx context.Context, startIdx int, lr *lineReader, depth int) (actionType, error) { - return eng.execCmds(ctx, eng.prog, startIdx, lr, depth) -} - -func (eng *engine) execCmds(ctx context.Context, cmds []*sedCmd, startIdx int, lr *lineReader, depth int) (actionType, error) { - if depth > MaxBranchIterations { - return actionContinue, errors.New("branch loop limit exceeded") - } - - for i := startIdx; i < len(cmds); i++ { - if ctx.Err() != nil { - return actionContinue, ctx.Err() - } - - cmd := cmds[i] - - if cmd.kind == cmdLabel { - continue - } - - if !eng.addressMatch(cmd) { - continue - } - - switch cmd.kind { - case cmdSubstitute: - if err := eng.execSubstitute(cmd); err != nil { - return actionContinue, err - } - - case cmdPrint: - eng.callCtx.Outf("%s\n", eng.patternSpace) - - case cmdDelete: - return actionDelete, nil - - case cmdPrintFirstLine: - if idx := strings.IndexByte(eng.patternSpace, '\n'); idx >= 0 { - eng.callCtx.Outf("%s\n", eng.patternSpace[:idx]) - } else { - 
eng.callCtx.Outf("%s\n", eng.patternSpace) - } - - case cmdDeleteFirstLine: - if idx := strings.IndexByte(eng.patternSpace, '\n'); idx >= 0 { - eng.patternSpace = eng.patternSpace[idx+1:] - // Restart the cycle with the remaining pattern space. - eng.subMade = false - eng.appendQueue = eng.appendQueue[:0] - return eng.execCommandsFrom(ctx, 0, lr, depth+1) - } - return actionDelete, nil - - case cmdQuit: - if !eng.suppressPrint { - eng.callCtx.Outf("%s\n", eng.patternSpace) - } - return actionContinue, &quitError{code: cmd.quitCode} - - case cmdQuitNoprint: - return actionContinue, &quitError{code: cmd.quitCode} - - case cmdTransliterate: - eng.patternSpace = eng.transliterate(eng.patternSpace, cmd.transFrom, cmd.transTo) - - case cmdAppend: - eng.appendQueue = append(eng.appendQueue, cmd.text) - - case cmdInsert: - eng.callCtx.Outf("%s\n", cmd.text) - - case cmdChange: - eng.callCtx.Outf("%s\n", cmd.text) - return actionDelete, nil - - case cmdLineNum: - eng.callCtx.Outf("%d\n", eng.lineNum) - - case cmdPrintUnambig: - eng.printUnambiguous() - - case cmdNext: - if !eng.suppressPrint { - eng.callCtx.Outf("%s\n", eng.patternSpace) - } - for _, text := range eng.appendQueue { - eng.callCtx.Outf("%s\n", text) - } - eng.appendQueue = eng.appendQueue[:0] - line, ok := lr.readLine() - if ok { - if err := lr.checkLimit(); err != nil { - return actionContinue, err - } - eng.lineNum++ - eng.patternSpace = line - eng.lastLine = lr.isLast() - } else { - eng.lastLine = true - return actionContinue, nil - } - - case cmdNextAppend: - line, ok := lr.readLine() - if ok { - if err := lr.checkLimit(); err != nil { - return actionContinue, err - } - eng.lineNum++ - if len(eng.patternSpace)+1+len(line) > MaxSpaceBytes { - return actionContinue, errors.New("pattern space exceeded size limit") - } - eng.patternSpace += "\n" + line - eng.lastLine = lr.isLast() - } else { - if !eng.suppressPrint { - eng.callCtx.Outf("%s\n", eng.patternSpace) - } - return actionDelete, nil - } - - case 
cmdHoldCopy: - eng.holdSpace = eng.patternSpace - - case cmdHoldAppend: - if len(eng.holdSpace)+1+len(eng.patternSpace) > MaxSpaceBytes { - return actionContinue, errors.New("hold space exceeded size limit") - } - eng.holdSpace += "\n" + eng.patternSpace - - case cmdGetCopy: - eng.patternSpace = eng.holdSpace - - case cmdGetAppend: - if len(eng.patternSpace)+1+len(eng.holdSpace) > MaxSpaceBytes { - return actionContinue, errors.New("pattern space exceeded size limit") - } - eng.patternSpace += "\n" + eng.holdSpace - - case cmdExchange: - eng.patternSpace, eng.holdSpace = eng.holdSpace, eng.patternSpace - - case cmdBranch: - return eng.branchTo(ctx, cmd.label, lr, depth) - - case cmdBranchIfSub: - if eng.subMade { - eng.subMade = false - return eng.branchTo(ctx, cmd.label, lr, depth) - } - - case cmdBranchIfNoSub: - if !eng.subMade { - return eng.branchTo(ctx, cmd.label, lr, depth) - } - - case cmdGroup: - action, err := eng.execCmds(ctx, cmd.children, 0, lr, depth) - if err != nil || action != actionContinue { - return action, err - } - - case cmdNoop, cmdLabel: - // Do nothing. - } - } - - return actionContinue, nil -} - -func findLabel(cmds []*sedCmd, label string) int { - for i, cmd := range cmds { - if cmd.kind == cmdLabel && cmd.label == label { - return i - } - if cmd.kind == cmdGroup { - // Labels inside groups are visible from the top level in GNU sed. - if idx := findLabel(cmd.children, label); idx >= 0 { - // Return the group's index since we can't index into children from here. - return i - } - } - } - return -1 -} - -// branchTo resolves a label and continues execution from the command after it. -// An empty label branches to end of script (returns actionContinue). 
-func (eng *engine) branchTo(ctx context.Context, label string, lr *lineReader, depth int) (actionType, error) { - if label == "" { - return actionContinue, nil - } - target := findLabel(eng.prog, label) - if target < 0 { - return actionContinue, errors.New("undefined label '" + label + "'") - } - return eng.execCmds(ctx, eng.prog, target+1, lr, depth+1) -} - -// addressMatch checks whether the current line matches the command's address. -func (eng *engine) addressMatch(cmd *sedCmd) bool { - match := eng.rawAddressMatch(cmd) - if cmd.negated { - return !match - } - return match -} - -func (eng *engine) rawAddressMatch(cmd *sedCmd) bool { - if cmd.addr1 == nil { - return true // no address means match all - } - - if cmd.addr2 == nil { - // Single address. - return eng.matchAddr(cmd.addr1) - } - - // Two-address range: match from addr1 to addr2 inclusive. - // We use a simple approach: check if current line is >= addr1 and <= addr2. - // For regex addresses, this is more complex. We use a stateful approach - // via the command itself to track whether we're inside the range. - return eng.matchRange(cmd) -} - -func (eng *engine) matchAddr(addr *address) bool { - switch addr.kind { - case addrLine: - return eng.lineNum == addr.line - case addrLast: - return eng.lastLine - case addrRegexp: - return addr.re.MatchString(eng.patternSpace) - case addrStep: - if addr.first == 0 { - return eng.lineNum%addr.step == 0 - } - return eng.lineNum >= addr.first && (eng.lineNum-addr.first)%addr.step == 0 - } - return false -} - -func (eng *engine) matchRange(cmd *sedCmd) bool { - if cmd.inRange { - // We're inside the range. Check if addr2 closes it. - if eng.matchAddr(cmd.addr2) { - cmd.inRange = false - return true // addr2 line is still part of the range - } - return true - } - // Not in range — check if addr1 opens it. - if eng.matchAddr(cmd.addr1) { - // Check if addr2 also matches on the same line (degenerate range). 
- if eng.matchAddr(cmd.addr2) { - return true // one-line range, don't enter inRange state - } - cmd.inRange = true - return true - } - return false -} - -func (eng *engine) execSubstitute(cmd *sedCmd) error { - var result string - if cmd.subGlobal { - result = cmd.subRe.ReplaceAllString(eng.patternSpace, expandReplacement(cmd.subReplacement)) - } else if cmd.subNth > 0 { - count := 0 - result = cmd.subRe.ReplaceAllStringFunc(eng.patternSpace, func(match string) string { - count++ - if count == cmd.subNth { - return cmd.subRe.ReplaceAllString(match, expandReplacement(cmd.subReplacement)) - } - return match - }) - } else { - loc := cmd.subRe.FindStringIndex(eng.patternSpace) - if loc != nil { - matched := eng.patternSpace[loc[0]:loc[1]] - replacement := cmd.subRe.ReplaceAllString(matched, expandReplacement(cmd.subReplacement)) - result = eng.patternSpace[:loc[0]] + replacement + eng.patternSpace[loc[1]:] - } else { - return nil - } - } - if result != eng.patternSpace { - if len(result) > MaxSpaceBytes { - return errors.New("pattern space exceeded size limit") - } - eng.subMade = true - eng.patternSpace = result - if cmd.subPrint { - eng.callCtx.Outf("%s\n", eng.patternSpace) - } - } - return nil -} - -// expandReplacement converts sed replacement syntax to Go regexp replacement. -// In sed, & means the whole match. In Go regexp, that's ${0} or $0. -// Sed uses \1-\9 for groups, Go uses $1-$9. 
-func expandReplacement(repl string) string { - var sb strings.Builder - sb.Grow(len(repl)) - for i := 0; i < len(repl); i++ { - ch := repl[i] - if ch == '&' { - sb.WriteString("${0}") - } else if ch == '\\' && i+1 < len(repl) { - next := repl[i+1] - if next >= '1' && next <= '9' { - sb.WriteByte('$') - sb.WriteByte(next) - i++ - } else if next == '&' { - sb.WriteByte('&') - i++ - } else if next == '\\' { - sb.WriteByte('\\') - i++ - } else if next == 'n' { - sb.WriteByte('\n') - i++ - } else if next == 't' { - sb.WriteByte('\t') - i++ - } else { - sb.WriteByte('\\') - sb.WriteByte(next) - i++ - } - } else { - sb.WriteByte(ch) - } - } - return sb.String() -} - -func (eng *engine) transliterate(s string, from, to []rune) string { - runes := []rune(s) - for i, r := range runes { - for j, fr := range from { - if r == fr { - runes[i] = to[j] - break - } - } - } - return string(runes) -} - -func (eng *engine) printUnambiguous() { - // l command: print pattern space showing non-printing characters. - var sb strings.Builder - col := 0 - for _, r := range eng.patternSpace { - var s string - switch { - case r == '\\': - s = "\\\\" - case r == '\a': - s = "\\a" - case r == '\b': - s = "\\b" - case r == '\f': - s = "\\f" - case r == '\r': - s = "\\r" - case r == '\t': - s = "\\t" - case r == '\n': - s = "\\n" - case r < 32 || r == 127: - s = fmt.Sprintf("\\%03o", r) - default: - s = string(r) - } - if col+len(s) >= 70 { - sb.WriteString("\\\n") - col = 0 - } - sb.WriteString(s) - col += len(s) - } - sb.WriteByte('$') - sb.WriteByte('\n') - eng.callCtx.Out(sb.String()) -} - -// isRegularFile checks whether an io.Reader is backed by a regular file. 
-func isRegularFile(r any) bool { - type stater interface{ Stat() (os.FileInfo, error) } - sf, ok := r.(stater) - if !ok { - return false - } - fi, err := sf.Stat() - return err == nil && fi.Mode().IsRegular() -} diff --git a/interp/builtins/sed/types.go b/interp/builtins/sed/types.go new file mode 100644 index 00000000..f1420794 --- /dev/null +++ b/interp/builtins/sed/types.go @@ -0,0 +1,120 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package sed + +import ( + "fmt" + "regexp" +) + +// --- Error types --- + +// quitError signals a q or Q command with an exit code. +type quitError struct { + code uint8 +} + +func (e *quitError) Error() string { + return fmt.Sprintf("quit with code %d", e.code) +} + +// --- Address types --- + +// addrType distinguishes different address kinds. +type addrType int + +const ( + addrNone addrType = iota + addrLine // specific line number + addrLast // $ (last line) + addrRegexp // /regex/ + addrStep // first~step (GNU extension) +) + +// address represents a sed address (line number, regex, or $). +type address struct { + kind addrType + line int64 // for addrLine + re *regexp.Regexp // for addrRegexp + first int64 // for addrStep + step int64 // for addrStep +} + +// --- Command types --- + +// cmdType identifies the sed command. 
+type cmdType int + +const ( + cmdSubstitute cmdType = iota + cmdPrint + cmdDelete + cmdQuit + cmdQuitNoprint + cmdTransliterate + cmdAppend + cmdInsert + cmdChange + cmdLineNum + cmdPrintUnambig + cmdNext + cmdNextAppend + cmdHoldCopy + cmdHoldAppend + cmdGetCopy + cmdGetAppend + cmdExchange + cmdBranch + cmdLabel + cmdBranchIfSub + cmdBranchIfNoSub + cmdPrintFirstLine // P: print up to first embedded newline + cmdDeleteFirstLine // D: delete up to first embedded newline, restart cycle + cmdGroup + cmdNoop +) + +// sedCmd represents a single parsed sed command. +type sedCmd struct { + addr1 *address + addr2 *address + negated bool + inRange bool // stateful: tracks whether we're inside a two-address range + kind cmdType + + // For s command: + subRe *regexp.Regexp + subReplacement string + subGlobal bool + subPrint bool + subNth int + + // For y command: + transFrom []rune + transTo []rune + + // For a, i, c commands: + text string + + // For q, Q commands: + quitCode uint8 + + // For b, t, T commands: + label string + + // For { ... } grouping: + children []*sedCmd +} + +// --- Action types --- + +// actionType signals how to proceed after executing a command. 
+type actionType int + +const ( + actionContinue actionType = iota + actionDelete // d/D command: skip auto-print, start next cycle +) From 335f784d11dc2af07cce51b3dd11536bab2a5dad Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 00:55:38 +0100 Subject: [PATCH 03/30] Add 29 sed test scenarios covering P, D, T, groups, flags, escapes, negation, and blocked commands Co-Authored-By: Claude Opus 4.6 --- .../cmd/sed/address/degenerate_range.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/address/mixed_range.yaml | 13 +++++++++++++ .../scenarios/cmd/sed/address/negation_delete.yaml | 13 +++++++++++++ .../scenarios/cmd/sed/address/negation_print.yaml | 13 +++++++++++++ .../scenarios/cmd/sed/address/negation_range.yaml | 13 +++++++++++++ .../scenarios/cmd/sed/branch/not_substituted.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/delete/first_line.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/errors/blocked_R_cmd.yaml | 9 +++++++++ tests/scenarios/cmd/sed/errors/blocked_W_cmd.yaml | 9 +++++++++ tests/scenarios/cmd/sed/errors/blocked_e_flag.yaml | 9 +++++++++ .../cmd/sed/errors/blocked_write_cmd.yaml | 9 +++++++++ tests/scenarios/cmd/sed/flags/extended_E.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/flags/extended_r.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/flags/help.yaml | 9 +++++++++ tests/scenarios/cmd/sed/group/basic.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/group/nested.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/group/with_address.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/multiple/comments.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/multiple/same_address.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/print/first_line.yaml | 13 +++++++++++++ .../cmd/sed/substitute/combined_flags.yaml | 13 +++++++++++++ .../cmd/sed/substitute/empty_replacement.yaml | 13 +++++++++++++ .../cmd/sed/substitute/escape_ampersand.yaml | 13 +++++++++++++ .../cmd/sed/substitute/escape_backslash.yaml | 13 +++++++++++++ 
.../cmd/sed/substitute/escape_newline.yaml | 13 +++++++++++++ .../cmd/sed/substitute/escape_sequences.yaml | 14 ++++++++++++++ .../scenarios/cmd/sed/substitute/global_print.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/substitute/nth_print.yaml | 13 +++++++++++++ tests/scenarios/cmd/sed/substitute/print_flag.yaml | 13 +++++++++++++ 29 files changed, 358 insertions(+) create mode 100644 tests/scenarios/cmd/sed/address/degenerate_range.yaml create mode 100644 tests/scenarios/cmd/sed/address/mixed_range.yaml create mode 100644 tests/scenarios/cmd/sed/address/negation_delete.yaml create mode 100644 tests/scenarios/cmd/sed/address/negation_print.yaml create mode 100644 tests/scenarios/cmd/sed/address/negation_range.yaml create mode 100644 tests/scenarios/cmd/sed/branch/not_substituted.yaml create mode 100644 tests/scenarios/cmd/sed/delete/first_line.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_R_cmd.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_W_cmd.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_e_flag.yaml create mode 100644 tests/scenarios/cmd/sed/errors/blocked_write_cmd.yaml create mode 100644 tests/scenarios/cmd/sed/flags/extended_E.yaml create mode 100644 tests/scenarios/cmd/sed/flags/extended_r.yaml create mode 100644 tests/scenarios/cmd/sed/flags/help.yaml create mode 100644 tests/scenarios/cmd/sed/group/basic.yaml create mode 100644 tests/scenarios/cmd/sed/group/nested.yaml create mode 100644 tests/scenarios/cmd/sed/group/with_address.yaml create mode 100644 tests/scenarios/cmd/sed/multiple/comments.yaml create mode 100644 tests/scenarios/cmd/sed/multiple/same_address.yaml create mode 100644 tests/scenarios/cmd/sed/print/first_line.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/combined_flags.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/empty_replacement.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/escape_ampersand.yaml create mode 100644 
tests/scenarios/cmd/sed/substitute/escape_backslash.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/escape_newline.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/escape_sequences.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/global_print.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/nth_print.yaml create mode 100644 tests/scenarios/cmd/sed/substitute/print_flag.yaml diff --git a/tests/scenarios/cmd/sed/address/degenerate_range.yaml b/tests/scenarios/cmd/sed/address/degenerate_range.yaml new file mode 100644 index 00000000..746d2962 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/degenerate_range.yaml @@ -0,0 +1,13 @@ +description: A degenerate range where addr1 and addr2 match the same line is treated as a one-line match. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2,2s/line/LINE/' input.txt +expect: + stdout: "line1\nLINE2\nline3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/mixed_range.yaml b/tests/scenarios/cmd/sed/address/mixed_range.yaml new file mode 100644 index 00000000..df8cbb8c --- /dev/null +++ b/tests/scenarios/cmd/sed/address/mixed_range.yaml @@ -0,0 +1,13 @@ +description: Address ranges can mix different types like regex and line number. +setup: + files: + - path: input.txt + content: "start\nkeep1\nkeep2\nstop\nignore\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/start/,3s/^/>> /' input.txt +expect: + stdout: ">> start\n>> keep1\n>> keep2\nstop\nignore\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/negation_delete.yaml b/tests/scenarios/cmd/sed/address/negation_delete.yaml new file mode 100644 index 00000000..7b1b4dfe --- /dev/null +++ b/tests/scenarios/cmd/sed/address/negation_delete.yaml @@ -0,0 +1,13 @@ +description: Negation with the delete command keeps only matching lines. 
+setup: + files: + - path: input.txt + content: "keep\nremove\nkeep\nremove\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/keep/!d' input.txt +expect: + stdout: "keep\nkeep\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/negation_print.yaml b/tests/scenarios/cmd/sed/address/negation_print.yaml new file mode 100644 index 00000000..68cbf295 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/negation_print.yaml @@ -0,0 +1,13 @@ +description: Negation with the print command. +setup: + files: + - path: input.txt + content: "aaa\nbbb\nccc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n '/bbb/!p' input.txt +expect: + stdout: "aaa\nccc\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/address/negation_range.yaml b/tests/scenarios/cmd/sed/address/negation_range.yaml new file mode 100644 index 00000000..a1105ee8 --- /dev/null +++ b/tests/scenarios/cmd/sed/address/negation_range.yaml @@ -0,0 +1,13 @@ +description: Negation applied to a range address. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\nline4\nline5\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2,4!d' input.txt +expect: + stdout: "line2\nline3\nline4\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/branch/not_substituted.yaml b/tests/scenarios/cmd/sed/branch/not_substituted.yaml new file mode 100644 index 00000000..0bae5a89 --- /dev/null +++ b/tests/scenarios/cmd/sed/branch/not_substituted.yaml @@ -0,0 +1,13 @@ +description: The T command branches only if no substitution was made. 
+setup: + files: + - path: input.txt + content: "foo bar\nhello world\nfoo baz\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/FOO/; T skip; s/$/ MATCHED/; :skip' input.txt +expect: + stdout: "FOO bar MATCHED\nhello world\nFOO baz MATCHED\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/delete/first_line.yaml b/tests/scenarios/cmd/sed/delete/first_line.yaml new file mode 100644 index 00000000..0584d265 --- /dev/null +++ b/tests/scenarios/cmd/sed/delete/first_line.yaml @@ -0,0 +1,13 @@ +description: The D command deletes up to the first newline and restarts the cycle. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 'N; P; D' input.txt +expect: + stdout: "line1\nline2\nline3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/errors/blocked_R_cmd.yaml b/tests/scenarios/cmd/sed/errors/blocked_R_cmd.yaml new file mode 100644 index 00000000..f5064b10 --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_R_cmd.yaml @@ -0,0 +1,9 @@ +description: The R command is blocked for safety. +skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 'R input.txt' +expect: + stdout: "" + stderr: "sed: 'R' command is blocked: unsandboxed file reading is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/blocked_W_cmd.yaml b/tests/scenarios/cmd/sed/errors/blocked_W_cmd.yaml new file mode 100644 index 00000000..831847db --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_W_cmd.yaml @@ -0,0 +1,9 @@ +description: The W command is blocked for safety. 
+skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 'W output.txt' +expect: + stdout: "" + stderr: "sed: 'W' command is blocked: file writing is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/blocked_e_flag.yaml b/tests/scenarios/cmd/sed/errors/blocked_e_flag.yaml new file mode 100644 index 00000000..665114f5 --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_e_flag.yaml @@ -0,0 +1,9 @@ +description: The e flag in s command is blocked for safety. +skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 's/test/replaced/e' +expect: + stdout: "" + stderr: "sed: 'e' flag in 's' command is blocked: command execution is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/errors/blocked_write_cmd.yaml b/tests/scenarios/cmd/sed/errors/blocked_write_cmd.yaml new file mode 100644 index 00000000..77967c58 --- /dev/null +++ b/tests/scenarios/cmd/sed/errors/blocked_write_cmd.yaml @@ -0,0 +1,9 @@ +description: The w command is blocked for safety. +skip_assert_against_bash: true +input: + script: |+ + echo "test" | sed 'w output.txt' +expect: + stdout: "" + stderr: "sed: 'w' command is blocked: file writing is not allowed\n" + exit_code: 1 diff --git a/tests/scenarios/cmd/sed/flags/extended_E.yaml b/tests/scenarios/cmd/sed/flags/extended_E.yaml new file mode 100644 index 00000000..a0b79514 --- /dev/null +++ b/tests/scenarios/cmd/sed/flags/extended_E.yaml @@ -0,0 +1,13 @@ +description: The -E flag enables extended regular expressions. 
+setup: + files: + - path: input.txt + content: "foo123bar\nhello456world\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -E 's/[0-9]+/NUM/' input.txt +expect: + stdout: "fooNUMbar\nhelloNUMworld\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/flags/extended_r.yaml b/tests/scenarios/cmd/sed/flags/extended_r.yaml new file mode 100644 index 00000000..5a104952 --- /dev/null +++ b/tests/scenarios/cmd/sed/flags/extended_r.yaml @@ -0,0 +1,13 @@ +description: The -r flag is a GNU alias for -E (extended regular expressions). +setup: + files: + - path: input.txt + content: "abc123def\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -r 's/[0-9]+/NUM/' input.txt +expect: + stdout: "abcNUMdef\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/flags/help.yaml b/tests/scenarios/cmd/sed/flags/help.yaml new file mode 100644 index 00000000..d3043440 --- /dev/null +++ b/tests/scenarios/cmd/sed/flags/help.yaml @@ -0,0 +1,9 @@ +description: The -h flag prints usage information and exits 0. +skip_assert_against_bash: true +input: + script: |+ + sed -h +expect: + stdout_contains: ["Usage: sed"] + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/group/basic.yaml b/tests/scenarios/cmd/sed/group/basic.yaml new file mode 100644 index 00000000..438dc2cf --- /dev/null +++ b/tests/scenarios/cmd/sed/group/basic.yaml @@ -0,0 +1,13 @@ +description: Group commands with braces apply multiple commands to matching lines. 
+setup: + files: + - path: input.txt + content: "apple\nbanana\ncherry\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/banana/{ s/banana/BANANA/; s/$/!/; }' input.txt +expect: + stdout: "apple\nBANANA!\ncherry\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/group/nested.yaml b/tests/scenarios/cmd/sed/group/nested.yaml new file mode 100644 index 00000000..93cf7073 --- /dev/null +++ b/tests/scenarios/cmd/sed/group/nested.yaml @@ -0,0 +1,13 @@ +description: Nested group commands work correctly. +setup: + files: + - path: input.txt + content: "foo bar\nhello world\nfoo baz\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '/foo/{ s/foo/FOO/; /bar/{ s/bar/BAR/; } }' input.txt +expect: + stdout: "FOO BAR\nhello world\nFOO baz\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/group/with_address.yaml b/tests/scenarios/cmd/sed/group/with_address.yaml new file mode 100644 index 00000000..383129a4 --- /dev/null +++ b/tests/scenarios/cmd/sed/group/with_address.yaml @@ -0,0 +1,13 @@ +description: Group commands with line address ranges. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\nline4\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2,3{ s/line/LINE/; s/$/!/; }' input.txt +expect: + stdout: "line1\nLINE2!\nLINE3!\nline4\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/multiple/comments.yaml b/tests/scenarios/cmd/sed/multiple/comments.yaml new file mode 100644 index 00000000..ad9a5311 --- /dev/null +++ b/tests/scenarios/cmd/sed/multiple/comments.yaml @@ -0,0 +1,13 @@ +description: Comments in sed scripts are ignored. 
+setup: + files: + - path: input.txt + content: "hello\nworld\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -e '# This is a comment' -e 's/hello/HELLO/' input.txt +expect: + stdout: "HELLO\nworld\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/multiple/same_address.yaml b/tests/scenarios/cmd/sed/multiple/same_address.yaml new file mode 100644 index 00000000..9e29bf87 --- /dev/null +++ b/tests/scenarios/cmd/sed/multiple/same_address.yaml @@ -0,0 +1,13 @@ +description: Multiple commands targeting the same address are applied in sequence. +setup: + files: + - path: input.txt + content: "aaa\nbbb\nccc\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed '2s/bbb/BBB/; 2s/BBB/xxx/' input.txt +expect: + stdout: "aaa\nxxx\nccc\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/print/first_line.yaml b/tests/scenarios/cmd/sed/print/first_line.yaml new file mode 100644 index 00000000..bed9574c --- /dev/null +++ b/tests/scenarios/cmd/sed/print/first_line.yaml @@ -0,0 +1,13 @@ +description: The P command prints up to the first embedded newline in the pattern space. +setup: + files: + - path: input.txt + content: "line1\nline2\nline3\nline4\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n 'N; P' input.txt +expect: + stdout: "line1\nline3\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/combined_flags.yaml b/tests/scenarios/cmd/sed/substitute/combined_flags.yaml new file mode 100644 index 00000000..41275830 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/combined_flags.yaml @@ -0,0 +1,13 @@ +description: Substitute flags can be combined (e.g., gI for global case-insensitive). 
+setup: + files: + - path: input.txt + content: "Foo foo FOO\nBar BAR bar\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/X/gI' input.txt +expect: + stdout: "X X X\nBar BAR bar\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/empty_replacement.yaml b/tests/scenarios/cmd/sed/substitute/empty_replacement.yaml new file mode 100644 index 00000000..be197fd9 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/empty_replacement.yaml @@ -0,0 +1,13 @@ +description: Substitution with empty replacement string deletes the match. +setup: + files: + - path: input.txt + content: "hello world\nfoo bar baz\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/world//' input.txt +expect: + stdout: "hello \nfoo bar baz\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/escape_ampersand.yaml b/tests/scenarios/cmd/sed/substitute/escape_ampersand.yaml new file mode 100644 index 00000000..36692823 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/escape_ampersand.yaml @@ -0,0 +1,13 @@ +description: The \& escape in replacement inserts a literal ampersand instead of the matched text. +setup: + files: + - path: input.txt + content: "foo\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/\&/' input.txt +expect: + stdout: "&\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/escape_backslash.yaml b/tests/scenarios/cmd/sed/substitute/escape_backslash.yaml new file mode 100644 index 00000000..950aa599 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/escape_backslash.yaml @@ -0,0 +1,13 @@ +description: The \\\\ escape in replacement inserts a literal backslash. 
+setup: + files: + - path: input.txt + content: "foo\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/a\\b/' input.txt +expect: + stdout: "a\\b\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/escape_newline.yaml b/tests/scenarios/cmd/sed/substitute/escape_newline.yaml new file mode 100644 index 00000000..95366e8a --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/escape_newline.yaml @@ -0,0 +1,13 @@ +description: The \n escape in replacement inserts a literal newline. +setup: + files: + - path: input.txt + content: "hello world\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/ /\n/' input.txt +expect: + stdout: "hello\nworld\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/escape_sequences.yaml b/tests/scenarios/cmd/sed/substitute/escape_sequences.yaml new file mode 100644 index 00000000..a6c235b4 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/escape_sequences.yaml @@ -0,0 +1,14 @@ +description: Replacement strings support escape sequences like \n, \t, \\, and \&. +skip_assert_against_bash: true +setup: + files: + - path: input.txt + content: "foo bar\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed 's/foo/a\tb/' input.txt +expect: + stdout: "a\tb bar\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/global_print.yaml b/tests/scenarios/cmd/sed/substitute/global_print.yaml new file mode 100644 index 00000000..4a4f11fb --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/global_print.yaml @@ -0,0 +1,13 @@ +description: Combined gp flags do global replacement and print on match. 
+setup: + files: + - path: input.txt + content: "foo foo\nbar\nfoo\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n 's/foo/FOO/gp' input.txt +expect: + stdout: "FOO FOO\nFOO\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/nth_print.yaml b/tests/scenarios/cmd/sed/substitute/nth_print.yaml new file mode 100644 index 00000000..15909f8b --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/nth_print.yaml @@ -0,0 +1,13 @@ +description: Combined Nth occurrence and print flags. +setup: + files: + - path: input.txt + content: "aaa aaa aaa\nbbb\naaa aaa\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n 's/aaa/XXX/2p' input.txt +expect: + stdout: "aaa XXX aaa\naaa XXX\n" + stderr: "" + exit_code: 0 diff --git a/tests/scenarios/cmd/sed/substitute/print_flag.yaml b/tests/scenarios/cmd/sed/substitute/print_flag.yaml new file mode 100644 index 00000000..b6b28a34 --- /dev/null +++ b/tests/scenarios/cmd/sed/substitute/print_flag.yaml @@ -0,0 +1,13 @@ +description: The p flag in substitution prints the line if a substitution was made. +setup: + files: + - path: input.txt + content: "foo\nbar\nfoo\n" +input: + allowed_paths: ["$DIR"] + script: |+ + sed -n 's/foo/FOO/p' input.txt +expect: + stdout: "FOO\nFOO\n" + stderr: "" + exit_code: 0 From 47447f88ecbc045235030e2cb347764572bd8602 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 00:57:32 +0100 Subject: [PATCH 04/30] update .claude/skills/code-review/SKILL.md --- .claude/skills/code-review/SKILL.md | 54 +++-------------------------- 1 file changed, 4 insertions(+), 50 deletions(-) diff --git a/.claude/skills/code-review/SKILL.md b/.claude/skills/code-review/SKILL.md index 84d52ae5..c9553c49 100644 --- a/.claude/skills/code-review/SKILL.md +++ b/.claude/skills/code-review/SKILL.md @@ -114,56 +114,10 @@ For every behavioral change: ### D. Test Coverage -Analyze coverage of changed code from two angles: **scenario tests** (YAML) and **Go tests**. 
Scenario tests are preferred because they also verify bash compatibility. - -#### Step 1: Inventory changed code paths - -For each changed or added function/branch/error-path, list the code path (e.g. "cut: `-f` with `--complement` and `--output-delimiter`", "error when delimiter is multi-byte"). - -#### Step 2: Check scenario test coverage (priority) - -Search `tests/scenarios/cmd/<command>/` for YAML scenarios that exercise each code path identified in Step 1. - -- **Covered** — a scenario exists whose `input.script` triggers the code path and `expect` asserts the output. -- **Partially covered** — a scenario triggers the code path but doesn't assert stderr, exit code, or an important edge case. -- **Not covered** — no scenario exercises the code path. - -Flag **not covered** and **partially covered** paths as findings. Suggest concrete YAML scenario(s) to add (including `description`, `input.script`, and expected `stdout`/`stderr`/`exit_code`). - -Scenario test conventions: -- Prefer `expect.stderr` (exact match) over `stderr_contains` -- Tests are asserted against bash by default — only use `skip_assert_against_bash: true` for intentional divergence -- Use `stdout_windows`/`stderr_windows` for platform-specific output -- If YAML scenarios are added or modified, verify they pass against bash - -#### Step 3: Check Go test coverage - -Search `interp/builtins/<command>/*_test.go` for Go tests that exercise any code paths **not already covered by scenario tests**. 
Go test types to check: - -| Test type | File pattern | What it covers | -|-----------|-------------|----------------| -| Functional | `<command>_test.go` | Core logic, argument parsing, edge cases | -| GNU compat | `<command>_gnu_compat_test.go` | Byte-for-byte output equivalence with GNU coreutils | -| Pentest | `<command>_pentest_test.go` | Security vectors (overflow, special files, resource exhaustion) | -| Platform | `<command>_{unix,windows}_test.go` | OS-specific behavior | - -Only flag missing Go tests for paths that **cannot be adequately covered by scenario tests** (e.g. internal error handling, concurrency, memory limits, platform-specific behavior, performance-sensitive paths). - -#### Step 4: Produce coverage summary - -Include a coverage table in the review output: - -```markdown -| Code path | Scenario test | Go test | Status | -|-----------|:---:|:---:|--------| -| `-f` with `--complement` | tests/scenarios/cmd/cut/complement/fields.yaml | — | Covered | -| multi-byte delimiter error | — | — | **Missing** | -| `/dev/zero` hang protection | skip (intentional divergence) | cut_pentest_test.go:45 | Covered | -``` - -Mark the overall coverage status: -- **Adequate** — all new/changed code paths are covered (scenario or Go tests) -- **Gaps found** — list missing coverage as P2 or P3 findings +- **Are new behaviors tested?** Every new code path should have a corresponding test +- **Are edge cases tested?** Empty input, boundary values, error conditions +- **YAML scenario conventions**: prefer `expect.stderr` over `stderr_contains`; tests are asserted against bash by default; use `stdout_windows`/`stderr_windows` for platform-specific output +- **Bash comparison**: if YAML scenarios are added or modified, verify they pass against bash ### E. 
Code Quality From ed64b73c9491bf7d39f7e558b642831b52d11a34 Mon Sep 17 00:00:00 2001 From: Alexandre Yang Date: Thu, 12 Mar 2026 01:04:11 +0100 Subject: [PATCH 05/30] update .claude/skills/code-review/SKILL.md --- .claude/skills/code-review/SKILL.md | 96 +++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/.claude/skills/code-review/SKILL.md b/.claude/skills/code-review/SKILL.md index c9553c49..0fa2b4fe 100644 --- a/.claude/skills/code-review/SKILL.md +++ b/.claude/skills/code-review/SKILL.md @@ -114,10 +114,56 @@ For every behavioral change: ### D. Test Coverage -- **Are new behaviors tested?** Every new code path should have a corresponding test -- **Are edge cases tested?** Empty input, boundary values, error conditions -- **YAML scenario conventions**: prefer `expect.stderr` over `stderr_contains`; tests are asserted against bash by default; use `stdout_windows`/`stderr_windows` for platform-specific output -- **Bash comparison**: if YAML scenarios are added or modified, verify they pass against bash +Analyze coverage of changed code from two angles: **scenario tests** (YAML) and **Go tests**. Scenario tests are preferred because they also verify bash compatibility. + +#### Step 1: Inventory changed code paths + +For each changed or added function/branch/error-path, list the code path (e.g. "cut: `-f` with `--complement` and `--output-delimiter`", "error when delimiter is multi-byte"). + +#### Step 2: Check scenario test coverage (priority) + +Search `tests/scenarios/cmd/<command>/` for YAML scenarios that exercise each code path identified in Step 1. + +- **Covered** — a scenario exists whose `input.script` triggers the code path and `expect` asserts the output. +- **Partially covered** — a scenario triggers the code path but doesn't assert stderr, exit code, or an important edge case. +- **Not covered** — no scenario exercises the code path. + +Flag **not covered** and **partially covered** paths as findings. 
Suggest concrete YAML scenario(s) to add (including `description`, `input.script`, and expected `stdout`/`stderr`/`exit_code`). + +Scenario test conventions: +- Prefer `expect.stderr` (exact match) over `stderr_contains` +- Tests are asserted against bash by default — only use `skip_assert_against_bash: true` for intentional divergence +- Use `stdout_windows`/`stderr_windows` for platform-specific output +- If YAML scenarios are added or modified, verify they pass against bash + +#### Step 3: Check Go test coverage + +Search `interp/builtins/<command>/*_test.go` for Go tests that exercise any code paths **not already covered by scenario tests**. Go test types to check: + +| Test type | File pattern | What it covers | +|-----------|-------------|----------------| +| Functional | `<command>_test.go` | Core logic, argument parsing, edge cases | +| GNU compat | `<command>_gnu_compat_test.go` | Byte-for-byte output equivalence with GNU coreutils | +| Pentest | `<command>_pentest_test.go` | Security vectors (overflow, special files, resource exhaustion) | +| Platform | `<command>_{unix,windows}_test.go` | OS-specific behavior | + +Only flag missing Go tests for paths that **cannot be adequately covered by scenario tests** (e.g. internal error handling, concurrency, memory limits, platform-specific behavior, performance-sensitive paths). + +#### Step 4: Produce coverage summary + +Include a coverage table in the review output: + +```markdown +| Code path | Scenario test | Go test | Status | +|-----------|:---:|:---:|--------| +| `-f` with `--complement` | tests/scenarios/cmd/cut/complement/fields.yaml | — | Covered | +| multi-byte delimiter error | — | — | **Missing** | +| `/dev/zero` hang protection | skip (intentional divergence) | cut_pentest_test.go:45 | Covered | +``` + +Mark the overall coverage status: +- **Adequate** — all new/changed code paths are covered (scenario or Go tests) +- **Gaps found** — list missing coverage as P2 or P3 findings ### E. 
Code Quality @@ -133,6 +179,48 @@ For every behavioral change: - Platform-aware path handling (not string concatenation)? - Are platform-specific test assertions using the correct fields? +### G. Unnecessary `skip_assert_against_bash: true` + +Every YAML scenario in `tests/scenarios/` is validated against bash by default. The `skip_assert_against_bash: true` flag must **only** be set when the shell intentionally diverges from bash (e.g. sandbox restrictions, blocked commands, readonly enforcement, different help/usage text). + +#### How to check + +1. **Find all scenarios with `skip_assert_against_bash: true`** in the changed or added YAML files: + ```bash + grep -rl 'skip_assert_against_bash: true' tests/scenarios/cmd// + ``` + +2. **For each flagged scenario**, run its script against GNU bash + coreutils to see what bash actually produces: + ```bash + docker run --rm debian:bookworm-slim bash -c '