diff --git a/Cargo.lock b/Cargo.lock index 13787348..3e50f60d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,27 +67,67 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "3.0.7" +version = "3.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" dependencies = [ "anstyle", - "once_cell", + "once_cell_polyfill", "windows-sys 0.59.0", ] +[[package]] +name = "assert_fs" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652f6cb1f516886fcfee5e7a5c078b9ade62cfcb889524efe5a64d682dd27a9" +dependencies = [ + "anstyle", + "doc-comment", + "globwalk", + "predicates", + "predicates-core", + "predicates-tree", + "tempfile", +] + [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.17.0" @@ -96,9 +136,9 @@ checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "cc" -version = "1.2.23" +version = "1.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" +checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" dependencies = [ "shlex", ] @@ -186,6 +226,31 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "ctor" version = "0.4.2" @@ -217,6 +282,12 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "displaydoc" version = "0.2.5" @@ -240,6 +311,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "dtor" version = "0.0.6" @@ -265,12 +342,32 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + [[package]] name = "fluent" version = "0.17.0" @@ -334,6 +431,30 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" +[[package]] +name = "globset" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags", + "ignore", + "walkdir", +] + [[package]] name = "iana-time-zone" version = "0.1.63" @@ -358,6 +479,22 @@ dependencies = [ "cc", ] +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "intl-memoizer" version = "0.5.3" @@ -444,6 +581,12 @@ dependencies = [ "libc", ] +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + [[package]] name = "ntapi" version = "0.4.1" @@ -508,6 +651,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + [[package]] name = "os_display" version = "0.1.4" @@ -570,6 +719,36 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "predicates" +version = "3.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d19ee57562043d37e82899fade9a22ebab7be9cef5026b07fda9cdd4293573" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa" + +[[package]] +name = "predicates-tree" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" +dependencies = [ + "predicates-core", + "termtree", +] + [[package]] name = "pretty_assertions" version = "1.4.1" @@ -713,29 +892,44 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.20" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] [[package]] name = "sed" version = "0.0.1" dependencies = [ + "assert_fs", "chrono", "clap", "clap_complete", "clap_mangen", "ctor", + "fancy-regex", "libc", + "memchr", "memmap2", + "once_cell", "phf", "phf_codegen", + "predicates", "pretty_assertions", "rand 0.9.1", "regex", "rlimit", "sysinfo", "tempfile", + "terminal_size", "textwrap", "uu_sed", "uucore", @@ -795,9 +989,9 @@ checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" [[package]] name = "socket2" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" dependencies = [ "libc", "windows-sys 0.52.0", @@ -857,6 +1051,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + [[package]] name = "textwrap" version = "0.16.2" @@ -987,10 +1187,16 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" name = "uu_sed" version = "0.0.1" dependencies = [ + "assert_fs", "clap", + "fancy-regex", + "memchr", "memmap2", + "once_cell", + "predicates", "regex", "tempfile", + "terminal_size", "uucore", ] @@ -1052,6 +1258,16 @@ dependencies = [ "xattr", ] +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -1144,6 +1360,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -1174,9 +1399,9 @@ dependencies = [ [[package]] name = "windows-core" -version = "0.61.1" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46ec44dc15085cea82cf9c78f85a9114c463a369786585ad2882d1ff0b0acf40" +checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" dependencies = [ "windows-implement", "windows-interface", @@ -1236,18 +1461,18 @@ dependencies = [ [[package]] name = "windows-result" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b895b5356fc36103d0f64dd1e94dfa7ac5633f1c9dd6e80fe9ec4adef69e09d" +checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" dependencies = [ "windows-link", ] [[package]] name = "windows-strings" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a7ab927b2637c19b3dbe0965e75d8f2d30bdd697a1516191cad2ec4df8fb28a" +checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" dependencies = [ "windows-link", ] diff --git a/Cargo.toml b/Cargo.toml index 33bd2325..c766fe67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ feat_common_core = [ ] [workspace.dependencies] +assert_fs = "1.1.3" bytesize = "2.0.0" chrono = { version = "0.4.37", default-features = false, features = [ "clock", @@ -36,28 +37,39 @@ chrono = { version = "0.4.37", default-features = false, features = [ clap = { version = "4.4", features = ["wrap_help", "cargo"] } clap_complete = "4.5" clap_mangen = "0.2" +fancy-regex = "0.14.0" libc = "0.2.153" +memchr = "2.7.4" memmap2 = "0.9" +once_cell = "1.21.3" phf = "0.11.2" phf_codegen = "0.11.2" +predicates = "3.1.3" rand = { version = "0.9", features = ["small_rng"] } regex = "1.10.4" sysinfo = "0.35" tempfile = "3.10.1" textwrap = { version = "0.16.1", features = ["terminal_size"] } +terminal_size = "0.4.2" uucore = { version = "0.1.0", features = ["libc"] } xattr = "1.3.1" [dependencies] +assert_fs = { workspace = true } clap = { workspace = true } clap_complete = { workspace = true } clap_mangen = { workspace = true } ctor = "0.4.1" +fancy-regex = { workspace = true } +memchr = { workspace = true } memmap2.workspace = true +once_cell = { workspace = true } phf = { workspace = true } +predicates = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } sysinfo = { workspace = true } +terminal_size = { workspace = true } textwrap = { workspace = true } uucore = { workspace = true } uutests = "0.1.0" diff --git a/README.md b/README.md index f9a2e4ee..6c62b2e4 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,12 @@ and other extensions. ## Status -It is currently a work in progress! +This is still a work in progress, but the current version can +already execute complex scripts, +such as solving the Towers of Hanoi puzzle and performing +arbitrary precision integer arithmetic. +The commands `=`, `l`, `r`, and `w`, as well as in-place replacement +are still missing. ## Installation @@ -37,13 +42,21 @@ cargo run --release * In addition to `\n`, other escape sequences (octal, hex, C) are supported in the strings of the `y` command. Under POSIX these yield undefined behavior. +* The substitution command replacement group `\0` is a synonym for &. +* A `Q` command (optionally followed by an exit code) quits immediately. +* The `q` command can be optionally followed by an exit code. +* The `l` command can be optionally followed by the output width. +* The `--follow-symlinks` flag for in-place editing. ### Supported BSD and GNU extensions * The second address in a range can be specified as a relative address with +N. +* In-place editing of file with the `-i` flag. ### New extensions * Unicode characters can be specified in regular expression pattern, replacement and transliteration sequences using `\uXXXX` or `\UXXXXXXXX` sequences. +* The `l` command lists Unicode characters using the `\uXXXX` and `\UXXXXXXXX` + sequences. ### Incompatibilities * The input is assumed to be valid UTF-8 (this includes 7-bit ASCII). diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 4faf2d9b..3b0d0cbb 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -13,10 +13,16 @@ categories = ["command-line-utilities"] [dependencies] +assert_fs = { workspace = true } clap = { workspace = true } +fancy-regex = { workspace = true } +memchr = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } +once_cell = { workspace = true } +predicates = { workspace = true } +terminal_size = { workspace = true } uucore = { workspace = true } [lib] diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index af673f19..acaca06b 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -8,18 +8,15 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// TODO: remove when compile is implemented -#![allow(dead_code)] - +use crate::fast_regex::{Captures, Match, Regex}; use crate::named_writer::NamedWriter; -use regex::Captures; -use regex::Regex; +use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; // For file descriptors and equivalent use std::rc::Rc; -use uucore::error::UResult; +use uucore::error::{UResult, USimpleError}; #[derive(Debug, Default, Clone)] /// Compilation and processing options provided mostly through the @@ -52,7 +49,7 @@ pub struct ProcessingContext { /// Stop processing further input. pub stop_processing: bool, /// Previously compiled RE, saved for reuse when specifying an empty RE - pub saved_regex: RefCell>, + pub saved_regex: Option, /// Modification of input processing action // This is required to avoid doubly borrowing the reader in the 'N' // command. @@ -65,6 +62,15 @@ pub struct ProcessingContext { pub label_to_command_map: HashMap>>, /// True if a substitution was made as specified in the t command pub substitution_made: bool, + /// Elements to append at the end of each command processing cycle + pub append_elements: Vec, +} + +#[derive(Clone, Debug)] +/// Elements that shall be appended at the end of each command processing cycle +pub enum AppendElement { + Text(String), // The specified text string + Path(PathBuf), // The contents of the specified file path } #[derive(Clone, Debug, Default, PartialEq)] @@ -100,7 +106,7 @@ pub struct Address { #[derive(Debug)] pub enum AddressValue { LineNumber(usize), - Regex(Regex), + Regex(Option), } #[derive(Debug)] @@ -115,38 +121,65 @@ pub enum ReplacementPart { /// All specified replacements for an RE pub struct ReplacementTemplate { pub parts: Vec, + pub max_group_number: usize, // Highest used group number (e.g. 8 for \8) } impl Default for ReplacementTemplate { /// Create an empty template. fn default() -> Self { - ReplacementTemplate { parts: Vec::new() } + ReplacementTemplate::new(Vec::new()) } } impl ReplacementTemplate { + /// Construct from the parts + pub fn new(parts: Vec) -> Self { + let max_group_number = parts + .iter() + .filter_map(|part| match part { + ReplacementPart::Group(n) => Some(*n), + _ => None, + }) + .max() + .unwrap_or(0); + + Self { + parts, + max_group_number: max_group_number.try_into().unwrap(), + } + } + /// Apply the template to the given RE captures. /// Example: - /// let result = regex.replace_all(input, |caps: ®ex::Captures| { - /// template.apply(caps) }); + /// let result = regex.replace_all(input, |caps: &Captures| { + /// template.apply_captures(caps) }); /// Returns an error if a backreference in the template was not matched by the RE. - pub fn apply(&self, caps: &Captures) -> UResult { + pub fn apply_captures(&self, caps: &Captures) -> UResult { let mut result = String::new(); + // Invalid group numbers may end here through (unkown at compile time) + // reused REs. + if self.max_group_number > caps.len() - 1 { + return Err(USimpleError::new( + 2, + format!( + "invalid reference \\{} on `s' command's RHS", + self.max_group_number + ), + )); + } + for part in &self.parts { match part { ReplacementPart::Literal(s) => result.push_str(s), ReplacementPart::WholeMatch => { - result.push_str(caps.get(0).map_or("", |m| m.as_str())); + result.push_str(caps.get(0)?.map(|m| m.as_str()).unwrap_or("")); } ReplacementPart::Group(n) => { - // Compilation guarantees we only get valid group numbers - result.push_str( - caps.get((*n).try_into().unwrap()) - .map_or("", |m| m.as_str()), - ); + let i: usize = (*n).try_into().unwrap(); + result.push_str(caps.get(i)?.map(|m| m.as_str()).unwrap_or("")); } } } @@ -154,48 +187,37 @@ impl ReplacementTemplate { Ok(result) } - /// Returns the highest capture group number referenced in this template. - pub fn max_group_number(&self) -> u32 { - self.parts - .iter() - .filter_map(|part| { - if let ReplacementPart::Group(n) = part { - Some(*n) - } else { - None + /// Apply the template to the given RE single match. + pub fn apply_match(&self, m: &Match) -> String { + let mut result = String::new(); + + for part in &self.parts { + match part { + ReplacementPart::Literal(s) => result.push_str(s), + + ReplacementPart::WholeMatch => result.push_str(m.as_str()), + + ReplacementPart::Group(_) => { + panic!("unexpected Regex group replacement") } - }) - .max() - .unwrap_or(0) + } + } + result } } -#[derive(Debug)] +#[derive(Debug, Default)] /// Substitution command pub struct Substitution { pub occurrence: usize, // Which occurrence to substitute pub print_flag: bool, // True if 'p' flag pub ignore_case: bool, // True if 'I' flag pub write_file: Option>>, // Writer to file if 'w' flag is used - pub regex: Regex, // Regular expression + pub regex: Option, // Regular expression pub line_number: usize, // Line number pub replacement: ReplacementTemplate, // Specified broken-down replacement } -impl Default for Substitution { - fn default() -> Self { - Substitution { - occurrence: 0, - print_flag: false, - ignore_case: false, - write_file: None, - regex: Regex::new("").unwrap(), // safe dummy regex - line_number: 0, - replacement: ReplacementTemplate::default(), - } - } -} - /// The block of the first and most common Unicode characters: /// ASCII, Latin Extended, Greek, Curillic, Coptic, Arabic, etc. /// It comprises all UCS-2 characters. We use a fast lookup array for these. @@ -261,7 +283,6 @@ pub struct Command { pub addr2: Option
, // End address pub non_select: bool, // True if '!' pub start_line: Option, // Start line number (or None) - pub text: Option, // Text for ':', 'a', 'c', 'i', 'r', 'w' pub data: CommandData, // Command-specific data pub next: Option>>, // Pointer to next command } @@ -274,7 +295,6 @@ impl Default for Command { addr2: None, non_select: false, start_line: None, - text: None, data: CommandData::None, next: None, } @@ -289,24 +309,14 @@ pub enum CommandData { Block(Option>>), // Commands for '{' BranchTarget(Option>>), // Commands for 'b', 't' Label(Option), // Label name for 'b', 't', ':' - NamedWriter(Box), // File descriptor for 'w' + Path(PathBuf), // File path for 'r' + NamedWriter(Rc>), // File output for 'w' + Number(usize), // Number for 'l', 'q', 'Q' (GNU) Substitution(Box), // Substitute command 's' + Text(Cow<'static, str>), // Text for 'a', 'c', 'i' Transliteration(Box), // Transliteration command 'y' } -#[derive(Debug)] -/// Text to append before a line is read -pub struct AppendBuffer { - append_type: AppendType, - content: String, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum AppendType { - String, - File, -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] /// Flag for space modifications pub enum SpaceFlag { @@ -326,107 +336,123 @@ pub struct InputAction { #[cfg(test)] mod tests { use super::*; - use regex::Regex; + use crate::fast_io::IOChunk; // Return the captures for the RE applied to the specified string - fn caps_for<'a>(re: &str, input: &'a str) -> regex::Captures<'a> { - Regex::new(re).unwrap().captures(input).unwrap() + fn caps_for<'a>(re: &str, chunk: &'a mut IOChunk) -> Captures<'a> { + Regex::new(re) + .unwrap() + .captures(chunk) + .unwrap() + .expect("captures") } #[test] // s/foo// fn test_empty_template() { let template = ReplacementTemplate::default(); - let caps = caps_for("foo", "foo"); + let input = &mut IOChunk::from_str("foo"); + let caps = caps_for("foo", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, ""); } #[test] // s/abc/hello/ fn test_literal_only() { - let template = ReplacementTemplate { - parts: vec![ReplacementPart::Literal("hello".into())], - }; - let caps = caps_for("abc", "abc"); + let template = ReplacementTemplate::new(vec![ReplacementPart::Literal("hello".into())]); + let input = &mut IOChunk::from_str("abc"); + let caps = caps_for("abc", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "hello"); } #[test] // s/foo\d+/got: &/ fn test_whole_match() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("got: ".into()), - ReplacementPart::WholeMatch, - ], - }; - let caps = caps_for(r"foo\d+", "foo42"); - - let result = template.apply(&caps).unwrap(); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("got: ".into()), + ReplacementPart::WholeMatch, + ]); + let input = &mut IOChunk::from_str("foo42"); + let caps = caps_for(r"foo\d+", input); + + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "got: foo42"); } #[test] // s/foo(\d+)/number: \1/ fn test_backreference() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("number: ".into()), - ReplacementPart::Group(1), - ], - }; - let caps = caps_for(r"foo(\d+)", "foo42"); - - let result = template.apply(&caps).unwrap(); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("number: ".into()), + ReplacementPart::Group(1), + ]); + let input = &mut IOChunk::from_str("foo42"); + let caps = caps_for(r"foo(\d+)", input); + + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "number: 42"); } #[test] // s/(\w+):(\d+)/key: \1, value: \2/ fn test_multiple_parts() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("key: ".into()), - ReplacementPart::Group(1), - ReplacementPart::Literal(", value: ".into()), - ReplacementPart::Group(2), - ], - }; - let caps = caps_for(r"(\w+):(\d+)", "x:123"); - - let result = template.apply(&caps).unwrap(); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("key: ".into()), + ReplacementPart::Group(1), + ReplacementPart::Literal(", value: ".into()), + ReplacementPart::Group(2), + ]); + let input = &mut IOChunk::from_str("x:123"); + let caps = caps_for(r"(\w+):(\d+)", input); + + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "key: x, value: 123"); } + #[test] + // s/(\w+):(\d+)/key: \1, value: \3/ + fn test_invalid_group() { + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("key: ".into()), + ReplacementPart::Group(1), + ReplacementPart::Literal(", value: ".into()), + ReplacementPart::Group(3), + ]); + let input = &mut IOChunk::from_str("x:123"); + let caps = caps_for(r"(\w+):(\d+)", input); + + let result = template.apply_captures(&caps); + assert!(result.is_err()); + + let msg = result.unwrap_err().to_string(); + assert!(msg.contains("invalid reference \\3")); + } + // max_group_number #[test] fn test_max_group_number_with_groups() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("a".into()), - ReplacementPart::Group(2), - ReplacementPart::WholeMatch, - ReplacementPart::Group(5), - ReplacementPart::Literal("z".into()), - ], - }; - assert_eq!(template.max_group_number(), 5); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("a".into()), + ReplacementPart::Group(2), + ReplacementPart::WholeMatch, + ReplacementPart::Group(5), + ReplacementPart::Literal("z".into()), + ]); + assert_eq!(template.max_group_number, 5); } #[test] fn test_max_group_number_without_groups() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("no".into()), - ReplacementPart::WholeMatch, - ReplacementPart::Literal("groups".into()), - ], - }; - assert_eq!(template.max_group_number(), 0); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("no".into()), + ReplacementPart::WholeMatch, + ReplacementPart::Literal("groups".into()), + ]); + assert_eq!(template.max_group_number, 0); } // Transliteration @@ -486,6 +512,7 @@ mod tests { } // from_strings + #[test] fn test_basic_transliteration() { let t = Transliteration::from_strings("abcδ", "1234"); diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 3b5d30fa..a7ebd8e8 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -15,30 +15,36 @@ use crate::command::{ use crate::delimited_parser::{ compilation_error, parse_char_escape, parse_regex, parse_transliteration, }; +use crate::fast_regex::Regex; use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; -use regex::Regex; + +use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; use std::mem; use std::path::PathBuf; use std::rc::Rc; use std::sync::LazyLock; +use terminal_size::{Width, terminal_size}; use uucore::error::{UResult, USimpleError}; +const DEFAULT_OUTPUT_WIDTH: usize = 60; + // A global, immutable map of command properties, initialized on first access static CMD_MAP: LazyLock> = LazyLock::new(build_command_map); // Types of command arguments recognized by the parser #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum CommandArgs { - Empty, // d D g G h H l n N p P q x = \0 + Empty, // d D g G h H n N p P x = \0 Text, // a c i NonSelect, // ! BeginGroup, // { EndGroup, // } Label, // b t : + Number, // l q Q (GNU extension) ReadFile, // r WriteFile, // w Substitute, // s @@ -119,7 +125,7 @@ fn build_command_map() -> HashMap { CommandSpec { code: 'l', n_addr: 2, - args: CommandArgs::Empty, + args: CommandArgs::Number, }, CommandSpec { code: 'n', @@ -144,7 +150,12 @@ fn build_command_map() -> HashMap { CommandSpec { code: 'q', n_addr: 1, - args: CommandArgs::Empty, + args: CommandArgs::Number, + }, + CommandSpec { + code: 'Q', // GNU extension + n_addr: 1, + args: CommandArgs::Number, }, CommandSpec { code: 'r', @@ -206,19 +217,21 @@ pub fn compile( let mut empty_line = ScriptCharProvider::new(""); let result = compile_sequence(&mut make_providers, &mut empty_line, context)?; - // Link the ends of command blocks to their following commands. - if context.parsed_block_nesting > 0 { - return Err(USimpleError::new(1, "unmatched `{'")); - } - patch_block_endings(result.clone()); + // Comment-out the following to show the compiled script. + #[cfg(any())] + dbg!(&result); // Link branch commands to the target label commands. populate_label_map(result.clone(), context)?; resolve_branch_targets(result.clone(), context)?; - // Comment-out the following to show the compiled script. - #[cfg(any())] - dbg!(&result); + // Link the ends of command blocks to their following commands. + // This converts the tree into a graph, so it must be the last + // conversion than traverses the structure as a tree. + if context.parsed_block_nesting > 0 { + return Err(USimpleError::new(1, "unmatched `{'")); + } + patch_block_endings(result.clone()); // TODO: setup append & match structures Ok(result) @@ -459,6 +472,24 @@ fn compile_address_range( Ok(n_addr) } +/// Read the line's remaining characters as a file path and return it. +fn read_file_path(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + line.advance(); // Skip the command/w character + line.eat_spaces(); // Skip any leading whitespace + + let mut path = String::new(); + while !line.eol() { + path.push(line.current()); + line.advance(); + } + + if path.is_empty() { + compilation_error(lines, line, "missing file path") + } else { + Ok(PathBuf::from(path)) + } +} + /// Compile and return a single range address specification. fn compile_address( lines: &ScriptLineProvider, @@ -502,14 +533,14 @@ fn compile_address( } '+' => { line.advance(); - let number = parse_number(lines, line)?; + let number = parse_number(lines, line, true)?.unwrap(); Ok(Address { atype: AddressType::RelLine, value: AddressValue::LineNumber(number), }) } c if c.is_ascii_digit() => { - let number = parse_number(lines, line)?; + let number = parse_number(lines, line, true)?.unwrap(); Ok(Address { atype: AddressType::Line, value: AddressValue::LineNumber(number), @@ -521,7 +552,12 @@ fn compile_address( /// Parse and return the decimal number at the current line position. /// Advance the line to first non-digit or EOL. -fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { +/// Issue an error if the number is required. +fn parse_number( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + required: bool, +) -> UResult> { let mut num_str = String::new(); while !line.eol() && line.current().is_ascii_digit() { @@ -529,10 +565,19 @@ fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UR line.advance(); } + if num_str.is_empty() { + if required { + return compilation_error(lines, line, "number expected"); + } else { + return Ok(None); + } + } + num_str .parse::() .map_err(|_| format!("invalid number '{}'", num_str)) .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) + .map(Some) } /// Parse the end of a command, failing with an error on extra characters. @@ -558,90 +603,116 @@ fn parse_command_ending( } /// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. -/// - Replacces `\(` and `\)` with `(` and `)` -/// - Escapes ERE-only metacharacters: `+ ? { } | ( )` -/// - Leaves all other characters as-is +/// - Replaces `\(` and `\)` with `(` and `)`. +/// - Puts single-digit back-references in non-capturing groups.. +/// - Escapes ERE-only metacharacters: `+ ? { } | ( )`. +/// - Leaves all other characters as-is. fn bre_to_ere(pattern: &str) -> String { let mut result = String::with_capacity(pattern.len()); let mut chars = pattern.chars().peekable(); + let mut at_beginning = true; + let mut previous: Option = None; while let Some(c) = chars.next() { if c == '\\' { match chars.peek() { Some('(') => { chars.next(); - result.push('('); // group start + result.push('('); // Group start } Some(')') => { chars.next(); - result.push(')'); // group end + result.push(')'); // Group end + } + Some(v) if v.is_ascii_digit() => { + // Back-reference. In sed BREs these are single-digit + // (\1-\9) whereas fancy_regex supports multi-digit + // back-references. Put them in a non-capturing group + // to avoid having the number extend beyond the single + // digit. Example: In sed \11 matches group 1 followed + // by '1', not group 11. + result.push_str(&format!(r"(?:\{})", v)); + chars.next(); } Some(&next) => { + // Preserve other escaped characters. chars.next(); result.push('\\'); - result.push(next); // preserve other escaped characters + result.push(next); } None => { - result.push('\\'); // trailing backslash, keep it + // Trailing backslash; keep it. + result.push('\\'); } } } else { match c { '+' | '?' | '{' | '}' | '|' | '(' | ')' => { - result.push('\\'); // escape unsupported ERE metacharacters + // Escape unsupported ERE metacharacters. + result.push('\\'); + result.push(c); + } + '^' if !at_beginning && previous != Some('[') => { + // In BREs ^ has special meaning at the beginning + // and as bracket negation. This heuristic escapes + // all other uses, which per POSIX are valid in EREs. + // "the ERE "a^b" is valid, but can never match because + // the 'a' prevents the expression "^b" from matching + // starting at the first character." + // POSIX 9.4.9 ERE Expression Anchoring + result.push('\\'); + result.push(c); + } + '$' if chars.peek().is_some() => { + // Similarly for $ appearing not at the end. + result.push('\\'); result.push(c); } _ => result.push(c), } } + at_beginning = false; + previous = Some(c); } result } /// Compile the provided regular expression string into a corresponding engine. +/// An empty pattern results in None, which means that the last RE employed +/// at runtime will be used. fn compile_regex( lines: &ScriptLineProvider, line: &ScriptCharProvider, pattern: &str, context: &ProcessingContext, icase: bool, -) -> UResult { +) -> UResult> { if pattern.is_empty() { - let maybe_existing = context.saved_regex.borrow(); - if let Some(existing) = &*maybe_existing { - Ok(existing.clone()) - } else { - compilation_error(lines, line, "no previously compiled regex available") - } - } else { - // Convert basic to extended regular expression if needed. - let ere_pattern = if context.regex_extended { - pattern - } else { - &bre_to_ere(pattern) - }; + return Ok(None); + } - // Add case-insensitive modifier if needed. - let full_pattern = if icase { - if ere_pattern.is_empty() { - return compilation_error(lines, line, "cannot specify a modifier on an empty RE"); - } - format!("(?i){}", ere_pattern) - } else { - ere_pattern.to_string() - }; + // Convert basic to extended regular expression if needed. + let pattern = if context.regex_extended { + pattern + } else { + &bre_to_ere(pattern) + }; - // Compile into engine. - let compiled = Regex::new(&full_pattern).map_err(|e| { - compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) - .unwrap_err() - })?; + // Add case-insensitive modifier if needed. + let pattern = if icase { + format!("(?i){}", pattern) + } else { + pattern.to_string() + }; - *context.saved_regex.borrow_mut() = Some(compiled.clone()); + // Compile into engine. + let compiled = Regex::new(&pattern).map_err(|e| { + compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) + .unwrap_err() + })?; - Ok(compiled) - } + Ok(Some(compiled)) } /// Compile a regular expression replacement string. @@ -677,23 +748,33 @@ pub fn compile_replacement( } match line.current() { - // \1 - \9 - c @ '1'..='9' => { + // \0 - \9 + c @ '0'..='9' => { let ref_num = c.to_digit(10).unwrap(); if !literal.is_empty() { parts.push(ReplacementPart::Literal(std::mem::take(&mut literal))); } - parts.push(ReplacementPart::Group(ref_num)); + if ref_num == 0 { + parts.push(ReplacementPart::WholeMatch); + } else { + parts.push(ReplacementPart::Group(ref_num)); + } line.advance(); } - // literal \ and & + // Literal \ and & '\\' | '&' => { literal.push(line.current()); line.advance(); } + // Literal delimiter + v if v == delimiter => { + literal.push(line.current()); + line.advance(); + } + // other escape sequences _ => match parse_char_escape(line) { Some(decoded) => literal.push(decoded), @@ -727,7 +808,7 @@ pub fn compile_replacement( if !literal.is_empty() { parts.push(ReplacementPart::Literal(literal)); } - return Ok(ReplacementTemplate { parts }); + return Ok(ReplacementTemplate::new(parts)); } c => { @@ -764,9 +845,6 @@ fn compile_subst_command( } let pattern = parse_regex(lines, line)?; - if pattern.is_empty() { - return compilation_error(lines, line, "unterminated substitute pattern"); - } let mut subst = Box::new(Substitution { line_number: lines.get_line_number(), @@ -776,27 +854,31 @@ fn compile_subst_command( subst.replacement = compile_replacement(lines, line)?; compile_subst_flags(lines, line, &mut subst)?; - // Compile regex with now known ignore_case flag. - subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; - - let re_captures: u32 = subst - .regex - .captures_len() - .saturating_sub(1) - .try_into() - .unwrap(); - let max_group_number = subst.replacement.max_group_number(); - if max_group_number > re_captures { + if pattern.is_empty() && subst.ignore_case { return compilation_error( lines, line, - format!( - "group number \\{} is larger than the {} available RE groups", - max_group_number, re_captures - ), + "cannot specify modifiers on an empty regular expression", ); } + // Compile regex with now known ignore_case flag. + subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; + + // Catch invalid group references at compile time, if possible. + if let Some(regex) = &subst.regex { + if subst.replacement.max_group_number > regex.captures_len() - 1 { + return compilation_error( + lines, + line, + format!( + "invalid reference \\{} on `s' command's RHS", + subst.replacement.max_group_number + ), + ); + } + } + cmd.data = CommandData::Substitution(subst); parse_command_ending(lines, line, cmd) @@ -908,20 +990,8 @@ pub fn compile_subst_flags( } 'w' => { - line.advance(); - line.eat_spaces(); - - let mut path = String::new(); - while !line.eol() && line.current() != ';' { - path.push(line.current()); - line.advance(); - } - - if path.is_empty() { - return compilation_error(lines, line, "missing filename after 'w' flag"); - } - - subst.write_file = Some(NamedWriter::new(PathBuf::from(path))?); + let path = read_file_path(lines, line)?; + subst.write_file = Some(NamedWriter::new(path)?); return Ok(()); // 'w' is the last flag allowed } @@ -958,11 +1028,17 @@ fn compile_label_command( line: &mut ScriptCharProvider, cmd: &mut Command, ) -> UResult<()> { + /// Return true if `c` is in the POSIX portable filename character set. + fn is_portable_filename_char(c: char) -> bool { + c.is_ascii_alphanumeric() // A–Z, a–z, 0–9 + || matches!(c, '.' | '_' | '-') + } + line.advance(); // Skip the command character line.eat_spaces(); // Skip any leading whitespace let mut label = String::new(); - while !line.eol() && line.current().is_ascii_alphanumeric() { + while !line.eol() && is_portable_filename_char(line.current()) { label.push(line.current()); line.advance(); } @@ -980,6 +1056,87 @@ fn compile_label_command( parse_command_ending(lines, line, cmd) } +/// Return the width of the command's terminal or a default. +fn output_width() -> usize { + if let Some((Width(w), _)) = terminal_size() { + w as usize + } else { + DEFAULT_OUTPUT_WIDTH + } +} + +fn compile_number_command( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult<()> { + line.advance(); // Skip the command character + line.eat_spaces(); // Skip any leading whitespace + + match parse_number(lines, line, false)? { + Some(n) => { + cmd.data = CommandData::Number(n); + } + None => match cmd.code { + 'q' | 'Q' => { + cmd.data = CommandData::Number(0); + } + 'l' => { + cmd.data = CommandData::Number(output_width()); + } + _ => panic!("invalid number-expecting command"), + }, + } + + line.eat_spaces(); // Skip any trailing whitespace + parse_command_ending(lines, line, cmd) +} + +fn compile_text_command( + lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult<()> { + line.advance(); // Skip the command character. + + line.eat_spaces(); // Skip any leading whitespace. + if line.eol() || line.current() != '\\' { + return compilation_error( + lines, + line, + format!("command `{}' expects \\ followed by text", cmd.code), + ); + } + + line.advance(); // Skip \. + line.eat_spaces(); // Skip any whitespace at the end of \. + if !line.eol() { + return compilation_error( + lines, + line, + format!( + "extra characters after \\ at the end of `{}' command", + cmd.code + ), + ); + } + + let mut text = String::new(); + while let Some(line) = lines.next_line()? { + if line.ends_with('\\') { + // Line ends with \ to escape \n; remove the trailing \. + text.push_str(&line[..line.len() - 1]); + text.push('\n'); + } else { + text.push_str(&line); + text.push('\n'); + break; + } + } + cmd.data = CommandData::Text(Cow::Owned(text)); + Ok(()) +} + // Compile the specified command fn compile_command( lines: &mut ScriptLineProvider, @@ -1013,6 +1170,10 @@ fn compile_command( CommandArgs::NonSelect => { // ! // Implemented at a higher level. } + CommandArgs::Number => { + // l q Q + compile_number_command(lines, line, &mut cmd)?; + } CommandArgs::Substitute => { // s return compile_subst_command(lines, line, &mut cmd, context); @@ -1021,12 +1182,19 @@ fn compile_command( // y return compile_trans_command(lines, line, &mut cmd); } - // TODO - CommandArgs::Text => { // a c i + CommandArgs::Text => { + // a c i + compile_text_command(lines, line, &mut cmd)?; } - CommandArgs::ReadFile => { // r + CommandArgs::ReadFile => { + // r + let path = read_file_path(lines, line)?; + cmd.data = CommandData::Path(path); } - CommandArgs::WriteFile => { // w + CommandArgs::WriteFile => { + // w + let path = read_file_path(lines, line)?; + cmd.data = CommandData::NamedWriter(NamedWriter::new(path)?); } } @@ -1074,13 +1242,27 @@ fn lookup_command(cmd: char) -> Option<&'static CommandSpec> { #[cfg(test)] mod tests { use super::*; + use crate::fast_io::IOChunk; + // Return an empty line provider and a char provider for the specified str. fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { let lines = ScriptLineProvider::new(vec![]); // Empty for tests let line = ScriptCharProvider::new(input); (lines, line) } + fn make_line_provider(lines: &[&str]) -> ScriptLineProvider { + let input = lines + .iter() + .map(|s| ScriptValue::StringVal(s.to_string())) + .collect(); + ScriptLineProvider::new(input) + } + + fn make_char_provider(input: &str) -> ScriptCharProvider { + ScriptCharProvider::new(input) + } + /// Return a default ProcessingContext for use in tests. pub fn ctx() -> ProcessingContext { ProcessingContext::default() @@ -1257,14 +1439,28 @@ mod tests { #[test] fn test_parse_number_basic() { let (lines, mut chars) = make_providers("123abc"); - assert_eq!(parse_number(&lines, &mut chars).unwrap(), 123); + assert_eq!(parse_number(&lines, &mut chars, true).unwrap(), Some(123)); assert_eq!(chars.current(), 'a'); // Should stop at first non-digit } + #[test] + fn test_parse_optional_number_missing() { + let (lines, mut chars) = make_providers(" ;"); + assert_eq!(parse_number(&lines, &mut chars, false).unwrap(), None); + } + #[test] fn test_parse_number_invalid() { - let (lines, mut chars) = make_providers("abc"); - assert!(parse_number(&lines, &mut chars).is_err()); + let (lines, mut chars) = make_providers("537654897563495734653453434534534534545"); + let err = parse_number(&lines, &mut chars, true).unwrap_err(); + assert!(err.to_string().contains("invalid number")); + } + + #[test] + fn test_parse_required_number_missing() { + let (lines, mut chars) = make_providers(""); + let err = parse_number(&lines, &mut chars, true).unwrap_err(); + assert!(err.to_string().contains("number expected")); } // compile_re @@ -1275,39 +1471,22 @@ mod tests { #[test] fn test_compile_re_basic() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", &ctx(), false).unwrap(); - assert!(regex.is_match("abc")); - assert!(!regex.is_match("ABC")); + let regex = compile_regex(&lines, &chars, "abc", &ctx(), false) + .unwrap() + .expect("regex should be present"); + assert!(regex.is_match(&mut IOChunk::from_str("abc")).unwrap()); + assert!(!regex.is_match(&mut IOChunk::from_str("ABC")).unwrap()); } #[test] fn test_compile_re_case_insensitive() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", &ctx(), true).unwrap(); - assert!(regex.is_match("abc")); - assert!(regex.is_match("ABC")); - assert!(regex.is_match("AbC")); - } - - #[test] - fn test_compile_re_saved_and_reuse() { - let context = ctx(); - // Save a regex - let (lines1, chars1) = dummy_providers(); - let _ = compile_regex(&lines1, &chars1, "abc", &context, false).unwrap(); - - // Now try to reuse it - let (lines2, chars2) = dummy_providers(); - let reused = compile_regex(&lines2, &chars2, "", &context, false).unwrap(); - - assert!(reused.is_match("abc")); - } - - #[test] - fn test_compile_re_empty_and_not_saved() { - let (lines, chars) = dummy_providers(); - let result = compile_regex(&lines, &chars, "", &ctx(), false); - assert!(result.is_err()); // Should fail because nothing was saved + let regex = compile_regex(&lines, &chars, "abc", &ctx(), true) + .unwrap() + .expect("regex should be present"); + assert!(regex.is_match(&mut IOChunk::from_str("abc")).unwrap()); + assert!(regex.is_match(&mut IOChunk::from_str("ABC")).unwrap()); + assert!(regex.is_match(&mut IOChunk::from_str("AbC")).unwrap()); } #[test] @@ -1354,50 +1533,56 @@ mod tests { let (lines, mut chars) = make_providers("/hello/"); let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { - assert!(re.is_match("hello")); + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(re.is_match(&mut IOChunk::from_str("hello")).unwrap()); } else { panic!("expected Regex address value"); } } #[test] - fn test_compile_addr_regex_other_delimiter() { - let (lines, mut chars) = make_providers("\\#hello#"); + fn test_compile_addr_regex_backref_match() { + let (lines, mut chars) = make_providers(r"/he\(.\)\1o/"); let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { - assert!(re.is_match("hello")); + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(re.is_match(&mut IOChunk::from_str("hello")).unwrap()); } else { panic!("expected Regex address value"); } } #[test] - fn test_compile_addr_regex_with_modifier() { - let (lines, mut chars) = make_providers("/hello/I"); + fn test_compile_addr_regex_backref_no_match() { + let (lines, mut chars) = make_providers(r"/he\(.\)\1o/"); let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { - assert!(re.is_match("HELLO")); // case-insensitive + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(!re.is_match(&mut IOChunk::from_str("helio")).unwrap()); } else { panic!("expected Regex address value"); } } #[test] - fn test_compile_addr_empty_regex_saved() { - let context = ctx(); - // First save a regex - let (lines1, mut chars1) = make_providers("/saved/"); - let _ = compile_address(&lines1, &mut chars1, &context).unwrap(); + fn test_compile_addr_regex_other_delimiter() { + let (lines, mut chars) = make_providers("\\#hello#"); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(re.is_match(&mut IOChunk::from_str("hello")).unwrap()); + } else { + panic!("expected Regex address value"); + } + } - // Then reuse it with empty regex - let (lines2, mut chars2) = make_providers("//"); - let addr = compile_address(&lines2, &mut chars2, &context).unwrap(); + #[test] + fn test_compile_addr_regex_with_modifier() { + let (lines, mut chars) = make_providers("/hello/I"); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { - assert!(re.is_match("saved")); + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(re.is_match(&mut IOChunk::from_str("HELLO")).unwrap()); // case-insensitive } else { panic!("expected Regex address value"); } @@ -1487,9 +1672,9 @@ mod tests { cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("foo")); - assert!(!re.is_match("bar")); + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { + assert!(re.is_match(&mut IOChunk::from_str("foo")).unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("bar")).unwrap()); } else { panic!("expected a regex address"); }; @@ -1507,9 +1692,9 @@ mod tests { cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("foo")); - assert!(!re.is_match("bar")); + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { + assert!(re.is_match(&mut IOChunk::from_str("foo")).unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("bar")).unwrap()); } else { panic!("expected a regex address"); } @@ -1518,9 +1703,9 @@ mod tests { cmd.borrow().addr2.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr2.as_ref().unwrap().value { - assert!(re.is_match("bar")); - assert!(!re.is_match("foo")); + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr2.as_ref().unwrap().value { + assert!(re.is_match(&mut IOChunk::from_str("bar")).unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("foo")).unwrap()); } else { panic!("expected a regex address"); }; @@ -1537,53 +1722,22 @@ mod tests { cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("FOO")); - assert!(re.is_match("foo")); + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { + assert!(re.is_match(&mut IOChunk::from_str("FOO")).unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("foo")).unwrap()); } else { panic!("expected a regex address with case-insensitive match"); }; } - #[test] - fn test_compile_re_reuse_saved() { - let context = ctx(); - // First save a regex - let (lines1, mut chars1) = make_providers("/abc/"); - let mut cmd1 = Rc::new(RefCell::new(Command::default())); - compile_address_range(&lines1, &mut chars1, &mut cmd1, &context).unwrap(); - - // Now reuse it - let (lines2, mut chars2) = make_providers("//"); - let mut cmd2 = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2, &context).unwrap(); - - assert_eq!(n_addr, 1); - assert!(matches!( - cmd2.borrow().addr1.as_ref().unwrap().atype, - AddressType::Re - )); - if let AddressValue::Regex(re) = &cmd2.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("abc")); - }; - } - // compile_sequence - fn make_provider(lines: &[&str]) -> ScriptLineProvider { - let input = lines - .iter() - .map(|s| ScriptValue::StringVal(s.to_string())) - .collect(); - ScriptLineProvider::new(input) - } - fn empty_line() -> ScriptCharProvider { ScriptCharProvider::new("") } #[test] fn test_compile_sequence_empty_input() { - let mut provider = make_provider(&[]); + let mut provider = make_line_provider(&[]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1592,7 +1746,7 @@ mod tests { #[test] fn test_compile_sequence_comment_only() { - let mut provider = make_provider(&["# comment", " ", ";;"]); + let mut provider = make_line_provider(&["# comment", " ", ";;"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1601,7 +1755,7 @@ mod tests { #[test] fn test_compile_sequence_single_command() { - let mut provider = make_provider(&["42q"]); + let mut provider = make_line_provider(&["42q"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1625,7 +1779,7 @@ mod tests { #[test] fn test_compile_sequence_non_selected_single_command() { - let mut provider = make_provider(&["42!p"]); + let mut provider = make_line_provider(&["42!p"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1649,7 +1803,7 @@ mod tests { #[test] fn test_compile_sequence_multiple_lines() { - let mut provider = make_provider(&["1q", "2d"]); + let mut provider = make_line_provider(&["1q", "2d"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1665,7 +1819,7 @@ mod tests { #[test] fn test_compile_sequence_single_line_multiple_commands() { - let mut provider = make_provider(&["1q;2d"]); + let mut provider = make_line_provider(&["1q;2d"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1713,6 +1867,15 @@ mod tests { assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "hello")); } + #[test] + fn test_compile_replacement_escaped_delimiter() { + let (mut lines, mut chars) = make_providers(r"/hell\/o/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 1); + assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "hell/o")); + } + #[test] fn test_compile_replacement_backrefs_and_literal() { let (mut lines, mut chars) = make_providers("/prefix \\1 and \\2/"); @@ -1737,6 +1900,18 @@ mod tests { assert!(matches!(&template.parts[1], ReplacementPart::WholeMatch)); } + #[test] + fn test_compile_replacement_whole_match_synonym() { + let (mut lines, mut chars) = make_providers(r"/The match was: \0/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 2); + assert!( + matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "The match was: ") + ); + assert!(matches!(&template.parts[1], ReplacementPart::WholeMatch)); + } + #[test] fn test_compile_replacement_ampersand() { let (mut lines, mut chars) = make_providers("/Simon \\& Garfunkel/"); @@ -1854,7 +2029,7 @@ mod tests { let mut subst = Substitution::default(); let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); - assert!(err.to_string().contains("missing filename")); + assert!(err.to_string().contains("missing file path")); } #[test] @@ -1877,6 +2052,7 @@ mod tests { let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); assert!(err.to_string().contains("invalid substitute flag")); } + // compile_subst_command #[test] fn test_compile_subst_invalid_delimiter_backslash() { @@ -1890,15 +2066,6 @@ mod tests { ); } - #[test] - fn test_compile_subst_empty_pattern() { - let (mut lines, mut chars) = make_providers("s//bar/"); - let mut cmd = Command::default(); - - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); - assert!(err.to_string().contains("unterminated substitute pattern")); - } - #[test] fn test_compile_subst_extra_characters_at_end() { let (mut lines, mut chars) = make_providers("s/foo/bar/x"); @@ -1940,15 +2107,12 @@ mod tests { } #[test] - fn test_compile_subst_invalid_group_number() { - let (mut lines, mut chars) = make_providers(r"s/\(.\)\(.\)/\3\2\1/"); + fn test_compile_subst_invalid_group_reference() { + let (mut lines, mut chars) = make_providers(r"s/f(o)o/\2/"); let mut cmd = Command::default(); let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); - assert!( - err.to_string() - .contains("group number \\3 is larger than the 2 available RE groups") - ); + assert!(err.to_string().contains("invalid reference \\2")); } // bre_to_ere @@ -1985,6 +2149,21 @@ mod tests { assert_eq!(bre_to_ere(r"abc\"), r"abc\"); } + #[test] + fn test_caret_escaped_in_middle() { + assert_eq!(bre_to_ere(r"^a^[^x]c"), r"^a\^[^x]c"); + } + + #[test] + fn test_dollar_escaped_in_middle() { + assert_eq!(bre_to_ere(r"a$c$"), r"a\$c$"); + } + + #[test] + fn test_bre_back_reference() { + assert_eq!(bre_to_ere(r"\(.\)\1\(.\)\2"), r"(.)(?:\1)(.)(?:\2)"); + } + // patch_block_endings // Create a command with the specified code. @@ -2339,4 +2518,91 @@ mod tests { _ => panic!("Expected BranchTarget(Some(...))"), } } + + // compile_text_command + #[test] + fn test_compile_single_line_text_command() { + let mut chars = make_char_provider("a\\"); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + compile_text_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Text(text) => { + assert_eq!(text, "line1\n"); + } + _ => panic!("Expected CommandData::Text"), + } + } + + #[test] + fn test_compile_spaces_single_line_text_command() { + let mut chars = make_char_provider("a \\ "); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + compile_text_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Text(text) => { + assert_eq!(text, "line1\n"); + } + _ => panic!("Expected CommandData::Text"), + } + } + + #[test] + fn test_compile_two_line_text_command() { + let mut chars = make_char_provider("a\\"); + let mut lines = make_line_provider(&["line1\\", "line2"]); + let mut cmd = Command::default(); + + compile_text_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Text(text) => { + assert_eq!(text, "line1\nline2\n"); + } + _ => panic!("Expected CommandData::Text"), + } + } + + #[test] + fn test_compile_text_command_without_backslash() { + let mut chars = make_char_provider("a"); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + let result = compile_text_command(&mut lines, &mut chars, &mut cmd); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("expects \\ followed by text")); + } + + #[test] + fn test_compile_text_command_with_trailing_chars() { + let mut chars = make_char_provider("a \\ foo"); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + let result = compile_text_command(&mut lines, &mut chars, &mut cmd); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("extra characters after \\")); + } + + // read_file_path + #[test] + fn test_read_existing_file_path() { + let (lines, mut chars) = make_providers("r /etc/motd"); + + let path = read_file_path(&lines, &mut chars).unwrap(); + assert_eq!(path.to_str().unwrap(), "/etc/motd"); + } + + #[test] + fn test_read_missing_file_path() { + let (lines, mut chars) = make_providers("w "); + + let err = read_file_path(&lines, &mut chars).unwrap_err(); + assert!(err.to_string().contains("missing file path")); + } } diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index cb46c7f5..c101163d 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -14,9 +14,12 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +#[cfg(unix)] +use memchr::memchr; #[cfg(unix)] use memmap2::Mmap; +use std::cell::Cell; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; @@ -69,10 +72,12 @@ impl<'a> MmapLineCursor<'a> { } let start = self.pos; - let mut end = start; - while end < self.data.len() && self.data[end] != b'\n' { - end += 1; - } + + let mut end = if let Some(pos) = memchr(b'\n', &self.data[start..]) { + pos + start + } else { + self.data.len() + }; if end < self.data.len() { end += 1; // include \n in full span @@ -135,7 +140,7 @@ impl ReadLineCursor { /// As chunk of data that is input and can be output, often very efficiently #[derive(Debug, PartialEq, Eq)] pub struct IOChunk<'a> { - utf8_verified: bool, // True if the contents are valid UTF-8 + utf8_verified: Cell, // True if the contents are valid UTF-8 content: IOChunkContent<'a>, } @@ -143,14 +148,14 @@ impl<'a> IOChunk<'a> { /// Construct an IOChunk from the given content fn from_content(content: IOChunkContent<'a>) -> Self { Self { - utf8_verified: false, + utf8_verified: Cell::new(false), content, } } /// Clear the object's contents, converting it into Owned if needed. pub fn clear(&mut self) { - self.utf8_verified = true; + self.utf8_verified.set(true); match &mut self.content { IOChunkContent::Owned { content, @@ -167,6 +172,11 @@ impl<'a> IOChunk<'a> { } } + /// Return true if the content is empty. + pub fn is_empty(&self) -> bool { + self.content.len() == 0 + } + /// Return true if the content ends with a newline. pub fn is_newline_terminated(&self) -> bool { match &self.content { @@ -182,10 +192,19 @@ impl<'a> IOChunk<'a> { } } + #[cfg(test)] + /// Create an Owned newline-terminated IOChunk from a string. + pub fn from_str(s: &str) -> Self { + IOChunk { + content: IOChunkContent::new_owned(s.to_string(), true), + utf8_verified: Cell::new(false), + } + } + /// Set the object's contents to the specified string. /// Convert it into Owned if needed. pub fn set_to_string(&mut self, new_content: String, add_newline: bool) { - self.utf8_verified = true; + self.utf8_verified.set(true); match &mut self.content { IOChunkContent::Owned { content, @@ -203,23 +222,32 @@ impl<'a> IOChunk<'a> { } /// Return the content as a str. - pub fn as_str(&mut self) -> Result<&str, Box> { + pub fn as_str(&self) -> Result<&str, Box> { match &self.content { #[cfg(unix)] IOChunkContent::MmapInput { content, .. } => { - if self.utf8_verified { + if self.utf8_verified.get() { // Use cached result Ok(unsafe { self.content.as_str_unchecked() }) } else { let result = str::from_utf8(content); - self.utf8_verified = true; - result.map_err(|e| USimpleError::new(1, e.to_string())) + self.utf8_verified.set(true); + result.map_err(|e| USimpleError::new(2, e.to_string())) } } IOChunkContent::Owned { content, .. } => Ok(content), } } + /// Return the raw byte content (always safe). + pub fn as_bytes(&self) -> &[u8] { + match &self.content { + #[cfg(unix)] + IOChunkContent::MmapInput { content, .. } => content, + IOChunkContent::Owned { content, .. } => content.as_bytes(), + } + } + /// Convert content to the Owned variant if it's not already. /// Fails if the conversion to UTF-8 fails. pub fn ensure_owned(&mut self) -> Result<(), Box> { @@ -232,7 +260,7 @@ impl<'a> IOChunk<'a> { let has_newline = full_span.last().copied() == Some(b'\n'); self.content = IOChunkContent::new_owned(valid_str.to_string(), has_newline); - self.utf8_verified = true; + self.utf8_verified.set(true); Ok(()) } Err(e) => Err(USimpleError::new(1, e.to_string())), @@ -303,6 +331,16 @@ impl IOChunkContent<'_> { IOChunkContent::Owned { content, .. } => content, } } + + /// Return the content's length (in bytes or characters). + pub fn len(&self) -> usize { + match self { + #[cfg(unix)] + IOChunkContent::MmapInput { content, .. } => content.len(), + + IOChunkContent::Owned { content, .. } => content.len(), + } + } } /// Unified reader that uses mmap when possible, falls back to buffered reading. @@ -469,6 +507,36 @@ impl OutputBuffer { false, ))) } + + /// Copy the specified file to the output. + pub fn copy_file(&mut self, path: &PathBuf) -> io::Result<()> { + #[cfg(unix)] + self.flush_mmap()?; // Flush mmap writes, if any. + + let file = match File::open(path) { + Ok(f) => f, + // Per POSIX, if the file can't be read treat it as empty. + Err(_) => return Ok(()), + }; + + let mut reader = BufReader::new(file); + io::copy(&mut reader, &mut self.out)?; + Ok(()) + } +} + +/// Implementation of the std::io::Write trait +impl Write for OutputBuffer { + fn write(&mut self, buf: &[u8]) -> io::Result { + let s = + std::str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + self.write_str(s)?; + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + self.flush() + } } #[cfg(unix)] @@ -923,8 +991,9 @@ mod tests { )) = reader.get_line()? { assert_eq!(content, "first line"); + assert_eq!(content.len(), 10); assert!(has_newline); - assert!(!utf8_verified); + assert!(!utf8_verified.get()); assert!(!last_line); } else { panic!("Expected IOChunkContent::Owned"); @@ -950,7 +1019,7 @@ mod tests { panic!("Expected IOChunkContent::Owned"); } - if let Some((mut content, last_line)) = reader.get_line()? { + if let Some((content, last_line)) = reader.get_line()? { assert_eq!(content.as_str().unwrap(), "last line"); assert!(last_line); } else { @@ -987,8 +1056,9 @@ mod tests { )) = reader.get_line()? { assert_eq!(content, b"first line"); + assert_eq!(content.len(), 10); assert_eq!(full_span, b"first line\n"); - assert!(!utf8_verified); + assert!(!utf8_verified.get()); assert!(!last_line); } else { panic!("Expected IOChunkContent::MapInput"); @@ -1008,15 +1078,16 @@ mod tests { { assert_eq!(content, b"second line"); assert_eq!(full_span, b"second line\n"); - assert!(!utf8_verified); + assert!(!utf8_verified.get()); assert!(!last_line); } else { panic!("Expected IOChunkContent::MapInput"); } - if let Some((mut content, last_line)) = reader.get_line()? { + if let Some((content, last_line)) = reader.get_line()? { + assert_eq!(content.as_bytes(), b"last line"); assert_eq!(content.as_str().unwrap(), "last line"); - assert!(content.utf8_verified); + assert!(content.utf8_verified.get()); assert!(last_line); // Cached version assert_eq!(content.as_str().unwrap(), "last line"); @@ -1029,11 +1100,19 @@ mod tests { Ok(()) } - // is_newline_terminated + // is_newline_terminated, is_empty #[test] - fn test_owned_newline_terminated() { + fn test_owned_newline_terminated_non_empty() { let chunk = IOChunk::from_content(IOChunkContent::new_owned("line".to_string(), true)); assert!(chunk.is_newline_terminated()); + assert!(!chunk.is_empty()); + } + + #[test] + fn test_owned_newline_terminated_empty() { + let chunk = IOChunk::from_content(IOChunkContent::new_owned("".to_string(), true)); + assert!(chunk.is_newline_terminated()); + assert!(chunk.is_empty()); } #[test] diff --git a/src/uu/sed/src/fast_regex.rs b/src/uu/sed/src/fast_regex.rs new file mode 100644 index 00000000..b45d36c9 --- /dev/null +++ b/src/uu/sed/src/fast_regex.rs @@ -0,0 +1,745 @@ +// A unified interface to byte and fancy Regex +// +// This allows using byte Regex when possible, resorting to the +// slower fancy_regex crate when needed. +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use fancy_regex::{ + CaptureMatches as FancyCaptureMatches, Captures as FancyCaptures, Regex as FancyRegex, +}; +use memchr::memmem; +use once_cell::sync::Lazy; +use regex::Regex as RustRegex; +use regex::bytes::{ + CaptureMatches as ByteCaptureMatches, Captures as ByteCaptures, Regex as ByteRegex, +}; +use std::error::Error; +use uucore::error::{UResult, USimpleError}; + +use crate::fast_io::IOChunk; + +/// REs requiring the fancy_regex capabilities rather than the +/// faster regex::bytes engine +// Consider . as one character that requires fancy_regex, +// because it can match more than one byte when matching a +// two or more byte Unicode UTF-8 representation. +// It is an RE . rather than a literal one in the following +// example sitations. +// . First character of the line +// [^\\]. Second character after non \ +// +// \*. A consumed backslash anywhere on the line +// \\. An escaped backslash anywhere on the line +// xx. A non-escaped sequence anywhere on the line +// But the following are literal dots and can be captured by bytes: +// \. escaped at the beginning of the line +// x\. escaped after a non escaped \ anywhere on the line +// +// The following RE captures these situations. +static NEEDS_FANCY_RE: Lazy = Lazy::new(|| { + regex::Regex::new( + r"(?x) # Turn on verbose mode + ( # An ASCII-incompatible RE + ( ^ # Non-escaped: i.e. at BOL + | ^[^\\] # or after a BOL non \ + | [^\\] {2} # or after two non \ characters + | \\. # or after a consumed or escaped \ + ) + ( # A potentially incompatible match + \. # . matches any Unicode character + | \[\^ # Bracketed -ve character class + | \(\?i # (Unicode) case insensitive + | \\[WwDdSsBbPp] # Unicode classes + | \\[0-9] # Back-references need fancy + ) + ) + | [^\x01-\x7f] # Any non-ASCII character + ", + ) + .unwrap() +}); + +/// All characters signifying that the match must be handled by an RE +/// rather than by plain string pattern matching. +// These do not include the ^$ metacharacters, which we can easily handle. +// Plain string fixed-string matching is currently faster than Regex +// matching, because Regex always constructs an automaton and needs +// to handle state transitions, whereas plain string matching can +// use tailored CPU string or vectored instructions. +static NEEDS_RE: Lazy = Lazy::new(|| { + regex::Regex::new( + r"(?x) # Turn on verbose mode + ( ^ # Non-escaped: i.e. at BOL + | ^[^\\] # or after a BOL non \ + | [^\\] {2} # or after two non \ characters + | \\. # or after a consumed or escaped \ + ) + ( # A potentially incompatible match + [.?|+(\[{*] # Any magic RE character + # Some are operators so illegal at + # BOL but they should error there, + # not use them as literals. + | \\[WwDdSsPp] # Unicode classes + | \\[AzBb] # Empty matches + | \\[0-9] # Back-references + ) + ", + ) + .unwrap() +}); + +#[derive(Clone, Debug)] +/// Types of literal string anchored matches +enum AnchoredMatch { + Begin, // ^... + End, // ...$ + Both, // ^...$ + Free, // ... +} + +#[derive(Clone, Debug)] +/// A fast Regex-like matcher for literal strings using memchr:memmem +pub struct LiteralMatcher { + needle: Vec, // Bytes without any anchors + match_type: AnchoredMatch, // Type of anchoring specified +} + +impl LiteralMatcher { + /// Construct a new matcher based on a needle possible with anchors. + pub fn new(needle: &str) -> Self { + let needle_bytes = needle.as_bytes(); + if needle_bytes[0] == b'^' && needle_bytes[needle_bytes.len() - 1] == b'$' { + LiteralMatcher { + match_type: AnchoredMatch::Both, + needle: needle_bytes[1..needle_bytes.len() - 1].to_vec(), + } + } else if needle_bytes[0] == b'^' { + LiteralMatcher { + match_type: AnchoredMatch::Begin, + needle: needle_bytes[1..needle_bytes.len()].to_vec(), + } + } else if needle_bytes[needle_bytes.len() - 1] == b'$' { + LiteralMatcher { + match_type: AnchoredMatch::End, + needle: needle_bytes[0..needle_bytes.len() - 1].to_vec(), + } + } else { + LiteralMatcher { + match_type: AnchoredMatch::Free, + needle: needle_bytes.to_vec(), + } + } + } + + /// Returns the start index of a match, if any + fn anchored_find(&self, haystack: &[u8]) -> Option { + let nlen = self.needle.len(); + let hlen = haystack.len(); + + match self.match_type { + AnchoredMatch::Both => { + if hlen == nlen && haystack == self.needle.as_slice() { + Some(0) + } else { + None + } + } + AnchoredMatch::Begin => { + if hlen >= nlen && &haystack[..nlen] == self.needle.as_slice() { + Some(0) + } else { + None + } + } + AnchoredMatch::End => { + if hlen >= nlen && &haystack[hlen - nlen..] == self.needle.as_slice() { + Some(hlen - nlen) + } else { + None + } + } + AnchoredMatch::Free => memmem::find(haystack, &self.needle), + } + } + + /// Return true if the needle occurs in the haystack. + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.anchored_find(haystack).is_some() + } + + /// Return the position and contents of the matched needle. + pub fn find<'t>(&self, haystack: &'t [u8]) -> Option<(usize, usize, &'t str)> { + self.anchored_find(haystack).and_then(|start| { + let end = start + self.needle.len(); + std::str::from_utf8(&haystack[start..end]) + .ok() + .map(|s| (start, end, s)) + }) + } + + /// Return all positions and contents of the matched needle. + pub fn iter<'t>( + &'t self, + haystack: &'t [u8], + ) -> Box + 't> { + let needle = &self.needle; + let nlen = needle.len(); + + match self.match_type { + AnchoredMatch::Both | AnchoredMatch::Begin | AnchoredMatch::End => { + // At most one match; yield it if present + Box::new(self.find(haystack).into_iter()) + } + AnchoredMatch::Free => { + // Multiple potential matches + Box::new( + memmem::find_iter(haystack, needle).filter_map(move |start| { + let end = start + nlen; + std::str::from_utf8(&haystack[start..end]) + .ok() + .map(|s| (start, end, s)) + }), + ) + } + } + } +} + +/// Return the passed pattern without any backslash escapes. +pub fn remove_escapes(pattern: &str) -> String { + let mut chars = pattern.chars().peekable(); + let mut result = String::with_capacity(pattern.len()); + + while let Some(c) = chars.next() { + if c == '\\' { + // Look ahead and consume the next character if present + if let Some(&next) = chars.peek() { + result.push(next); + chars.next(); // consume the peeked char + } + } else { + result.push(c); + } + } + + result +} + +#[derive(Clone, Debug)] +/// A regular expression that can be implemented in diverse efficient ways +pub enum Regex { + Literal(LiteralMatcher), // Fastest: literal bytes + Byte(ByteRegex), // Slower: byte-based RE + Fancy(FancyRegex), // Slowest: RE supporting UTF-8 and back-references +} + +impl Regex { + /// Construct the most efficient RE-like matching engine possible. + pub fn new(pattern: &str) -> Result> { + if NEEDS_FANCY_RE.is_match(pattern) { + Ok(Self::Fancy(FancyRegex::new(pattern)?)) + } else if NEEDS_RE.is_match(pattern) { + Ok(Self::Byte(ByteRegex::new(pattern)?)) + } else { + Ok(Self::Literal(LiteralMatcher::new(&remove_escapes(pattern)))) + } + } + + /// Check if the regex matches the content of the IOChunk. + pub fn is_match(&self, chunk: &mut IOChunk) -> UResult { + match self { + Regex::Literal(m) => Ok(m.is_match(chunk.as_bytes())), + Regex::Byte(re) => Ok(re.is_match(chunk.as_bytes())), + Regex::Fancy(re) => { + let text = chunk.as_str()?; + re.is_match(text) + .map_err(|e| USimpleError::new(2, e.to_string())) + } + } + } + + /// Return an iterator over capture groups. + pub fn captures_iter<'t>(&'t self, chunk: &'t IOChunk) -> UResult> { + match self { + Regex::Literal(m) => { + let haystack = chunk.as_bytes(); + Ok(CaptureMatches::Literal(Box::new(m.iter(haystack).map( + |(start, end, text)| Ok(Captures::Literal(Match { start, end, text })), + )))) + } + + Regex::Byte(re) => Ok(CaptureMatches::Byte(re.captures_iter(chunk.as_bytes()))), + + Regex::Fancy(re) => { + let text = chunk.as_str()?; + Ok(CaptureMatches::Fancy(re.captures_iter(text))) + } + } + } + + /// Return the number of capture groups, including group 0. + pub fn captures_len(&self) -> usize { + match self { + Regex::Literal(_) => 1, // Only group 0 + Regex::Byte(re) => re.captures_len(), + Regex::Fancy(re) => re.captures_len(), + } + } + + /// Return the elements of the first capture. + pub fn captures<'t>(&self, chunk: &'t IOChunk) -> UResult>> { + match self { + Regex::Literal(m) => { + let haystack = chunk.as_bytes(); + match m.find(haystack) { + Some((start, end, text)) => { + Ok(Some(Captures::Literal(Match { start, end, text }))) + } + None => Ok(None), + } + } + + Regex::Byte(re) => { + let bytes = chunk.as_bytes(); + Ok(re.captures(bytes).map(Captures::Byte)) + } + + Regex::Fancy(re) => { + let text = chunk.as_str()?; + match re.captures(text) { + Ok(Some(caps)) => Ok(Some(Captures::Fancy(caps))), + Ok(None) => Ok(None), + Err(e) => Err(USimpleError::new(2, e.to_string())), + } + } + } + } + + /// Return a non-capturing result for a single match. + pub fn find<'t>(&self, chunk: &'t IOChunk) -> UResult>> { + match self { + Regex::Literal(m) => { + let haystack = chunk.as_bytes(); + match m.find(haystack) { + Some((start, end, text)) => Ok(Some(Match { start, end, text })), + None => Ok(None), + } + } + + Regex::Byte(re) => { + let haystack = chunk.as_bytes(); + if let Some(m) = re.find(haystack) { + // Attempt UTF-8 decode for the match region only + let text = std::str::from_utf8(&haystack[m.start()..m.end()]) + .map_err(|e| USimpleError::new(2, e.to_string()))?; + Ok(Some(Match { + start: m.start(), + end: m.end(), + text, + })) + } else { + Ok(None) + } + } + + Regex::Fancy(re) => { + let text = chunk.as_str()?; + match re.find(text) { + Ok(Some(m)) => Ok(Some(Match { + start: m.start(), + end: m.end(), + text: m.as_str(), + })), + Ok(None) => Ok(None), + Err(e) => Err(USimpleError::new(2, e.to_string())), + } + } + } + } +} + +/// Unified enum for holding either byte or fancy capture iterators. +pub enum CaptureMatches<'t> { + Literal(Box>> + 't>), + Byte(ByteCaptureMatches<'t, 't>), + Fancy(FancyCaptureMatches<'t, 't>), +} + +impl<'t> Iterator for CaptureMatches<'t> { + type Item = UResult>; + + fn next(&mut self) -> Option { + match self { + CaptureMatches::Literal(iter) => iter.next(), + CaptureMatches::Byte(iter) => iter.next().map(|caps| Ok(Captures::Byte(caps))), + CaptureMatches::Fancy(iter) => match iter.next() { + Some(Ok(caps)) => Some(Ok(Captures::Fancy(caps))), + Some(Err(e)) => Some(Err(USimpleError::new( + 2, + format!("error retrieving RE captures: {}", e), + ))), + None => None, + }, + } + } +} + +#[derive(Clone, Debug)] +/// Result type for RE capture get(n) +pub struct Match<'t> { + start: usize, // Match start + end: usize, // Match end + text: &'t str, // Actual match +} + +/// Provide interface compatible with Regex::Match. +impl<'t> Match<'t> { + pub fn start(&self) -> usize { + self.start + } + + pub fn end(&self) -> usize { + self.end + } + + pub fn as_str(&self) -> &'t str { + self.text + } +} + +/// Provide interface compatible with Regex::Captures. +pub enum Captures<'t> { + Literal(Match<'t>), // only group 0 + Byte(ByteCaptures<'t>), + Fancy(FancyCaptures<'t>), +} + +impl<'t> Captures<'t> { + /// Get capture group at index `i` + /// Returns Ok(None) if the group didn't match. + /// Returns Err if UTF-8 conversion fails (in Byte variant). + pub fn get(&self, i: usize) -> UResult>> { + match self { + Captures::Literal(m) => Ok(if i == 0 { Some(m.clone()) } else { None }), + Captures::Byte(caps) => match caps.get(i) { + Some(m) => Ok(Some(Match { + start: m.start(), + end: m.end(), + text: std::str::from_utf8(m.as_bytes()) + .map_err(|e| USimpleError::new(1, e.to_string()))?, + })), + None => Ok(None), + }, + Captures::Fancy(caps) => match caps.get(i) { + Some(m) => Ok(Some(Match { + start: m.start(), + end: m.end(), + text: m.as_str(), + })), + None => Ok(None), + }, + } + } + + /// Return the number of capture groups (including group 0). + pub fn len(&self) -> usize { + match self { + Captures::Literal(_) => 1, + Captures::Byte(caps) => caps.len(), + Captures::Fancy(caps) => caps.len(), + } + } + + /// Return true if there are no captures. + // Unused, but provided for completeness. + pub fn is_empty(&self) -> bool { + match self { + Captures::Literal(_) => false, // A literal match always has group 0 + Captures::Byte(caps) => caps.len() == 0, + Captures::Fancy(caps) => caps.len() == 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // FANCY_RE + #[test] + fn test_needs_fancy_re_matches() { + let should_match = [ + // Unicode classes BOL + r"\p{L}+", // Unicode letter class + r"\W", // \W is Unicode-aware. + r"\S+", // \S is Unicode-aware. + r"\d", // \d includes all Unicode digits. + // Unicode classes non-BOL + r"x\p{L}+", // Unicode letter class + r"x\W", // \W is Unicode-aware. + r"x\S+", // \S is Unicode-aware. + r"x\d", // \d includes all Unicode digits. + // . + r".", + r"x.", + r"xx.", + // Consumed \ + r"\*.", + r"x\*.", + // Escaped \ + r"\\.", + r"x\\.", + // Inline flags + r"(?i)abc", // Unicode case-insensitive + r"x(?i)abc", // Unicode case-insensitive + r"(\w+):\1", // back-reference \1 + // Non-ASCII literals + "naïve", // Contains literal non-ASCII. + "café", // Contains literal non-ASCII. + ]; + + for pat in &should_match { + assert!( + NEEDS_FANCY_RE.is_match(pat), + "Expected NEEDS_FANCY_RE to match: {:?}", + pat + ); + } + } + + #[test] + fn test_needs_fancy_re_does_not_match() { + let should_not_match = [ + r"\.", // Escaped . at BOL + r"x\.", // Escaped . at non BOL + r"\[^x]", // Escaped character class + r"\(?i\)", // Escaped case insesitive flag + r"\\w", // Escaped Unicode class + // Simple ASCII + r"foo", + r"foo|bar", + r"^foo[0-9]+bar$", + ]; + + for pat in &should_not_match { + assert!( + !NEEDS_FANCY_RE.is_match(pat), + "Expected NEEDS_FANCY_RE to NOT match: {:?}", + pat + ); + } + } + + // NEEDS_RE + #[test] + fn test_needs_re_matches() { + let should_match = [ + r".", // Single regex wildcard + r"a+b", // Regex + + r"foo|bar", // Regex alternation + r"abc?", // Regex optional + r"a*b", // Regex star + r"[abc]", // Character class + r"(abc)", // Group + r"{1,2}", // Repetition + r"\d", // Class shorthand + r"\S", // Class shorthand + r"\1", // Backreference + r"a\Pb", // Unicode property + ]; + + for pat in &should_match { + assert!( + NEEDS_RE.is_match(pat), + "Expected NEEDS_RE to match: {:?}", + pat + ); + } + } + + #[test] + fn test_needs_re_does_not_match() { + let should_not_match = [ + r"abc", + r"a\.b", // Escaped dot + r"hello world", + r"^abc$", // Anchors alone + r"file\.", // Escaped dot + r"literal123", + r"\\", // Escaped backslash + ]; + + for pat in &should_not_match { + assert!( + !NEEDS_RE.is_match(pat), + "Expected NEEDS_RE to NOT match: {:?}", + pat + ); + } + } + + // Regex::new + #[test] + fn assert_byte_selection() { + let re = Regex::new(r"x*").unwrap(); + assert!(matches!(re, Regex::Byte(_))); + } + + #[test] + fn assert_fancy() { + let re = Regex::new(r"\d").unwrap(); + assert!(matches!(re, Regex::Fancy(_))); + } + + #[test] + fn assert_literal() { + let re = Regex::new(r"x\.").unwrap(); + assert!(matches!(re, Regex::Literal(_))); + } + + #[test] + fn handles_invalid_regex_gracefully() { + let err = Regex::new("(").unwrap_err().to_string(); + assert!( + err.contains("unclosed group") || err.contains("error parsing"), + "Unexpected error: {}", + err + ); + } + + // remove_escapes + #[test] + fn test_remove_escapes() { + use super::remove_escapes; + + assert_eq!(remove_escapes("abc"), "abc"); + assert_eq!(remove_escapes(r"a\.c"), "a.c"); + assert_eq!(remove_escapes(r"\\d"), r"\d"); + assert_eq!(remove_escapes(r"\.\*\+\?"), ".*+?"); + assert_eq!(remove_escapes(r"escaped\\backslash"), r"escaped\backslash"); + assert_eq!(remove_escapes(r"trailing\\"), r"trailing\"); + } + + // LiteralMatcher + #[test] + fn test_literal_matcher_basic_match() { + let matcher = LiteralMatcher::new("needle"); + assert!(matcher.is_match(b"this is a needle in a haystack")); + assert!(!matcher.is_match(b"no match here")); + } + + #[test] + fn test_literal_matcher_anchor_start_match() { + let matcher = LiteralMatcher::new("^needle"); + assert!(matcher.is_match(b"needle in a haystack")); + assert!(!matcher.is_match(b"no needle match here")); + assert!(!matcher.is_match(b"no")); + } + + #[test] + fn test_literal_matcher_anchor_end_match() { + let matcher = LiteralMatcher::new("needle$"); + assert!(matcher.is_match(b"In a haystack there's a needle")); + assert!(!matcher.is_match(b"no needle match here")); + assert!(!matcher.is_match(b"no")); + } + + #[test] + fn test_literal_matcher_anchor_begin_end_match() { + let matcher = LiteralMatcher::new("^needle$"); + assert!(matcher.is_match(b"needle")); + assert!(!matcher.is_match(b"no needle match")); + assert!(!matcher.is_match(b"needle no match")); + assert!(!matcher.is_match(b"no match needle")); + assert!(!matcher.is_match(b"nada")); + } + + #[test] + fn test_literal_matcher_utf8_match() { + let matcher = LiteralMatcher::new("✓"); // U+2713 CHECK MARK (3 bytes) + let haystack = "contains ✓ unicode".as_bytes(); + assert!(matcher.is_match(haystack)); + let found = matcher.find(haystack).unwrap(); + assert_eq!(found.2, "✓"); + } + + #[test] + fn test_literal_matcher_find_location() { + let matcher = LiteralMatcher::new("abc"); + let haystack = b"___abc___"; + let result = matcher.find(haystack); + assert!(result.is_some()); + let (start, end, text) = result.unwrap(); + assert_eq!((start, end), (3, 6)); + assert_eq!(text, "abc"); + } + + #[test] + fn test_literal_matcher_find_location_end() { + let matcher = LiteralMatcher::new("abc$"); + let haystack = b"012abc"; + let result = matcher.find(haystack); + assert!(result.is_some()); + let (start, end, text) = result.unwrap(); + assert_eq!((start, end), (3, 6)); + assert_eq!(text, "abc"); + } + + #[test] + fn test_literal_matcher_iter_multiple() { + let matcher = LiteralMatcher::new("test"); + let haystack = b"this test is a test of test matching"; + let matches: Vec<_> = matcher.iter(haystack).collect(); + assert_eq!(matches.len(), 3); + + let strings: Vec<_> = matches.iter().map(|(_, _, s)| *s).collect(); + assert_eq!(strings, ["test", "test", "test"]); + } + + #[test] + fn test_literal_matcher_iter_begin() { + let matcher = LiteralMatcher::new("^test"); + let haystack = b"test is a test of test matching"; + let matches: Vec<_> = matcher.iter(haystack).collect(); + assert_eq!(matches.len(), 1); + + let strings: Vec<_> = matches.iter().map(|(_, _, s)| *s).collect(); + assert_eq!(strings, ["test"]); + } + + #[test] + fn test_literal_matcher_iter_end() { + let matcher = LiteralMatcher::new("test$"); + let haystack = b"this test is a test of test"; + let matches: Vec<_> = matcher.iter(haystack).collect(); + assert_eq!(matches.len(), 1); + + let strings: Vec<_> = matches.iter().map(|(_, _, s)| *s).collect(); + assert_eq!(strings, ["test"]); + } + + #[test] + fn test_literal_matcher_no_match() { + let matcher = LiteralMatcher::new("missing"); + let haystack = b"nothing to see here"; + assert!(!matcher.is_match(haystack)); + assert!(matcher.find(haystack).is_none()); + assert_eq!(matcher.iter(haystack).count(), 0); + } + + #[test] + fn test_literal_matcher_anchored_no_match() { + let matcher = LiteralMatcher::new("^see$"); + let haystack = b"nothing to see here"; + assert!(!matcher.is_match(haystack)); + assert!(matcher.find(haystack).is_none()); + assert_eq!(matcher.iter(haystack).count(), 0); + } +} diff --git a/src/uu/sed/src/in_place.rs b/src/uu/sed/src/in_place.rs index eb4d9403..acdf8aee 100644 --- a/src/uu/sed/src/in_place.rs +++ b/src/uu/sed/src/in_place.rs @@ -8,39 +8,298 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use std::fs; +use std::io::stdout; +use std::path::{Path, PathBuf}; + +#[cfg(unix)] +use std::os::unix::fs::MetadataExt; +#[cfg(unix)] +use std::os::unix::fs::PermissionsExt; + +use tempfile::NamedTempFile; +use uucore::display::Quotable; +use uucore::error::{FromIo, UIoError, UResult, USimpleError}; + use crate::command::ProcessingContext; use crate::fast_io::OutputBuffer; -use std::io::stdout; -use std::path::Path; -use uucore::error::UResult; /// Context for in-place editing pub struct InPlace { pub output: OutputBuffer, - pub processing_context: ProcessingContext, + pub in_place: bool, + pub in_place_suffix: Option, + pub follow_symlinks: bool, + pub temp_file: Option, + pub original_path: Option, } impl InPlace { - /// Create a new `ProcessingContext` taking ownership of processing_context - pub fn new(processing_context: ProcessingContext) -> UResult { - let output = OutputBuffer::new(Box::new(stdout())); + /// Create an in-place editing engine based on ProcessingContext. + /// Depending on its settings it may or may not perform in-place + /// editing, backup the original file, or follow symlinks. + pub fn new(context: ProcessingContext) -> Self { + Self { + output: OutputBuffer::new(Box::new(stdout())), + in_place: context.in_place, + in_place_suffix: context.in_place_suffix, + follow_symlinks: context.follow_symlinks, + temp_file: None, + original_path: None, + } + } - Ok(InPlace { - output, - processing_context, - }) + /// Return an OutputBuffer for outputting the edits to the specified file. + /// The file may be a symbolic link, which will be processed according + /// to the context specification. + pub fn begin(&mut self, file_name: &Path) -> UResult<&mut OutputBuffer> { + let resolved = if self.follow_symlinks { + fs::canonicalize(file_name) + .map_err_context(|| format!("resolving symlink {}", file_name.quote()))? + } else { + file_name.to_path_buf() + }; + self.begin_resolved(&resolved) } /// Return an OutputBuffer for outputting the edits to the specified file. - pub fn begin(&mut self, _file_name: &Path) -> UResult<&mut OutputBuffer> { - // TODO: Adjust output for in-place editing, if needed. + /// The passed file name should have resolved symbolic links according + /// to the context settings. + fn begin_resolved(&mut self, file_name: &Path) -> UResult<&mut OutputBuffer> { + if !self.in_place { + self.output = OutputBuffer::new(Box::new(stdout())); + return Ok(&mut self.output); + } + + let metadata = fs::metadata(file_name).map_err_context(|| { + format!( + "error Reading metadata of {} for in-place edit", + file_name.quote() + ) + })?; + + if !metadata.is_file() { + return Err(USimpleError::new( + 2, + format!( + "cannot in-place edit non-regular file {}", + file_name.quote() + ), + )); + } + + let dir = file_name.parent().unwrap_or_else(|| Path::new(".")); + let temp_file = NamedTempFile::new_in(dir) + .map_err_context(|| format!("error creating temporary file in {}", dir.quote()))?; + + // TODO: On Unix use fchown(metadata.{uid,dig}) and fchmod(mode) + // on let fd = temp_file.as_file().as_raw_fd() when uucore::libc + // support them. + #[cfg(unix)] + { + let mode = metadata.mode() & 0o7777; + let perms = fs::Permissions::from_mode(mode); + fs::set_permissions(temp_file.path(), perms)?; + } + + let output = OutputBuffer::new(Box::new( + temp_file.reopen().expect("reopening NamedTempFile"), + )); + self.output = output; + self.temp_file = Some(temp_file); + self.original_path = Some(file_name.to_path_buf()); + Ok(&mut self.output) } - /// Finish in-place editing. + /// Finish (potentially in-place) editing. pub fn end(&mut self) -> UResult<()> { self.output.flush()?; - // TODO: Rename and delete output file, if needed. + + if !self.in_place { + return Ok(()); + } + + let orig = self.original_path.take().expect("original_path unset"); + let temp = self.temp_file.take().expect("temp_file unset"); + + // Backup original if suffix is provided + if let Some(ref suffix) = self.in_place_suffix { + let mut backup_path = orig.clone(); + let file_name = backup_path + .file_name() + .expect("Missing file name for backup") + .to_os_string(); + let mut backup_name = file_name; + backup_name.push(suffix); + backup_path.set_file_name(backup_name); + + #[cfg(windows)] + // Try to remove to ensure the rename won't fail on Windows. + let _ = fs::remove_file(&backup_path); + + fs::rename(&orig, &backup_path).map_err_context(|| { + format!( + "error backing up {} to {}", + orig.quote(), + backup_path.quote() + ) + })?; + } else { + #[cfg(windows)] + // On Windows delete the original file for temp.persist to work + if orig.exists() { + fs::remove_file(&orig).map_err_context(|| { + format!("error removing original input file {}", orig.quote()) + })?; + } + } + + // Atomically replace the original + match temp.persist(&orig) { + Ok(_) => {} + Err(e) => { + return Err(UIoError::new( + e.error.kind(), + format!( + "error persisting temporary file {} to {}", + e.file.path().quote(), + orig.quote() + ), + )); + } + } + Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use assert_fs::TempDir; + use assert_fs::fixture::PathChild; + use std::fs; + use std::io::{Read, Write}; + use std::path::Path; + + fn minimal_context() -> ProcessingContext { + ProcessingContext { + in_place: false, + in_place_suffix: None, + follow_symlinks: false, + // fill in default values for the rest as needed + ..Default::default() + } + } + + fn write_original(file: &Path, content: &str) { + fs::write(file, content).unwrap(); + } + + fn read_file(file: &Path) -> String { + let mut contents = String::new(); + fs::File::open(file) + .unwrap() + .read_to_string(&mut contents) + .unwrap(); + contents + } + + #[test] + fn test_in_place_editing() { + let temp = TempDir::new().unwrap(); + let file = temp.child("file.txt"); + write_original(file.path(), "original\n"); + + let mut ctx = minimal_context(); + ctx.in_place = true; + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(file.path()).unwrap(); + write!(buf, "updated\n").unwrap(); + inplace.end().unwrap(); + + assert_eq!(read_file(file.path()), "updated\n"); + } + + #[test] + fn test_in_place_backup() { + let temp = TempDir::new().unwrap(); + let file = temp.child("file.txt"); + let backup = temp.child("file.txt.bak"); + write_original(file.path(), "original\n"); + + let mut ctx = minimal_context(); + ctx.in_place = true; + ctx.in_place_suffix = Some(".bak".to_string()); + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(file.path()).unwrap(); + write!(buf, "new content\n").unwrap(); + inplace.end().unwrap(); + + assert_eq!(read_file(file.path()), "new content\n"); + assert_eq!(read_file(backup.path()), "original\n"); + } + + #[cfg(unix)] + #[test] + fn test_symlink_follow_true() { + let temp = TempDir::new().unwrap(); + let real = temp.child("target.txt"); + let link = temp.child("link.txt"); + + write_original(real.path(), "real\n"); + std::os::unix::fs::symlink(real.path(), link.path()).unwrap(); + + let mut ctx = minimal_context(); + ctx.in_place = true; + ctx.follow_symlinks = true; + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(link.path()).unwrap(); + write!(buf, "changed\n").unwrap(); + inplace.end().unwrap(); + + assert_eq!(read_file(real.path()), "changed\n"); + assert!(link.path().exists()); // Symlink still exists + } + + #[cfg(unix)] + #[test] + fn test_symlink_follow_false() { + let temp = TempDir::new().unwrap(); + let real = temp.child("target.txt"); + let link = temp.child("link.txt"); + + write_original(real.path(), "real\n"); + std::os::unix::fs::symlink(real.path(), link.path()).unwrap(); + + let mut ctx = minimal_context(); + ctx.in_place = true; + ctx.follow_symlinks = false; + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(link.path()).unwrap(); + write!(buf, "linked\n").unwrap(); + inplace.end().unwrap(); + + // real file should remain untouched + assert_eq!(read_file(real.path()), "real\n"); + + // link (symlink path) now contains the new content + let contents = read_file(link.path()); + assert_eq!(contents, "linked\n"); + } + + #[test] + fn test_no_in_place_outputs_to_stdout() { + let mut ctx = minimal_context(); + ctx.in_place = false; + + let mut inplace = InPlace::new(ctx); + let _buf = inplace.begin(Path::new("fake.txt")).unwrap(); + assert!(inplace.end().is_ok()); + } +} diff --git a/src/uu/sed/src/named_writer.rs b/src/uu/sed/src/named_writer.rs index 26482acf..eb60db2b 100644 --- a/src/uu/sed/src/named_writer.rs +++ b/src/uu/sed/src/named_writer.rs @@ -14,7 +14,8 @@ use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::{UResult, USimpleError}; +use uucore::display::Quotable; +use uucore::error::{FromIo, UResult}; thread_local! { /// Global list of all writers that should be flushed at shutdown @@ -36,12 +37,7 @@ impl NamedWriter { .write(true) .truncate(true) .open(&path) - .map_err(|e| { - USimpleError::new( - 2, - format!("Error opening output file {}: {}", path.display(), e), - ) - })?; + .map_err_context(|| format!("error opening output file {}", path.quote()))?; let writer = Rc::new(RefCell::new(NamedWriter { path, @@ -55,14 +51,14 @@ impl NamedWriter { /// Write a line to the file with a newline, returning descriptive errors. pub fn write_line(&mut self, line: &str) -> UResult<()> { writeln!(self.writer, "{}", line) - .map_err(|e| USimpleError::new(2, format!("{}: {}", self.path.display(), e))) + .map_err_context(|| format!("error writing to file {}", self.path.quote())) } /// Flush the writer, returning a descriptive error. pub fn flush(&mut self) -> UResult<()> { self.writer .flush() - .map_err(|e| USimpleError::new(2, format!("{}: {}", self.path.display(), e))) + .map_err_context(|| format!("error writing to file {}", self.path.quote())) } } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index fcbc143f..e1f01764 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -9,28 +9,33 @@ // file that was distributed with this source code. use crate::command::{ - Address, AddressType, AddressValue, Command, CommandData, InputAction, ProcessingContext, - Substitution, Transliteration, + Address, AddressType, AddressValue, AppendElement, Command, CommandData, InputAction, + ProcessingContext, Substitution, Transliteration, }; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; +use crate::fast_regex::Regex; use crate::in_place::InPlace; use crate::named_writer; + +use std::borrow::Cow; use std::cell::RefCell; use std::io::{self, IsTerminal}; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::{UResult, USimpleError}; +use uucore::display::Quotable; +use uucore::error::{FromIo, UResult, USimpleError, set_exit_code}; /// Return true if the passed address matches the current I/O context. fn match_address( addr: &Address, pattern: &mut IOChunk, - context: &ProcessingContext, + context: &mut ProcessingContext, ) -> UResult { match addr.atype { AddressType::Re => { if let AddressValue::Regex(ref re) = addr.value { - Ok(re.is_match(pattern.as_str()?)) + let regex = re_or_saved_re(re, context)?; + regex.is_match(pattern) } else { Ok(false) } @@ -161,6 +166,25 @@ fn write_chunk( Ok(()) } +/// Return a reference to the current or the saved RE if the RE is None. +/// Update the saved RE to RE. +fn re_or_saved_re<'a>( + regex: &Option, + context: &'a mut ProcessingContext, +) -> UResult<&'a Regex> { + if let Some(re) = regex { + // First time we see this regex: clone it *once* into the context. + context.saved_regex = Some(re.clone()); + // Return a reference into context.saved_regex. + Ok(context.saved_regex.as_ref().unwrap()) + } else if let Some(ref saved_re) = context.saved_regex { + // We already have one: just borrow it. + Ok(saved_re) + } else { + Err(USimpleError::new(2, "no previous regular expression")) + } +} + /// Perform the specified RE replacement in the provided pattern space. fn substitute( pattern: &mut IOChunk, @@ -173,30 +197,78 @@ fn substitute( let mut result = String::new(); let mut replaced = false; - let text = pattern.as_str()?; + let mut text: Option<&str> = None; - for caps in sub.regex.captures_iter(text) { - count += 1; - let m = caps.get(0).unwrap(); + let regex = re_or_saved_re(&sub.regex, context)?; - // Always write the unmatched text before this match. - result.push_str(&text[last_end..m.start()]); + match (sub.occurrence, sub.replacement.max_group_number) { + (1, 0) => { + // Example: s/foo/bar/: find() is enough. + let m = regex.find(pattern)?; + if let Some(m) = m { + text = Some(pattern.as_str()?); + result.push_str(&text.unwrap()[last_end..m.start()]); - if sub.occurrence == 0 || count == sub.occurrence { - let replacement = sub.replacement.apply(&caps)?; - result.push_str(&replacement); - replaced = true; - } else { - // Not the target match — leave the match unchanged. - result.push_str(m.as_str()); + let replacement = sub.replacement.apply_match(&m); + result.push_str(&replacement); + replaced = true; + last_end = m.end(); + } + } + + (1, _) => { + // Example: s/\(.\)\(.\)/\2\1/: captures() is enough. + let caps = regex.captures(pattern)?; + if let Some(caps) = caps { + let m = caps.get(0)?.unwrap(); + text = Some(pattern.as_str()?); + result.push_str(&text.unwrap()[last_end..m.start()]); + + let replacement = sub.replacement.apply_captures(&caps)?; + result.push_str(&replacement); + replaced = true; + last_end = m.end(); + } } - last_end = m.end(); + (_, _) => { + // Example: s/(.)(.)/\2\1/3: captures_iter() is needed. + // Iterate over multiple captures of the RE in the pattern. + for caps in regex.captures_iter(pattern)? { + count += 1; + let caps = caps?; + + let m = caps.get(0)?.unwrap(); + + // Always write the unmatched text before this match. + if text.is_none() { + text = Some(pattern.as_str()?); + } + result.push_str(&text.unwrap()[last_end..m.start()]); + + if sub.occurrence == 0 || count == sub.occurrence { + let replacement = sub.replacement.apply_captures(&caps)?; + result.push_str(&replacement); + replaced = true; + } else { + // Not the target match — leave the match unchanged. + result.push_str(m.as_str()); + } + + last_end = m.end(); + + // Early exit if only a specific occurrence, + // (likely 1) needed replacing. + if count == sub.occurrence { + break; + } + } + } } // Handle substitution success. if replaced { - result.push_str(&text[last_end..]); + result.push_str(&text.unwrap()[last_end..]); pattern.set_to_string(result, pattern.is_newline_terminated()); @@ -237,6 +309,90 @@ fn transliterate(pattern: &mut IOChunk, trans: &Transliteration) -> UResult<()> Ok(()) } +/// Output any data queued for output at the end of the cycle. +fn flush_appends(output: &mut OutputBuffer, context: &mut ProcessingContext) -> UResult<()> { + for elem in &context.append_elements { + match elem { + AppendElement::Text(text) => { + output.write_str(text.clone())?; + } + AppendElement::Path(path) => { + output.copy_file(path)?; + } + } + } + context.append_elements.clear(); + Ok(()) +} + +/// Return the specified command variant or panic. +// Example: let path = extract_variant!(command, Path); +macro_rules! extract_variant { + ($cmd:expr, $variant:ident) => { + match &$cmd.data { + CommandData::$variant(inner) => inner, + _ => panic!(concat!("Expected ", stringify!($variant), " command data")), + } + }; +} + +/// List the passed pattern space in unambiguous form. +fn list(output: &mut OutputBuffer, line: &IOChunk, max_width: usize) -> UResult<()> { + // Special case for an empty pattern space + if line.is_empty() { + if line.is_newline_terminated() { + output.write_str("$\n")?; + } + return Ok(()); + } + + let line = line.as_str()?; + let mut buff = String::new(); + let mut line_width = 0; + + for ch in line.chars() { + if ch == '\n' { + buff.push_str("$\n"); + output.write_str(&buff)?; + line_width = 0; + continue; + } + + let mut char_buff = [0u8; 1]; + let out_str: Cow = match ch { + '\x07' => Cow::Borrowed(r"\a"), + '\x08' => Cow::Borrowed(r"\b"), + '\x0b' => Cow::Borrowed(r"\v"), + '\x0c' => Cow::Borrowed(r"\f"), + '\\' => Cow::Borrowed(r"\\"), + '\r' => Cow::Borrowed(r"\r"), + '\t' => Cow::Borrowed(r"\t"), + c if c.is_ascii_control() => Cow::Owned(format!("\\{:03o}", ch as u8)), + c if c == ' ' || c.is_ascii_graphic() => Cow::Borrowed(ch.encode_utf8(&mut char_buff)), + c if (c as u32) <= 0xFFFF => Cow::Owned(format!("\\u{:04X}", c as u32)), + _ => Cow::Owned(format!("\\U{:08X}", ch as u32)), + }; + + // See if folding is required before adding out_str and terminator. + let out_len = out_str.len(); + if line_width + out_len + 1 > max_width { + buff.push_str("\\\n"); + output.write_str(&buff)?; + line_width = 0; + buff.clear(); + } + buff.push_str(out_str.as_ref()); + line_width += out_len; + } + + if !buff.is_empty() { + buff.push_str("$\n"); + output.write_str(buff)?; + } + Ok(()) +} + +#[allow(clippy::cognitive_complexity)] /// Process a single input file fn process_file( commands: &Option>>, @@ -278,9 +434,7 @@ fn process_file( match command.code { '{' => { // Block begin; start processing the enclosed ones. - let CommandData::Block(body) = &command.data else { - panic!("Expected Block command data"); - }; + let body = extract_variant!(command, Block); current = body.clone(); continue; } @@ -288,13 +442,15 @@ fn process_file( // Block end: continue with the block's patched next. } 'a' => { - // TODO + // Write the text to standard output at a later point. + let text = extract_variant!(command, Text); + context + .append_elements + .push(AppendElement::Text(text.clone().into_owned())); } 'b' => { // Branch to the specified label or end if none is given. - let CommandData::BranchTarget(target) = &command.data else { - panic!("Expected BranchTarget command data"); - }; + let target = extract_variant!(command, BranchTarget); if target.is_some() { // New command to execute current = target.clone(); @@ -305,7 +461,14 @@ fn process_file( } } 'c' => { - // TODO + // At range end replace pattern space with text and + // start the next cycle. + pattern.clear(); + if command.addr2.is_none() || context.last_address || context.last_line { + let text = extract_variant!(command, Text); + output.write_str(text.as_ref())?; + } + break; } 'd' => { // Delete the pattern space and start the next cycle. @@ -348,15 +511,19 @@ fn process_file( context.hold.has_newline = pattern.is_newline_terminated(); } 'i' => { - // TODO + // Write text to standard output. + let text = extract_variant!(command, Text); + output.write_str(text.as_ref())?; } 'l' => { - // TODO + let width = *extract_variant!(command, Number); + list(output, &pattern, width)?; } 'n' => { break; } 'N' => { + flush_appends(output, context)?; // Append to pattern `\n` and the next line // Rather than reading input here, which would result // in a double borrow on reader, modify the action @@ -385,27 +552,37 @@ fn process_file( } } 'q' => { + // Quit after printing the pattern space. + set_exit_code(*extract_variant!(command, Number) as i32); context.stop_processing = true; break; } + 'Q' => { + // Quit immediatelly. + set_exit_code(*extract_variant!(command, Number) as i32); + context.stop_processing = true; + context.quiet = true; + break; + } 'r' => { - // TODO + // Copy the file to standard output at a later point. + let path = extract_variant!(command, Path); + context + .append_elements + .push(AppendElement::Path(path.clone())); } 's' => { let subst = match &mut command.data { CommandData::Substitution(subst) => subst, _ => panic!("Expected Substitution command data"), }; - substitute(&mut pattern, &mut *subst, context, output)?; } 't' if !context.substitution_made => { /* Do nothing. */ } 't' => { // Branch to the specified label or end if none is given // if a substitution was made since last cycle or t. - let CommandData::BranchTarget(target) = &command.data else { - panic!("Expected BranchTarget command data"); - }; + let target = extract_variant!(command, BranchTarget); context.substitution_made = false; if target.is_some() { // New command to execute @@ -417,7 +594,9 @@ fn process_file( } } 'w' => { - // TODO + // Append the pattern space to the specified file. + let writer = extract_variant!(command, NamedWriter); + writer.borrow_mut().write_line(pattern.as_str()?)?; } 'x' => { // Exchange the contents of the pattern and hold spaces. @@ -426,18 +605,15 @@ fn process_file( std::mem::swap(pat_has_newline, &mut context.hold.has_newline); } 'y' => { - let trans = match &mut command.data { - CommandData::Transliteration(trans) => trans, - _ => panic!("Expected Transliteration command data"), - }; - + let trans = extract_variant!(command, Transliteration); transliterate(&mut pattern, trans)?; } ':' => { // Branch target; do nothing. } '=' => { - // TODO + // Output current line number. + output.write_str(format!("{}\n", context.line_number))?; } // The compilation should supply only valid codes. _ => panic!("invalid command code"), @@ -450,6 +626,8 @@ fn process_file( write_chunk(output, context, &pattern)?; } + flush_appends(output, context)?; + if context.stop_processing { break; } @@ -474,27 +652,23 @@ fn process_file( pub fn process_all_files( commands: Option>>, files: Vec, - mut context: ProcessingContext, + context: &mut ProcessingContext, ) -> UResult<()> { context.unbuffered = context.unbuffered || io::stdout().is_terminal(); - let mut in_place = InPlace::new(context.clone())?; + let mut in_place = InPlace::new(context.clone()); let last_file_index = files.len() - 1; for (index, path) in files.iter().enumerate() { context.last_file = index == last_file_index; - let mut reader = LineReader::open(path).map_err(|e| { - USimpleError::new( - 2, - format!("Error opening input file {}: {}", path.display(), e), - ) - })?; + let mut reader = LineReader::open(path) + .map_err_context(|| format!("error opening input file {}", path.quote()))?; let output = in_place.begin(path)?; if context.separate { context.line_number = 0; } - process_file(&commands, &mut reader, output, &mut context)?; + process_file(&commands, &mut reader, output, context)?; // Handle any N command remains. if context.last_file && !context.separate && !context.quiet { diff --git a/src/uu/sed/src/script_line_provider.rs b/src/uu/sed/src/script_line_provider.rs index f287e982..d75eb9b9 100644 --- a/src/uu/sed/src/script_line_provider.rs +++ b/src/uu/sed/src/script_line_provider.rs @@ -12,7 +12,8 @@ use crate::command::ScriptValue; use std::fmt; use std::fs::File; use std::io::{self, BufRead, BufReader}; -use uucore::error::{UResult, USimpleError}; +use uucore::display::Quotable; +use uucore::error::{FromIo, UResult}; #[derive(Debug)] /// The provider of script lines across all specified scripts @@ -131,12 +132,8 @@ impl ScriptLineProvider { line_number: 0, }; } else { - let file = File::open(p).map_err(|e| { - USimpleError::new( - 2, - format!("Error opening script file {}: {}", p.display(), e), - ) - })?; + let file = File::open(p) + .map_err_context(|| format!("error opening script file {}", p.quote()))?; self.state = State::Active { index: next_index, reader: Box::new(BufReader::new(file)), diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 776edede..fa34a512 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -12,6 +12,7 @@ pub mod command; pub mod compiler; pub mod delimited_parser; pub mod fast_io; +pub mod fast_regex; pub mod in_place; pub mod named_writer; pub mod processor; @@ -22,7 +23,6 @@ use crate::command::{ProcessingContext, ScriptValue, StringSpace}; use crate::compiler::compile; use crate::processor::process_all_files; use clap::{Arg, ArgMatches, Command, arg}; -use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; use uucore::error::{UResult, UUsageError}; @@ -35,10 +35,10 @@ const USAGE: &str = "sed [OPTION]... [script] [file]..."; pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; let (scripts, files) = get_scripts_files(&matches)?; - let mut processing_context = build_context(&matches); + let mut context = build_context(&matches); - let executable = compile(scripts, &mut processing_context)?; - process_all_files(executable, files, processing_context)?; + let executable = compile(scripts, &mut context)?; + process_all_files(executable, files, &mut context)?; Ok(()) } @@ -202,12 +202,13 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { last_line: false, last_file: false, stop_processing: false, - saved_regex: const { RefCell::new(None) }, + saved_regex: None, input_action: None, hold: StringSpace::default(), parsed_block_nesting: 0, label_to_command_map: HashMap::new(), substitution_made: false, + append_elements: Vec::new(), } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 74517d81..20dac473 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -10,11 +10,16 @@ use std::fs; use std::io::{Read, Write}; + +#[cfg(unix)] +use assert_fs::fixture::{FileWriteStr, PathChild}; + use tempfile::NamedTempFile; use uutests::new_ucmd; use uutests::util::TestScenario; use uutests::util_name; +//////////////////////////////////////////////////////////// // Test application's invocation #[test] fn test_invalid_arg() { @@ -56,13 +61,11 @@ fn test_e_script_ok() { #[test] fn test_f_script_ok() { - let mut temp = NamedTempFile::new().expect("Failed to create temp file"); - writeln!(temp, "l").expect("Failed to write to temp file"); - let path = temp.path(); - - new_ucmd!().arg("-f").arg(path).succeeds(); + new_ucmd!().arg("-f").arg("script/hanoi.sed").succeeds(); } +//////////////////////////////////////////////////////////// +// Test simple I/O processing const INPUT_FILES: &[&str] = &[ "input/two-lines.txt", "input/no-new-line.txt", @@ -126,31 +129,26 @@ macro_rules! check_output { }; } +//////////////////////////////////////////////////////////// +// Individual command tests + // Input files const LINES1: &str = "input/lines1"; const LINES2: &str = "input/lines2"; const NO_NEW_LINE: &str = "input/no-new-line.txt"; -// Test address ranges +//////////////////////////////////////////////////////////// +// Address ranges: One and two, numeric and pattern check_output!(addr_one_line, ["-n", "-e", "4p", LINES1]); check_output!(addr_straddle, ["-n", "-e", "20p", LINES1, LINES2]); check_output!(addr_last_one_file, ["-n", "-e", "$p", LINES1]); check_output!(addr_last_two_files, ["-n", "-e", "$p", LINES1, LINES2]); -// TODO: Enable and configure for Unix/Windows, when "a" is implemented. -#[cfg(any())] -check_output!(addr_append_with_empty, ["-e", "$a\nhello", "/dev/null"]); +check_output!(addr_append_empty, ["-e", "$a\\\nhello", "input/empty"]); -#[cfg(unix)] check_output!( - addr_last_with_empty, - ["-n", "-e", "$p", LINES1, "/dev/null", LINES2] -); - -#[cfg(windows)] -check_output!( - addr_last_with_empty, - ["-n", "-e", "$p", LINES1, "NUL", LINES2] + addr_last_empty, + ["-n", "-e", "$p", LINES1, "input/empty", LINES2] ); check_output!(addr_past_last, ["-n", "-e", "20p", LINES1]); @@ -196,7 +194,8 @@ check_output!( check_output!(addr_empty_re_reuse, ["-n", "/_2/,//p", LINES1, LINES2]); check_output!(addr_simple_negation, ["-e", r"4,12!s/^/^/", LINES1]); -// Test substitutions +//////////////////////////////////////////////////////////// +// Substitution: s check_output!(subst_any, ["-e", r"s/./X/g", LINES1]); check_output!(subst_any_global, ["-e", r"s,.,X,g", LINES1]); check_output!(subst_escaped_magic_separator, ["-e", r"s.\..X.g", LINES1]); @@ -226,6 +225,9 @@ check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); check_output!(subst_case_insensitive, ["-e", r"s/L/Line/", LINES1]); check_output!(subst_no_new_line, ["-e", r"s/l/L/g", NO_NEW_LINE]); +check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]); +check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); +check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); #[test] fn subst_write_file() -> std::io::Result<()> { @@ -244,6 +246,8 @@ fn subst_write_file() -> std::io::Result<()> { Ok(()) } +//////////////////////////////////////////////////////////// +// Transliteration: y check_output!(trans_simple, ["-e", r"y/0123456789/9876543210/", LINES1]); check_output!( trans_delimiter, @@ -251,10 +255,10 @@ check_output!( ); check_output!(trans_no_new_line, ["-e", r"y/l/L/", NO_NEW_LINE]); check_output!(trans_newline, ["-e", r"1N;2y/\n/X/", LINES1]); -check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); -check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); -check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); +//////////////////////////////////////////////////////////// +// Pattern space manipulation: D, d, H, h, N, n, P, p, q, x +check_output!(pattern_print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); check_output!(pattern_delete_to_newline, ["-n", r"2N;3p;3D;3p", LINES1]); check_output!(pattern_delete_no_newline, ["-e", r"2D", LINES1]); @@ -278,7 +282,29 @@ check_output!(pattern_next_print_output, ["-e", r"4n;p", LINES1]); check_output!(pattern_next_print_no_output, ["-n", "-e", r"4n;p", LINES1]); check_output!(pattern_quit, [r"5q", LINES1]); check_output!(pattern_quit_2, [r"5q", LINES1, LINES2]); +check_output!(pattern_re_reuse, ["-n", r"/_1/p;//p", LINES1]); +check_output!(pattern_subst_re_reuse, ["-n", r"/_1/p;s//-N/p", LINES1]); +#[test] +fn test_quit_exit_code() { + new_ucmd!() + .args(&["5q 42", LINES1]) + .fails() + .code_is(42) + .stdout_is_fixture("output/pattern_quit"); +} + +#[test] +fn test_quit_now_exit_code() { + new_ucmd!() + .args(&["6Q 12", LINES1]) + .fails() + .code_is(12) + .stdout_is_fixture("output/pattern_quit"); +} + +//////////////////////////////////////////////////////////// +// Command blocks: {} check_output!( block_simple_range, [ @@ -379,6 +405,8 @@ b label3 ] ); +//////////////////////////////////////////////////////////// +// Branching: :, b, t check_output!( branch_conditional_simple, [ @@ -465,3 +493,407 @@ tb"#, LINES1 ] ); + +//////////////////////////////////////////////////////////// +// Text: a, c, i +check_output!( + text_insert_quit, + [ + "-e", + r#" +5i\ +hello +5q +"#, + LINES1 + ] +); + +check_output!( + text_insert_between_subst, + [ + "-n", + "-e", + r#" +s/^/before_i/p +20i\ +inserted +s/^/after_i/p +"#, + LINES1, + LINES2 + ] +); + +check_output!( + text_append_between_subst, + [ + "-n", + "-e", + r#" +5,12s/^/5-12/ +s/^/before_a/p +/5-12/a\ +appended +s/^/after_a/p +"#, + LINES1, + LINES2 + ] +); + +check_output!( + text_append_before_next, + [ + "-n", + "-e", + r#" +s/^/^/p +/l1_/a\ +appended +8,10N +s/$/$/p +"#, + LINES1, + LINES2 + ] +); + +check_output!( + text_change_global, + [ + "-n", + "-e", + r#" +c\ +hello +"#, + LINES1 + ] +); + +check_output!( + text_change_line, + [ + "-n", + "-e", + r#" +8c\ +hello +"#, + LINES1 + ] +); + +check_output!( + text_change_range, + [ + "-n", + "-e", + r#" +3,14c\ +hello +"#, + LINES1 + ] +); + +// SunOS and GNU sed behave differently. We follow POSIX. +check_output!( + text_change_reverse_range, + [ + "-n", + "-e", + r#" +8,3c\ +hello +"#, + LINES1 + ] +); + +check_output!(text_delete, ["d", LINES1]); + +// Check that the pattern space is deleted. +check_output!( + text_change_print, + [ + "-n", + "-e", + r#" +c\ +changed +p +"#, + LINES1 + ] +); + +//////////////////////////////////////////////////////////// +// r, w commands +check_output!(read_ok, [format!("4r {}", LINES2), LINES1.to_string()]); +check_output!(read_missing, ["5r /xyzzyxyzy42", LINES1]); +check_output!(read_empty, ["6r input/empty", LINES1]); + +#[test] +fn write_single_file() -> std::io::Result<()> { + let temp = NamedTempFile::new()?; + let cmd = format!("3,12w {}", temp.path().display()); + + new_ucmd!().args(&["-e", &cmd, LINES1]).succeeds(); + + let mut actual = String::new(); + temp.reopen()?.read_to_string(&mut actual)?; + + let expected = fs::read_to_string("tests/fixtures/sed/output/write_single_file")?; + assert_eq!(actual, expected, "Output did not match fixture"); + + Ok(()) +} + +#[test] +fn write_two_files() -> std::io::Result<()> { + let temp1 = NamedTempFile::new()?; + let temp2 = NamedTempFile::new()?; + let cmd = format!( + "3,12w {}\n1,2w {}", + temp1.path().display(), + temp2.path().display() + ); + + new_ucmd!().args(&["-e", &cmd, LINES1]).succeeds(); + + let mut actual = String::new(); + + temp1.reopen()?.read_to_string(&mut actual)?; + let expected = fs::read_to_string("tests/fixtures/sed/output/write_two_files_1")?; + assert_eq!(actual, expected, "Output 1 did not match fixture"); + + actual.clear(); + + temp2.reopen()?.read_to_string(&mut actual)?; + let expected = fs::read_to_string("tests/fixtures/sed/output/write_two_files_2")?; + assert_eq!(actual, expected, "Output 2 did not match fixture"); + + Ok(()) +} + +//////////////////////////////////////////////////////////// +// =, l commands +check_output!(number_continuous, ["/l2_/=", LINES1, LINES2]); +check_output!(number_separate, ["-s", "/l._8/=", LINES1, LINES2]); + +check_output!(list_ascii, ["-n", "l 60", "input/ascii"]); +check_output!(list_empty, ["-n", "l 60", "input/empty"]); +check_output!(list_unicode, ["l 60", "input/unicode"]); + +//////////////////////////////////////////////////////////// +// In-place editing +#[test] +fn in_place_edit_replace() -> std::io::Result<()> { + let mut temp = NamedTempFile::new()?; + writeln!(temp.as_file_mut(), "hello, world")?; + + // Get the file path before converting to TempPath + let path = temp.path().to_path_buf(); + + // Close temp file and preserve path + let temp_path = temp.into_temp_path(); + + // Call your tool on the path + new_ucmd!() + .args(&["-i", "-e", "s/world/universe/", path.to_str().unwrap()]) + .succeeds(); + + // Read the file using standard fs + let actual = std::fs::read_to_string(&path)?; + temp_path.close()?; // Clean up + + assert_eq!(actual, "hello, universe\n"); + Ok(()) +} + +#[test] +fn in_place_edit_backup() -> std::io::Result<()> { + let mut temp = NamedTempFile::new()?; + writeln!(temp.as_file_mut(), "hello, world")?; + + let path = temp.path().to_path_buf(); + let temp_path = temp.into_temp_path(); + + // Run the sed-like command with -i (in-place edit) + new_ucmd!() + .args(&[ + "-i", + ".bak", + "-e", + "s/world/universe/", + path.to_str().unwrap(), + ]) + .succeeds(); + + // Read edited file + let actual = std::fs::read_to_string(&path)?; + assert_eq!(actual, "hello, universe\n"); + + // Read backup file + let backup_path = path.with_file_name(format!( + "{}.bak", + path.file_name().unwrap().to_string_lossy() + )); + let backup = std::fs::read_to_string(&backup_path)?; + assert_eq!(backup, "hello, world\n"); + + temp_path.close()?; // Cleanup + std::fs::remove_file(backup_path)?; // Cleanup backup + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_follow_symlink_edits_target() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + new_ucmd!() + .args(&[ + "--follow-symlinks", + "-i", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + let actual = std::fs::read_to_string(target.path())?; + assert_eq!(actual, "hello, universe\n"); + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_symlink_replaced_when_not_following() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + // Run command without --follow-symlinks + new_ucmd!() + .args(&[ + "-i", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + // The original target should be untouched + let original = std::fs::read_to_string(target.path())?; + assert_eq!(original, "hello, world\n"); + + // The symlink path should now contain the edited content + let edited = std::fs::read_to_string(link.path())?; + assert_eq!(edited, "hello, universe\n"); + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_follow_symlink_with_backup() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + new_ucmd!() + .args(&[ + "--follow-symlinks", + "-i", + ".bak", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + // Verify target was modified + let edited = std::fs::read_to_string(target.path())?; + assert_eq!(edited, "hello, universe\n"); + + // Backup file is created alongside the target + let backup_path = target.path().with_file_name(format!( + "{}.bak", + target.file_name().unwrap().to_string_lossy() + )); + let backup = std::fs::read_to_string(&backup_path)?; + assert_eq!(backup, "hello, world\n"); + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_symlink_replaced_with_backup() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + new_ucmd!() + .args(&[ + "-i", + ".bak", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + // Target should remain untouched + let unchanged = std::fs::read_to_string(target.path())?; + assert_eq!(unchanged, "hello, world\n"); + + // Symlink path should now contain the updated content + let edited = std::fs::read_to_string(link.path())?; + assert_eq!(edited, "hello, universe\n"); + + // Backup of the symlink file (not the target) should exist + let backup_path = link.path().with_file_name(format!( + "{}.bak", + link.file_name().unwrap().to_string_lossy() + )); + let backup = std::fs::read_to_string(&backup_path)?; + assert_eq!(backup, "hello, world\n"); + + Ok(()) +} + +//////////////////////////////////////////////////////////// +// Large complex scripts + +// Math expression evaluation +check_output!(math1, ["-f", "script/math.sed", "input/expression1"]); + +// Calculate π (scaled) to several decimal places +check_output!(pi, ["-f", "script/math.sed", "input/pi"]); + +// Solve the Towers of Hanoi puzzle +check_output!(hanoi, ["-f", "script/hanoi.sed", "input/hanoi"]); diff --git a/tests/fixtures/sed/input/ascii b/tests/fixtures/sed/input/ascii new file mode 100644 index 00000000..68219ef6 Binary files /dev/null and b/tests/fixtures/sed/input/ascii differ diff --git a/tests/fixtures/sed/input/empty b/tests/fixtures/sed/input/empty new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/input/expression1 b/tests/fixtures/sed/input/expression1 new file mode 100644 index 00000000..4378eeb8 --- /dev/null +++ b/tests/fixtures/sed/input/expression1 @@ -0,0 +1 @@ +(2^10 + 5! - (12 % 5)) * 3 diff --git a/tests/fixtures/sed/input/hanoi b/tests/fixtures/sed/input/hanoi new file mode 100644 index 00000000..1f7fa813 --- /dev/null +++ b/tests/fixtures/sed/input/hanoi @@ -0,0 +1 @@ +:abcd: : : diff --git a/tests/fixtures/sed/input/pi b/tests/fixtures/sed/input/pi new file mode 100644 index 00000000..445c740a --- /dev/null +++ b/tests/fixtures/sed/input/pi @@ -0,0 +1 @@ +(104348 * 1000000000000000) / 33215 diff --git a/tests/fixtures/sed/input/unicode b/tests/fixtures/sed/input/unicode new file mode 100644 index 00000000..94b4dd52 --- /dev/null +++ b/tests/fixtures/sed/input/unicode @@ -0,0 +1 @@ +Hello World or Καλημέρα κόσμε or こんにちは 世界 😀 diff --git a/tests/fixtures/sed/output/addr_last_with_empty b/tests/fixtures/sed/output/addr_last_empty similarity index 100% rename from tests/fixtures/sed/output/addr_last_with_empty rename to tests/fixtures/sed/output/addr_last_empty diff --git a/tests/fixtures/sed/output/hanoi b/tests/fixtures/sed/output/hanoi new file mode 100644 index 00000000..ec39ea26 --- /dev/null +++ b/tests/fixtures/sed/output/hanoi @@ -0,0 +1,17 @@ +:abcd: : : +:abc : :d : +:ab :c :d : +:ab :cd : : +:a :cd :b : +:ad :c :b : +:ad : :bc : +:a : :bcd : +: :a :bcd : +: :ad :bc : +:c :ad :b : +:cd :a :b : +:cd :ab : : +:c :ab :d : +: :abc :d : +: :abcd: : +Done! Try another, or end with ^D. diff --git a/tests/fixtures/sed/output/list_ascii b/tests/fixtures/sed/output/list_ascii new file mode 100644 index 00000000..bc469d5b --- /dev/null +++ b/tests/fixtures/sed/output/list_ascii @@ -0,0 +1,6 @@ +\000\001\002\003\004\005\006\a\b\t$ +\v\f\r\016\017\020\021\022\023\024\025\026\027\030\031\032\ +\033\034\035\036\037 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEF\ +GHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\ +\177$ +$ diff --git a/tests/fixtures/sed/output/list_empty b/tests/fixtures/sed/output/list_empty new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/list_unicode b/tests/fixtures/sed/output/list_unicode new file mode 100644 index 00000000..f15bfa2c --- /dev/null +++ b/tests/fixtures/sed/output/list_unicode @@ -0,0 +1,4 @@ +Hello World or \u039A\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\ +\u03B1 \u03BA\u03CC\u03C3\u03BC\u03B5 or \u3053\u3093\u306B\ +\u3061\u306F \u4E16\u754C \U0001F600$ +Hello World or Καλημέρα κόσμε or こんにちは 世界 😀 diff --git a/tests/fixtures/sed/output/math1 b/tests/fixtures/sed/output/math1 new file mode 100644 index 00000000..767be6bb --- /dev/null +++ b/tests/fixtures/sed/output/math1 @@ -0,0 +1 @@ +3426 diff --git a/tests/fixtures/sed/output/number_continuous b/tests/fixtures/sed/output/number_continuous new file mode 100644 index 00000000..cb8d266f --- /dev/null +++ b/tests/fixtures/sed/output/number_continuous @@ -0,0 +1,32 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +15 +l2_1 +16 +l2_2 +17 +l2_3 +18 +l2_4 +19 +l2_5 +20 +l2_6 +21 +l2_7 +22 +l2_8 +23 +l2_9 diff --git a/tests/fixtures/sed/output/number_separate b/tests/fixtures/sed/output/number_separate new file mode 100644 index 00000000..df6ffdda --- /dev/null +++ b/tests/fixtures/sed/output/number_separate @@ -0,0 +1,25 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +8 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +8 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/print_to_newline b/tests/fixtures/sed/output/pattern_print_to_newline similarity index 100% rename from tests/fixtures/sed/output/print_to_newline rename to tests/fixtures/sed/output/pattern_print_to_newline diff --git a/tests/fixtures/sed/output/pattern_re_reuse b/tests/fixtures/sed/output/pattern_re_reuse new file mode 100644 index 00000000..3eef2361 --- /dev/null +++ b/tests/fixtures/sed/output/pattern_re_reuse @@ -0,0 +1,12 @@ +l1_1 +l1_1 +l1_10 +l1_10 +l1_11 +l1_11 +l1_12 +l1_12 +l1_13 +l1_13 +l1_14 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_subst_re_reuse b/tests/fixtures/sed/output/pattern_subst_re_reuse new file mode 100644 index 00000000..c8a099eb --- /dev/null +++ b/tests/fixtures/sed/output/pattern_subst_re_reuse @@ -0,0 +1,12 @@ +l1_1 +l1-N +l1_10 +l1-N0 +l1_11 +l1-N1 +l1_12 +l1-N2 +l1_13 +l1-N3 +l1_14 +l1-N4 diff --git a/tests/fixtures/sed/output/pi b/tests/fixtures/sed/output/pi new file mode 100644 index 00000000..06817077 --- /dev/null +++ b/tests/fixtures/sed/output/pi @@ -0,0 +1 @@ +3141592653921421 diff --git a/tests/fixtures/sed/output/read_empty b/tests/fixtures/sed/output/read_empty new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/output/read_empty @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/read_missing b/tests/fixtures/sed/output/read_missing new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/output/read_missing @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/read_ok b/tests/fixtures/sed/output/read_ok new file mode 100644 index 00000000..19a94616 --- /dev/null +++ b/tests/fixtures/sed/output/read_ok @@ -0,0 +1,23 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/subst_re_reuse b/tests/fixtures/sed/output/subst_re_reuse new file mode 100644 index 00000000..20a49d1c --- /dev/null +++ b/tests/fixtures/sed/output/subst_re_reuse @@ -0,0 +1,14 @@ +L1_1 +M1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/text_append_before_next b/tests/fixtures/sed/output/text_append_before_next new file mode 100644 index 00000000..ccfa1946 --- /dev/null +++ b/tests/fixtures/sed/output/text_append_before_next @@ -0,0 +1,56 @@ +^l1_1 +^l1_1$ +appended +^l1_2 +^l1_2$ +appended +^l1_3 +^l1_3$ +appended +^l1_4 +^l1_4$ +appended +^l1_5 +^l1_5$ +appended +^l1_6 +^l1_6$ +appended +^l1_7 +^l1_7$ +appended +^l1_8 +appended +^l1_8 +l1_9$ +^l1_10 +appended +^l1_10 +l1_11$ +^l1_12 +^l1_12$ +appended +^l1_13 +^l1_13$ +appended +^l1_14 +^l1_14$ +appended +^l2_1 +^l2_1$ +^l2_2 +^l2_2$ +^l2_3 +^l2_3$ +^l2_4 +^l2_4$ +^l2_5 +^l2_5$ +^l2_6 +^l2_6$ +^l2_7 +^l2_7$ +^l2_8 +^l2_8$ +^l2_9 +^l2_9$ diff --git a/tests/fixtures/sed/output/text_append_between_subst b/tests/fixtures/sed/output/text_append_between_subst new file mode 100644 index 00000000..4161c1c0 --- /dev/null +++ b/tests/fixtures/sed/output/text_append_between_subst @@ -0,0 +1,54 @@ +before_al1_1 +after_abefore_al1_1 +before_al1_2 +after_abefore_al1_2 +before_al1_3 +after_abefore_al1_3 +before_al1_4 +after_abefore_al1_4 +before_a5-12l1_5 +after_abefore_a5-12l1_5 +appended +before_a5-12l1_6 +after_abefore_a5-12l1_6 +appended +before_a5-12l1_7 +after_abefore_a5-12l1_7 +appended +before_a5-12l1_8 +after_abefore_a5-12l1_8 +appended +before_a5-12l1_9 +after_abefore_a5-12l1_9 +appended +before_a5-12l1_10 +after_abefore_a5-12l1_10 +appended +before_a5-12l1_11 +after_abefore_a5-12l1_11 +appended +before_a5-12l1_12 +after_abefore_a5-12l1_12 +appended +before_al1_13 +after_abefore_al1_13 +before_al1_14 +after_abefore_al1_14 +before_al2_1 +after_abefore_al2_1 +before_al2_2 +after_abefore_al2_2 +before_al2_3 +after_abefore_al2_3 +before_al2_4 +after_abefore_al2_4 +before_al2_5 +after_abefore_al2_5 +before_al2_6 +after_abefore_al2_6 +before_al2_7 +after_abefore_al2_7 +before_al2_8 +after_abefore_al2_8 +before_al2_9 +after_abefore_al2_9 diff --git a/tests/fixtures/sed/output/text_change_global b/tests/fixtures/sed/output/text_change_global new file mode 100644 index 00000000..94e9a6d3 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_global @@ -0,0 +1,14 @@ +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello diff --git a/tests/fixtures/sed/output/text_change_line b/tests/fixtures/sed/output/text_change_line new file mode 100644 index 00000000..ce013625 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_line @@ -0,0 +1 @@ +hello diff --git a/tests/fixtures/sed/output/text_change_print b/tests/fixtures/sed/output/text_change_print new file mode 100644 index 00000000..78db2a52 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_print @@ -0,0 +1,14 @@ +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed diff --git a/tests/fixtures/sed/output/text_change_range b/tests/fixtures/sed/output/text_change_range new file mode 100644 index 00000000..ce013625 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_range @@ -0,0 +1 @@ +hello diff --git a/tests/fixtures/sed/output/text_change_reverse_range b/tests/fixtures/sed/output/text_change_reverse_range new file mode 100644 index 00000000..ce013625 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_reverse_range @@ -0,0 +1 @@ +hello diff --git a/tests/fixtures/sed/output/text_delete b/tests/fixtures/sed/output/text_delete new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/text_insert_between_subst b/tests/fixtures/sed/output/text_insert_between_subst new file mode 100644 index 00000000..455093ce --- /dev/null +++ b/tests/fixtures/sed/output/text_insert_between_subst @@ -0,0 +1,47 @@ +before_il1_1 +after_ibefore_il1_1 +before_il1_2 +after_ibefore_il1_2 +before_il1_3 +after_ibefore_il1_3 +before_il1_4 +after_ibefore_il1_4 +before_il1_5 +after_ibefore_il1_5 +before_il1_6 +after_ibefore_il1_6 +before_il1_7 +after_ibefore_il1_7 +before_il1_8 +after_ibefore_il1_8 +before_il1_9 +after_ibefore_il1_9 +before_il1_10 +after_ibefore_il1_10 +before_il1_11 +after_ibefore_il1_11 +before_il1_12 +after_ibefore_il1_12 +before_il1_13 +after_ibefore_il1_13 +before_il1_14 +after_ibefore_il1_14 +before_il2_1 +after_ibefore_il2_1 +before_il2_2 +after_ibefore_il2_2 +before_il2_3 +after_ibefore_il2_3 +before_il2_4 +after_ibefore_il2_4 +before_il2_5 +after_ibefore_il2_5 +before_il2_6 +inserted +after_ibefore_il2_6 +before_il2_7 +after_ibefore_il2_7 +before_il2_8 +after_ibefore_il2_8 +before_il2_9 +after_ibefore_il2_9 diff --git a/tests/fixtures/sed/output/text_insert_quit b/tests/fixtures/sed/output/text_insert_quit new file mode 100644 index 00000000..f15f6e3e --- /dev/null +++ b/tests/fixtures/sed/output/text_insert_quit @@ -0,0 +1,6 @@ +l1_1 +l1_2 +l1_3 +l1_4 +hello +l1_5 diff --git a/tests/fixtures/sed/output/write_single_file b/tests/fixtures/sed/output/write_single_file new file mode 100644 index 00000000..d90ffa1e --- /dev/null +++ b/tests/fixtures/sed/output/write_single_file @@ -0,0 +1,10 @@ +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 diff --git a/tests/fixtures/sed/output/write_two_files_1 b/tests/fixtures/sed/output/write_two_files_1 new file mode 100644 index 00000000..d90ffa1e --- /dev/null +++ b/tests/fixtures/sed/output/write_two_files_1 @@ -0,0 +1,10 @@ +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 diff --git a/tests/fixtures/sed/output/write_two_files_2 b/tests/fixtures/sed/output/write_two_files_2 new file mode 100644 index 00000000..4c8acc52 --- /dev/null +++ b/tests/fixtures/sed/output/write_two_files_2 @@ -0,0 +1,2 @@ +l1_1 +l1_2 diff --git a/tests/fixtures/sed/script/http-log-redact.sed b/tests/fixtures/sed/script/http-log-redact.sed new file mode 100644 index 00000000..600ae9f4 --- /dev/null +++ b/tests/fixtures/sed/script/http-log-redact.sed @@ -0,0 +1,4 @@ +#!/ +s/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/[REDACTED_IP]/; +s/\[[0-9]{2}\/[A-Za-z]{3}\/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2} [+-][0-9]{4}\]/[TIMESTAMP]/; +s/"Mozilla[^"]*"/"[UA]"/ diff --git a/util/bench-compare.sh b/util/bench-compare.sh new file mode 100755 index 00000000..7e829c60 --- /dev/null +++ b/util/bench-compare.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# +# Compare the results of two benchmarks reporting fastest for each task +# + + +set -eu + +if [ $# -ne 2 ] ; then + echo "Usage: $(basename $0) results-1 results-2" 1>&2 + + exit 1 +fi + +paste -d , "$1" "$2" | + sed "s/^/$1,$2,/" | + + awk -F, 'FNR != 1 { + printf("%22s ", $3); + if (!$4) + printf("%12s encountered an error\n", $1); + else if (!$12) + printf("%12s encountered an error\n", $2); + else if ($4 / $12 > 1.01) + printf("%12s is %5.2f times faster than %s\n", $2, $4 / $12, $1); + else if ($12 / $4 > 1.01) + printf("%12s is %5.2f times faster than %s\n", $1, $12 / $4, $2); + else + printf("%12s is similarly fast as %s\n", $1, $2); + }' diff --git a/util/benchmark.sh b/util/benchmark.sh new file mode 100755 index 00000000..ebbb02aa --- /dev/null +++ b/util/benchmark.sh @@ -0,0 +1,141 @@ +#!/bin/sh + +set -eu + +if [ $# -ne 2 ] ; then + cat <&2 +Usage: $(basename $0) sed-command output-file" + +Examples: +$(basename $0) 'target/release/sedapp sed' rust-sed +$(basename $0) /usr/bin/sed gnu-sed +EOF + exit 1 +fi + +PROG="$1" +OUT="$2" + +SCRIPTS=tests/fixtures/sed/script + + +# Run hyperfine with the specified name and command and collect the results. +bench_run() +{ + if hyperfine --command-name "$1" --warmup 2 --export-csv out.csv "$2" ; then + # Output the results sans-heading. + sed 1d out.csv >>"$OUT" + else + # Unable to run; output a named empty record. + echo "$1,,,,,,," >>"$OUT" + fi + + rm out.csv +} + +# Shared heading +echo 'command,mean,stddev,median,user,system,min,max' >"$OUT" + +# Short line processing +awk 'BEGIN { for (i = 0; i < 50000000; i++) { print i } }' > lines.txt + +# No operation +bench_run no-op-short "$PROG '' lines.txt" + +# Log file processing + +# Create an access.log file with the specified number of lines +create_access_log() +{ + awk 'BEGIN { + for (i = 0; i < 5000000; i++) { + printf("192.168.%d.%d - - [01/Jan/2024:00:00:00 +0000] \"GET /index.html HTTP/1.1\" 200 1234 \"-\" \"Mozilla/5.0\"\n", int(i/256)%256, i%256); + } + }' > access.log +} + +# Create long file for simple commands +create_access_log 5000000 + +# No operation +bench_run access-log-no-op "$PROG '' access.log" + +# No substitution +bench_run access-log-no-subst "$PROG s/Chrome/Chromium/ access.log" + +# Substitution +bench_run access-log-subst "$PROG s/Mozilla/Chromium/ access.log" + +# No deletion +bench_run access-log-no-del "$PROG /Chrome/d access.log" + +# Full deletion +bench_run access-log-all-del "$PROG /Mozilla/d access.log" + +# Create shorter file for more complex commands +create_access_log 50000 + +# Transliteration +bench_run access-log-translit "$PROG y/0123456789/9876543210/ access.log" + +# Multiple substitutions +bench_run access-log-complex-sub "$PROG -f $SCRIPTS/http-log-redact.sed access.log" + +rm access.log + +# Remove \r +awk 'BEGIN { + for (i = 0; i < 1500000; i++) { + printf("line %d with windows endings\r\n", i); + } +}' > legacy_input.txt + +bench_run remove-cr "$PROG 's/\r$//' legacy_input.txt" + +rm legacy_input.txt + +# Genomic data cleanup + +awk 'BEGIN { + for (i = 0; i < 1000000; i++) { + chr = "chr" (1 + i % 22); + printf("%s\t%d\t%s\t.\t%s\t.\t.\n", chr, i * 100, (i % 2 ? "A" : "T"), (i % 2 ? "G" : "C")); + } +}' > genome.tsv + +CMD='/^#/d; s/\t\./\tNA/g; s/\.$/NA/' +bench_run genome-subst "$PROG '$CMD' genome.tsv" + +rm -f genome.tsv + +# Number fixups: remove thousands separator, change , into . +awk 'BEGIN { + for (i = 0; i < 700000; i++) { + euros = int(i % 10000); + cents = int(i % 100); + thousands = int(euros / 1000); + remainder = euros % 1000; + printf("%d.%03d,%02d\n", thousands, remainder, cents); + } +}' > finance.csv + +CMD='s/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g' +bench_run number-fix "$PROG '$CMD' finance.csv" + +rm -f finance.csv + +# Long script compilation +for i in $(seq 1 99) ; do + awk -v tag="$i" '{ gsub(/[tb:] [[:alnum:]_]+/, "&" tag); print }' $SCRIPTS/math.sed +done >long_script.sed + +bench_run long-script "echo -n '' | $PROG -f long_script.sed " + +rm long_script.sed + + +# Towers of Hanoi +bench_run hanoi "echo ':abcdefghijkl: : :' | $PROG -f $SCRIPTS/hanoi.sed" + +# Arbitrary precision arithmetic +bench_run factorial "echo 30\! | $PROG -f $SCRIPTS/math.sed"