From 09d4693cdb198ddd3eaa6ff080a871e28462ab50 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 11:41:48 +0300 Subject: [PATCH 01/42] Improve label name compliance According to POSIX, labels consist of portable filename characters. --- src/uu/sed/src/compiler.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 3b5d30fa..5ff0a1d1 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -958,11 +958,17 @@ fn compile_label_command( line: &mut ScriptCharProvider, cmd: &mut Command, ) -> UResult<()> { + /// Return true if `c` is in the POSIX portable filename character set. + fn is_portable_filename_char(c: char) -> bool { + c.is_ascii_alphanumeric() // A–Z, a–z, 0–9 + || matches!(c, '.' | '_' | '-') + } + line.advance(); // Skip the command character line.eat_spaces(); // Skip any leading whitespace let mut label = String::new(); - while !line.eol() && line.current().is_ascii_alphanumeric() { + while !line.eol() && is_portable_filename_char(line.current()) { label.push(line.current()); line.advance(); } From 826ebf9fd46061f26bd9114a0e3164840064b240 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 12:41:46 +0300 Subject: [PATCH 02/42] Support the compilation of text (aci) commands --- src/uu/sed/src/command.rs | 1 + src/uu/sed/src/compiler.rs | 154 +++++++++++++++++++++++++++++++++---- 2 files changed, 139 insertions(+), 16 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index af673f19..21f0d09a 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -291,6 +291,7 @@ pub enum CommandData { Label(Option), // Label name for 'b', 't', ':' NamedWriter(Box), // File descriptor for 'w' Substitution(Box), // Substitute command 's' + Text(String), // Text for 'a', 'c', 'i' Transliteration(Box), // Transliteration command 'y' } diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 5ff0a1d1..7a36bd25 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -986,6 +986,51 @@ fn compile_label_command( parse_command_ending(lines, line, cmd) } +fn compile_text_command( + lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult<()> { + line.advance(); // Skip the command character. + + line.eat_spaces(); // Skip any leading whitespace. + if line.eol() || line.current() != '\\' { + return compilation_error( + lines, + line, + format!("command `{}' expects \\ followed by text", cmd.code), + ); + } + + line.advance(); // Skip \. + line.eat_spaces(); // Skip any whitespace at the end of \. + if !line.eol() { + return compilation_error( + lines, + line, + format!( + "extra characters after \\ at the end of `{}' command", + cmd.code + ), + ); + } + + let mut text = String::new(); + while let Some(line) = lines.next_line()? { + if line.ends_with('\\') { + // Line ends with \ to escape \n; remove the trailing \. + text.push_str(&line[..line.len() - 1]); + text.push('\n'); + } else { + text.push_str(&line); + text.push('\n'); + break; + } + } + cmd.data = CommandData::Text(text); + Ok(()) +} + // Compile the specified command fn compile_command( lines: &mut ScriptLineProvider, @@ -1027,9 +1072,11 @@ fn compile_command( // y return compile_trans_command(lines, line, &mut cmd); } - // TODO - CommandArgs::Text => { // a c i + CommandArgs::Text => { + // a c i + compile_text_command(lines, line, &mut cmd)?; } + // TODO CommandArgs::ReadFile => { // r } CommandArgs::WriteFile => { // w @@ -1081,12 +1128,25 @@ fn lookup_command(cmd: char) -> Option<&'static CommandSpec> { mod tests { use super::*; + // Return an empty line provider and a char provider for the specified str. fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { let lines = ScriptLineProvider::new(vec![]); // Empty for tests let line = ScriptCharProvider::new(input); (lines, line) } + fn make_line_provider(lines: &[&str]) -> ScriptLineProvider { + let input = lines + .iter() + .map(|s| ScriptValue::StringVal(s.to_string())) + .collect(); + ScriptLineProvider::new(input) + } + + fn make_char_provider(input: &str) -> ScriptCharProvider { + ScriptCharProvider::new(input) + } + /// Return a default ProcessingContext for use in tests. pub fn ctx() -> ProcessingContext { ProcessingContext::default() @@ -1575,21 +1635,13 @@ mod tests { } // compile_sequence - fn make_provider(lines: &[&str]) -> ScriptLineProvider { - let input = lines - .iter() - .map(|s| ScriptValue::StringVal(s.to_string())) - .collect(); - ScriptLineProvider::new(input) - } - fn empty_line() -> ScriptCharProvider { ScriptCharProvider::new("") } #[test] fn test_compile_sequence_empty_input() { - let mut provider = make_provider(&[]); + let mut provider = make_line_provider(&[]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1598,7 +1650,7 @@ mod tests { #[test] fn test_compile_sequence_comment_only() { - let mut provider = make_provider(&["# comment", " ", ";;"]); + let mut provider = make_line_provider(&["# comment", " ", ";;"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1607,7 +1659,7 @@ mod tests { #[test] fn test_compile_sequence_single_command() { - let mut provider = make_provider(&["42q"]); + let mut provider = make_line_provider(&["42q"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1631,7 +1683,7 @@ mod tests { #[test] fn test_compile_sequence_non_selected_single_command() { - let mut provider = make_provider(&["42!p"]); + let mut provider = make_line_provider(&["42!p"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1655,7 +1707,7 @@ mod tests { #[test] fn test_compile_sequence_multiple_lines() { - let mut provider = make_provider(&["1q", "2d"]); + let mut provider = make_line_provider(&["1q", "2d"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -1671,7 +1723,7 @@ mod tests { #[test] fn test_compile_sequence_single_line_multiple_commands() { - let mut provider = make_provider(&["1q;2d"]); + let mut provider = make_line_provider(&["1q;2d"]); let mut opts = ctx(); let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); @@ -2345,4 +2397,74 @@ mod tests { _ => panic!("Expected BranchTarget(Some(...))"), } } + + // compile_text_command + #[test] + fn test_compile_single_line_text_command() { + let mut chars = make_char_provider("a\\"); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + compile_text_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Text(text) => { + assert_eq!(text, "line1\n"); + } + _ => panic!("Expected CommandData::Text"), + } + } + + #[test] + fn test_compile_spaces_single_line_text_command() { + let mut chars = make_char_provider("a \\ "); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + compile_text_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Text(text) => { + assert_eq!(text, "line1\n"); + } + _ => panic!("Expected CommandData::Text"), + } + } + + #[test] + fn test_compile_two_line_text_command() { + let mut chars = make_char_provider("a\\"); + let mut lines = make_line_provider(&["line1\\", "line2"]); + let mut cmd = Command::default(); + + compile_text_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Text(text) => { + assert_eq!(text, "line1\nline2\n"); + } + _ => panic!("Expected CommandData::Text"), + } + } + + #[test] + fn test_compile_text_command_without_backslash() { + let mut chars = make_char_provider("a"); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + let result = compile_text_command(&mut lines, &mut chars, &mut cmd); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("expects \\ followed by text")); + } + + #[test] + fn test_compile_text_command_with_trailing_chars() { + let mut chars = make_char_provider("a \\ foo"); + let mut lines = make_line_provider(&["line1", "line2"]); + let mut cmd = Command::default(); + + let result = compile_text_command(&mut lines, &mut chars, &mut cmd); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("extra characters after \\")); + } } From 858c3a6384d6d0edf10d62286d55f1ee89125602 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 13:08:11 +0300 Subject: [PATCH 03/42] Implement the i command --- src/uu/sed/src/processor.rs | 7 ++++++- tests/by-util/test_sed.rs | 13 +++++++++++++ tests/fixtures/sed/output/text_insert_quit | 6 ++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/sed/output/text_insert_quit diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index fcbc143f..38ad9715 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -348,7 +348,12 @@ fn process_file( context.hold.has_newline = pattern.is_newline_terminated(); } 'i' => { - // TODO + // Write text to standard output. + let text = match &command.data { + CommandData::Text(text) => text, + _ => panic!("Expected Text command data"), + }; + output.write_str(text)?; } 'l' => { // TODO diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 74517d81..6222c737 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -465,3 +465,16 @@ tb"#, LINES1 ] ); + +check_output!( + text_insert_quit, + [ + "-e", + r#" +5i\ +hello +5q +"#, + LINES1 + ] +); diff --git a/tests/fixtures/sed/output/text_insert_quit b/tests/fixtures/sed/output/text_insert_quit new file mode 100644 index 00000000..f15f6e3e --- /dev/null +++ b/tests/fixtures/sed/output/text_insert_quit @@ -0,0 +1,6 @@ +l1_1 +l1_2 +l1_3 +l1_4 +hello +l1_5 From 6afd775ac4e1058d5c0ae76a0329ea0e0295f724 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 19:06:26 +0300 Subject: [PATCH 04/42] Implement a c i commands --- src/uu/sed/src/command.rs | 12 +- src/uu/sed/src/compiler.rs | 3 +- src/uu/sed/src/fast_io.rs | 10 ++ src/uu/sed/src/processor.rs | 46 ++++++- src/uu/sed/src/sed.rs | 1 + tests/by-util/test_sed.rs | 120 ++++++++++++++++++ .../sed/output/text_append_before_next | 56 ++++++++ .../sed/output/text_append_between_subst | 54 ++++++++ tests/fixtures/sed/output/text_change_global | 14 ++ tests/fixtures/sed/output/text_change_line | 1 + tests/fixtures/sed/output/text_change_print | 14 ++ tests/fixtures/sed/output/text_change_range | 1 + .../sed/output/text_change_reverse_range | 1 + tests/fixtures/sed/output/text_delete | 0 .../sed/output/text_insert_between_subst | 47 +++++++ 15 files changed, 373 insertions(+), 7 deletions(-) create mode 100644 tests/fixtures/sed/output/text_append_before_next create mode 100644 tests/fixtures/sed/output/text_append_between_subst create mode 100644 tests/fixtures/sed/output/text_change_global create mode 100644 tests/fixtures/sed/output/text_change_line create mode 100644 tests/fixtures/sed/output/text_change_print create mode 100644 tests/fixtures/sed/output/text_change_range create mode 100644 tests/fixtures/sed/output/text_change_reverse_range create mode 100644 tests/fixtures/sed/output/text_delete create mode 100644 tests/fixtures/sed/output/text_insert_between_subst diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 21f0d09a..6526797b 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -15,6 +15,7 @@ use crate::named_writer::NamedWriter; use regex::Captures; use regex::Regex; +use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; // For file descriptors and equivalent @@ -65,6 +66,15 @@ pub struct ProcessingContext { pub label_to_command_map: HashMap>>, /// True if a substitution was made as specified in the t command pub substitution_made: bool, + /// Elements to append at the end of each command processing cycle + pub append_elements: Vec, +} + +#[derive(Clone, Debug)] +/// Elements that shall be appended at the end of each command processing cycle +pub enum AppendElement { + Text(String), // The specified text string + File(PathBuf), // The contents of the specified file } #[derive(Clone, Debug, Default, PartialEq)] @@ -291,7 +301,7 @@ pub enum CommandData { Label(Option), // Label name for 'b', 't', ':' NamedWriter(Box), // File descriptor for 'w' Substitution(Box), // Substitute command 's' - Text(String), // Text for 'a', 'c', 'i' + Text(Cow<'static, str>), // Text for 'a', 'c', 'i' Transliteration(Box), // Transliteration command 'y' } diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 7a36bd25..337c662a 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -19,6 +19,7 @@ use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use regex::Regex; +use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; use std::mem; @@ -1027,7 +1028,7 @@ fn compile_text_command( break; } } - cmd.data = CommandData::Text(text); + cmd.data = CommandData::Text(Cow::Owned(text)); Ok(()) } diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index cb46c7f5..36bb53e9 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -469,6 +469,16 @@ impl OutputBuffer { false, ))) } + + /// Copy the specified file to the output. + pub fn copy_file(&mut self, path: &PathBuf) -> io::Result<()> { + #[cfg(unix)] + self.flush_mmap()?; // Flush mmap writes, if any. + let file = File::open(path)?; + let mut reader = BufReader::new(file); + io::copy(&mut reader, &mut self.out)?; + Ok(()) + } } #[cfg(unix)] diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 38ad9715..47c40ea5 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -9,8 +9,8 @@ // file that was distributed with this source code. use crate::command::{ - Address, AddressType, AddressValue, Command, CommandData, InputAction, ProcessingContext, - Substitution, Transliteration, + Address, AddressType, AddressValue, AppendElement, Command, CommandData, InputAction, + ProcessingContext, Substitution, Transliteration, }; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; @@ -237,6 +237,22 @@ fn transliterate(pattern: &mut IOChunk, trans: &Transliteration) -> UResult<()> Ok(()) } +/// Output any data queued for output at the end of the cycle. +fn flush_appends(output: &mut OutputBuffer, context: &mut ProcessingContext) -> UResult<()> { + for elem in &context.append_elements { + match elem { + AppendElement::Text(text) => { + output.write_str(text.clone())?; + } + AppendElement::File(path) => { + output.copy_file(path)?; + } + } + } + context.append_elements.clear(); + Ok(()) +} + /// Process a single input file fn process_file( commands: &Option>>, @@ -288,7 +304,14 @@ fn process_file( // Block end: continue with the block's patched next. } 'a' => { - // TODO + // Write the text to standard output at a later point. + let text = match &command.data { + CommandData::Text(text) => text, + _ => panic!("Expected Text command data"), + }; + context + .append_elements + .push(AppendElement::Text(text.clone().into_owned())); } 'b' => { // Branch to the specified label or end if none is given. @@ -305,7 +328,17 @@ fn process_file( } } 'c' => { - // TODO + // At range end replace pattern space with text and + // start the next cycle. + pattern.clear(); + if command.addr2.is_none() || context.last_address || context.last_line { + let text = match &command.data { + CommandData::Text(text) => text, + _ => panic!("Expected Text command data"), + }; + output.write_str(text.as_ref())?; + } + break; } 'd' => { // Delete the pattern space and start the next cycle. @@ -353,7 +386,7 @@ fn process_file( CommandData::Text(text) => text, _ => panic!("Expected Text command data"), }; - output.write_str(text)?; + output.write_str(text.as_ref())?; } 'l' => { // TODO @@ -362,6 +395,7 @@ fn process_file( break; } 'N' => { + flush_appends(output, context)?; // Append to pattern `\n` and the next line // Rather than reading input here, which would result // in a double borrow on reader, modify the action @@ -455,6 +489,8 @@ fn process_file( write_chunk(output, context, &pattern)?; } + flush_appends(output, context)?; + if context.stop_processing { break; } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 776edede..f81daf31 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -208,6 +208,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { parsed_block_nesting: 0, label_to_command_map: HashMap::new(), substitution_made: false, + append_elements: Vec::new(), } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 6222c737..63ca3df3 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -478,3 +478,123 @@ hello LINES1 ] ); + +check_output!( + text_insert_between_subst, + [ + "-n", + "-e", + r#" +s/^/before_i/p +20i\ +inserted +s/^/after_i/p +"#, + LINES1, + LINES2 + ] +); + +check_output!( + text_append_between_subst, + [ + "-n", + "-e", + r#" +5,12s/^/5-12/ +s/^/before_a/p +/5-12/a\ +appended +s/^/after_a/p +"#, + LINES1, + LINES2 + ] +); + +check_output!( + text_append_before_next, + [ + "-n", + "-e", + r#" +s/^/^/p +/l1_/a\ +appended +8,10N +s/$/$/p +"#, + LINES1, + LINES2 + ] +); + +check_output!( + text_change_global, + [ + "-n", + "-e", + r#" +c\ +hello +"#, + LINES1 + ] +); + +check_output!( + text_change_line, + [ + "-n", + "-e", + r#" +8c\ +hello +"#, + LINES1 + ] +); + +check_output!( + text_change_range, + [ + "-n", + "-e", + r#" +3,14c\ +hello +"#, + LINES1 + ] +); + +// SunOS and GNU sed behave differently. We follow POSIX. +check_output!( + text_change_reverse_range, + [ + "-n", + "-e", + r#" +8,3c\ +hello +"#, + LINES1 + ] +); + +check_output!(text_delete, ["d", LINES1]); + +// Check that the pattern space is deleted. +check_output!( + text_change_print, + [ + "-n", + "-e", + r#" +c\ +changed +p +"#, + LINES1 + ] +); diff --git a/tests/fixtures/sed/output/text_append_before_next b/tests/fixtures/sed/output/text_append_before_next new file mode 100644 index 00000000..ccfa1946 --- /dev/null +++ b/tests/fixtures/sed/output/text_append_before_next @@ -0,0 +1,56 @@ +^l1_1 +^l1_1$ +appended +^l1_2 +^l1_2$ +appended +^l1_3 +^l1_3$ +appended +^l1_4 +^l1_4$ +appended +^l1_5 +^l1_5$ +appended +^l1_6 +^l1_6$ +appended +^l1_7 +^l1_7$ +appended +^l1_8 +appended +^l1_8 +l1_9$ +^l1_10 +appended +^l1_10 +l1_11$ +^l1_12 +^l1_12$ +appended +^l1_13 +^l1_13$ +appended +^l1_14 +^l1_14$ +appended +^l2_1 +^l2_1$ +^l2_2 +^l2_2$ +^l2_3 +^l2_3$ +^l2_4 +^l2_4$ +^l2_5 +^l2_5$ +^l2_6 +^l2_6$ +^l2_7 +^l2_7$ +^l2_8 +^l2_8$ +^l2_9 +^l2_9$ diff --git a/tests/fixtures/sed/output/text_append_between_subst b/tests/fixtures/sed/output/text_append_between_subst new file mode 100644 index 00000000..4161c1c0 --- /dev/null +++ b/tests/fixtures/sed/output/text_append_between_subst @@ -0,0 +1,54 @@ +before_al1_1 +after_abefore_al1_1 +before_al1_2 +after_abefore_al1_2 +before_al1_3 +after_abefore_al1_3 +before_al1_4 +after_abefore_al1_4 +before_a5-12l1_5 +after_abefore_a5-12l1_5 +appended +before_a5-12l1_6 +after_abefore_a5-12l1_6 +appended +before_a5-12l1_7 +after_abefore_a5-12l1_7 +appended +before_a5-12l1_8 +after_abefore_a5-12l1_8 +appended +before_a5-12l1_9 +after_abefore_a5-12l1_9 +appended +before_a5-12l1_10 +after_abefore_a5-12l1_10 +appended +before_a5-12l1_11 +after_abefore_a5-12l1_11 +appended +before_a5-12l1_12 +after_abefore_a5-12l1_12 +appended +before_al1_13 +after_abefore_al1_13 +before_al1_14 +after_abefore_al1_14 +before_al2_1 +after_abefore_al2_1 +before_al2_2 +after_abefore_al2_2 +before_al2_3 +after_abefore_al2_3 +before_al2_4 +after_abefore_al2_4 +before_al2_5 +after_abefore_al2_5 +before_al2_6 +after_abefore_al2_6 +before_al2_7 +after_abefore_al2_7 +before_al2_8 +after_abefore_al2_8 +before_al2_9 +after_abefore_al2_9 diff --git a/tests/fixtures/sed/output/text_change_global b/tests/fixtures/sed/output/text_change_global new file mode 100644 index 00000000..94e9a6d3 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_global @@ -0,0 +1,14 @@ +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello +hello diff --git a/tests/fixtures/sed/output/text_change_line b/tests/fixtures/sed/output/text_change_line new file mode 100644 index 00000000..ce013625 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_line @@ -0,0 +1 @@ +hello diff --git a/tests/fixtures/sed/output/text_change_print b/tests/fixtures/sed/output/text_change_print new file mode 100644 index 00000000..78db2a52 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_print @@ -0,0 +1,14 @@ +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed +changed diff --git a/tests/fixtures/sed/output/text_change_range b/tests/fixtures/sed/output/text_change_range new file mode 100644 index 00000000..ce013625 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_range @@ -0,0 +1 @@ +hello diff --git a/tests/fixtures/sed/output/text_change_reverse_range b/tests/fixtures/sed/output/text_change_reverse_range new file mode 100644 index 00000000..ce013625 --- /dev/null +++ b/tests/fixtures/sed/output/text_change_reverse_range @@ -0,0 +1 @@ +hello diff --git a/tests/fixtures/sed/output/text_delete b/tests/fixtures/sed/output/text_delete new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/text_insert_between_subst b/tests/fixtures/sed/output/text_insert_between_subst new file mode 100644 index 00000000..455093ce --- /dev/null +++ b/tests/fixtures/sed/output/text_insert_between_subst @@ -0,0 +1,47 @@ +before_il1_1 +after_ibefore_il1_1 +before_il1_2 +after_ibefore_il1_2 +before_il1_3 +after_ibefore_il1_3 +before_il1_4 +after_ibefore_il1_4 +before_il1_5 +after_ibefore_il1_5 +before_il1_6 +after_ibefore_il1_6 +before_il1_7 +after_ibefore_il1_7 +before_il1_8 +after_ibefore_il1_8 +before_il1_9 +after_ibefore_il1_9 +before_il1_10 +after_ibefore_il1_10 +before_il1_11 +after_ibefore_il1_11 +before_il1_12 +after_ibefore_il1_12 +before_il1_13 +after_ibefore_il1_13 +before_il1_14 +after_ibefore_il1_14 +before_il2_1 +after_ibefore_il2_1 +before_il2_2 +after_ibefore_il2_2 +before_il2_3 +after_ibefore_il2_3 +before_il2_4 +after_ibefore_il2_4 +before_il2_5 +after_ibefore_il2_5 +before_il2_6 +inserted +after_ibefore_il2_6 +before_il2_7 +after_ibefore_il2_7 +before_il2_8 +after_ibefore_il2_8 +before_il2_9 +after_ibefore_il2_9 From 0564f9579f1750f7f9112482cc66ea27bef8894e Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 20:51:16 +0300 Subject: [PATCH 05/42] Handle empty REs at runtime Previously empty RE were saved and reused based on their occurrence in the compiled script. Per POSIX if an RE is empty (that is, no pattern is specified) sed shall behave as if the last RE used in the last command applied (either as an address or as part of a substitute command) was specified. Now REs are saved and reused at runtime. --- src/uu/sed/src/command.rs | 21 +-- src/uu/sed/src/compiler.rs | 165 +++++------------- src/uu/sed/src/processor.rs | 25 ++- tests/by-util/test_sed.rs | 3 + tests/fixtures/sed/output/pattern_re_reuse | 12 ++ .../sed/output/pattern_subst_re_reuse | 12 ++ tests/fixtures/sed/output/subst_re_reuse | 14 ++ 7 files changed, 112 insertions(+), 140 deletions(-) create mode 100644 tests/fixtures/sed/output/pattern_re_reuse create mode 100644 tests/fixtures/sed/output/pattern_subst_re_reuse create mode 100644 tests/fixtures/sed/output/subst_re_reuse diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 6526797b..c9a75bc3 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -110,7 +110,7 @@ pub struct Address { #[derive(Debug)] pub enum AddressValue { LineNumber(usize), - Regex(Regex), + Regex(Option), } #[derive(Debug)] @@ -180,32 +180,18 @@ impl ReplacementTemplate { } } -#[derive(Debug)] +#[derive(Debug, Default)] /// Substitution command pub struct Substitution { pub occurrence: usize, // Which occurrence to substitute pub print_flag: bool, // True if 'p' flag pub ignore_case: bool, // True if 'I' flag pub write_file: Option>>, // Writer to file if 'w' flag is used - pub regex: Regex, // Regular expression + pub regex: Option, // Regular expression pub line_number: usize, // Line number pub replacement: ReplacementTemplate, // Specified broken-down replacement } -impl Default for Substitution { - fn default() -> Self { - Substitution { - occurrence: 0, - print_flag: false, - ignore_case: false, - write_file: None, - regex: Regex::new("").unwrap(), // safe dummy regex - line_number: 0, - replacement: ReplacementTemplate::default(), - } - } -} - /// The block of the first and most common Unicode characters: /// ASCII, Latin Extended, Greek, Curillic, Coptic, Arabic, etc. /// It comprises all UCS-2 characters. We use a fast lookup array for these. @@ -337,7 +323,6 @@ pub struct InputAction { #[cfg(test)] mod tests { use super::*; - use regex::Regex; // Return the captures for the RE applied to the specified string fn caps_for<'a>(re: &str, input: &'a str) -> regex::Captures<'a> { diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 337c662a..f746e4fe 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -601,48 +601,40 @@ fn bre_to_ere(pattern: &str) -> String { } /// Compile the provided regular expression string into a corresponding engine. +/// An empty pattern results in None, which means that the last RE employed +/// at runtime will be used. fn compile_regex( lines: &ScriptLineProvider, line: &ScriptCharProvider, pattern: &str, context: &ProcessingContext, icase: bool, -) -> UResult { +) -> UResult> { if pattern.is_empty() { - let maybe_existing = context.saved_regex.borrow(); - if let Some(existing) = &*maybe_existing { - Ok(existing.clone()) - } else { - compilation_error(lines, line, "no previously compiled regex available") - } - } else { - // Convert basic to extended regular expression if needed. - let ere_pattern = if context.regex_extended { - pattern - } else { - &bre_to_ere(pattern) - }; + return Ok(None); + } - // Add case-insensitive modifier if needed. - let full_pattern = if icase { - if ere_pattern.is_empty() { - return compilation_error(lines, line, "cannot specify a modifier on an empty RE"); - } - format!("(?i){}", ere_pattern) - } else { - ere_pattern.to_string() - }; + // Convert basic to extended regular expression if needed. + let ere_pattern = if context.regex_extended { + pattern + } else { + &bre_to_ere(pattern) + }; - // Compile into engine. - let compiled = Regex::new(&full_pattern).map_err(|e| { - compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) - .unwrap_err() - })?; + // Add case-insensitive modifier if needed. + let full_pattern = if icase { + format!("(?i){}", ere_pattern) + } else { + ere_pattern.to_string() + }; - *context.saved_regex.borrow_mut() = Some(compiled.clone()); + // Compile into engine. + let compiled = Regex::new(&full_pattern).map_err(|e| { + compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) + .unwrap_err() + })?; - Ok(compiled) - } + Ok(Some(compiled)) } /// Compile a regular expression replacement string. @@ -765,9 +757,6 @@ fn compile_subst_command( } let pattern = parse_regex(lines, line)?; - if pattern.is_empty() { - return compilation_error(lines, line, "unterminated substitute pattern"); - } let mut subst = Box::new(Substitution { line_number: lines.get_line_number(), @@ -777,27 +766,21 @@ fn compile_subst_command( subst.replacement = compile_replacement(lines, line)?; compile_subst_flags(lines, line, &mut subst)?; - // Compile regex with now known ignore_case flag. - subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; - - let re_captures: u32 = subst - .regex - .captures_len() - .saturating_sub(1) - .try_into() - .unwrap(); - let max_group_number = subst.replacement.max_group_number(); - if max_group_number > re_captures { + if pattern.is_empty() && subst.ignore_case { return compilation_error( lines, line, - format!( - "group number \\{} is larger than the {} available RE groups", - max_group_number, re_captures - ), + "cannot specify modifiers on an empty regular expression", ); } + if pattern.is_empty() { + subst.regex = None; // Will be reuse saved RE at runtime. + } else { + // Compile regex with now known ignore_case flag. + subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; + } + cmd.data = CommandData::Substitution(subst); parse_command_ending(lines, line, cmd) @@ -1342,7 +1325,9 @@ mod tests { #[test] fn test_compile_re_basic() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", &ctx(), false).unwrap(); + let regex = compile_regex(&lines, &chars, "abc", &ctx(), false) + .unwrap() + .expect("regex should be present"); assert!(regex.is_match("abc")); assert!(!regex.is_match("ABC")); } @@ -1350,33 +1335,14 @@ mod tests { #[test] fn test_compile_re_case_insensitive() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", &ctx(), true).unwrap(); + let regex = compile_regex(&lines, &chars, "abc", &ctx(), true) + .unwrap() + .expect("regex should be present"); assert!(regex.is_match("abc")); assert!(regex.is_match("ABC")); assert!(regex.is_match("AbC")); } - #[test] - fn test_compile_re_saved_and_reuse() { - let context = ctx(); - // Save a regex - let (lines1, chars1) = dummy_providers(); - let _ = compile_regex(&lines1, &chars1, "abc", &context, false).unwrap(); - - // Now try to reuse it - let (lines2, chars2) = dummy_providers(); - let reused = compile_regex(&lines2, &chars2, "", &context, false).unwrap(); - - assert!(reused.is_match("abc")); - } - - #[test] - fn test_compile_re_empty_and_not_saved() { - let (lines, chars) = dummy_providers(); - let result = compile_regex(&lines, &chars, "", &ctx(), false); - assert!(result.is_err()); // Should fail because nothing was saved - } - #[test] fn test_compile_re_invalid() { let (lines, chars) = dummy_providers(); @@ -1421,7 +1387,7 @@ mod tests { let (lines, mut chars) = make_providers("/hello/"); let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { + if let AddressValue::Regex(Some(re)) = addr.value { assert!(re.is_match("hello")); } else { panic!("expected Regex address value"); @@ -1433,7 +1399,7 @@ mod tests { let (lines, mut chars) = make_providers("\\#hello#"); let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { + if let AddressValue::Regex(Some(re)) = addr.value { assert!(re.is_match("hello")); } else { panic!("expected Regex address value"); @@ -1445,31 +1411,13 @@ mod tests { let (lines, mut chars) = make_providers("/hello/I"); let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { + if let AddressValue::Regex(Some(re)) = addr.value { assert!(re.is_match("HELLO")); // case-insensitive } else { panic!("expected Regex address value"); } } - #[test] - fn test_compile_addr_empty_regex_saved() { - let context = ctx(); - // First save a regex - let (lines1, mut chars1) = make_providers("/saved/"); - let _ = compile_address(&lines1, &mut chars1, &context).unwrap(); - - // Then reuse it with empty regex - let (lines2, mut chars2) = make_providers("//"); - let addr = compile_address(&lines2, &mut chars2, &context).unwrap(); - assert!(matches!(addr.atype, AddressType::Re)); - if let AddressValue::Regex(re) = addr.value { - assert!(re.is_match("saved")); - } else { - panic!("expected Regex address value"); - } - } - // compile_address_range #[test] fn test_compile_single_line_address() { @@ -1554,7 +1502,7 @@ mod tests { cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("foo")); assert!(!re.is_match("bar")); } else { @@ -1574,7 +1522,7 @@ mod tests { cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("foo")); assert!(!re.is_match("bar")); } else { @@ -1585,7 +1533,7 @@ mod tests { cmd.borrow().addr2.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr2.as_ref().unwrap().value { + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr2.as_ref().unwrap().value { assert!(re.is_match("bar")); assert!(!re.is_match("foo")); } else { @@ -1604,7 +1552,7 @@ mod tests { cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { + if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("FOO")); assert!(re.is_match("foo")); } else { @@ -1630,7 +1578,7 @@ mod tests { cmd2.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = &cmd2.borrow().addr1.as_ref().unwrap().value { + if let AddressValue::Regex(Some(re)) = &cmd2.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("abc")); }; } @@ -1949,15 +1897,6 @@ mod tests { ); } - #[test] - fn test_compile_subst_empty_pattern() { - let (mut lines, mut chars) = make_providers("s//bar/"); - let mut cmd = Command::default(); - - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); - assert!(err.to_string().contains("unterminated substitute pattern")); - } - #[test] fn test_compile_subst_extra_characters_at_end() { let (mut lines, mut chars) = make_providers("s/foo/bar/x"); @@ -1998,18 +1937,6 @@ mod tests { } } - #[test] - fn test_compile_subst_invalid_group_number() { - let (mut lines, mut chars) = make_providers(r"s/\(.\)\(.\)/\3\2\1/"); - let mut cmd = Command::default(); - - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); - assert!( - err.to_string() - .contains("group number \\3 is larger than the 2 available RE groups") - ); - } - // bre_to_ere #[test] fn test_bre_group_translation() { diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 47c40ea5..cdb2743e 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -15,6 +15,7 @@ use crate::command::{ use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; use crate::named_writer; +use regex::Regex; use std::cell::RefCell; use std::io::{self, IsTerminal}; use std::path::PathBuf; @@ -25,12 +26,13 @@ use uucore::error::{UResult, USimpleError}; fn match_address( addr: &Address, pattern: &mut IOChunk, - context: &ProcessingContext, + context: &mut ProcessingContext, ) -> UResult { match addr.atype { AddressType::Re => { if let AddressValue::Regex(ref re) = addr.value { - Ok(re.is_match(pattern.as_str()?)) + let regex = re_or_saved_re(re, context)?; + Ok(regex.is_match(pattern.as_str()?)) } else { Ok(false) } @@ -161,6 +163,21 @@ fn write_chunk( Ok(()) } +/// Return the RE or the saved RE if the RE is None. +/// Update the saved RE to RE. +fn re_or_saved_re(regex: &Option, context: &mut ProcessingContext) -> UResult { + match regex { + Some(re) => { + *context.saved_regex.borrow_mut() = Some(re.clone()); + Ok(re.clone()) + } + None => match &*context.saved_regex.borrow() { + Some(saved_re) => Ok(saved_re.clone()), + None => Err(USimpleError::new(2, "no previous regular expression")), + }, + } +} + /// Perform the specified RE replacement in the provided pattern space. fn substitute( pattern: &mut IOChunk, @@ -175,7 +192,9 @@ fn substitute( let text = pattern.as_str()?; - for caps in sub.regex.captures_iter(text) { + let regex = re_or_saved_re(&sub.regex, context)?; + + for caps in regex.captures_iter(text) { count += 1; let m = caps.get(0).unwrap(); diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 63ca3df3..d5f9c148 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -226,6 +226,7 @@ check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); check_output!(subst_case_insensitive, ["-e", r"s/L/Line/", LINES1]); check_output!(subst_no_new_line, ["-e", r"s/l/L/g", NO_NEW_LINE]); +check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]); #[test] fn subst_write_file() -> std::io::Result<()> { @@ -278,6 +279,8 @@ check_output!(pattern_next_print_output, ["-e", r"4n;p", LINES1]); check_output!(pattern_next_print_no_output, ["-n", "-e", r"4n;p", LINES1]); check_output!(pattern_quit, [r"5q", LINES1]); check_output!(pattern_quit_2, [r"5q", LINES1, LINES2]); +check_output!(pattern_re_reuse, ["-n", r"/_1/p;//p", LINES1]); +check_output!(pattern_subst_re_reuse, ["-n", r"/_1/p;s//-N/p", LINES1]); check_output!( block_simple_range, diff --git a/tests/fixtures/sed/output/pattern_re_reuse b/tests/fixtures/sed/output/pattern_re_reuse new file mode 100644 index 00000000..3eef2361 --- /dev/null +++ b/tests/fixtures/sed/output/pattern_re_reuse @@ -0,0 +1,12 @@ +l1_1 +l1_1 +l1_10 +l1_10 +l1_11 +l1_11 +l1_12 +l1_12 +l1_13 +l1_13 +l1_14 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_subst_re_reuse b/tests/fixtures/sed/output/pattern_subst_re_reuse new file mode 100644 index 00000000..c8a099eb --- /dev/null +++ b/tests/fixtures/sed/output/pattern_subst_re_reuse @@ -0,0 +1,12 @@ +l1_1 +l1-N +l1_10 +l1-N0 +l1_11 +l1-N1 +l1_12 +l1-N2 +l1_13 +l1-N3 +l1_14 +l1-N4 diff --git a/tests/fixtures/sed/output/subst_re_reuse b/tests/fixtures/sed/output/subst_re_reuse new file mode 100644 index 00000000..20a49d1c --- /dev/null +++ b/tests/fixtures/sed/output/subst_re_reuse @@ -0,0 +1,14 @@ +L1_1 +M1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 From aa66386506db438ed64fd781287650170afd800d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 21:03:23 +0300 Subject: [PATCH 06/42] Fix compiled data structure patching order The previous order resulted in a duplicate label error. --- src/uu/sed/src/compiler.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index f746e4fe..0339cb34 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -207,16 +207,18 @@ pub fn compile( let mut empty_line = ScriptCharProvider::new(""); let result = compile_sequence(&mut make_providers, &mut empty_line, context)?; + // Link branch commands to the target label commands. + populate_label_map(result.clone(), context)?; + resolve_branch_targets(result.clone(), context)?; + // Link the ends of command blocks to their following commands. + // This converts the tree into a graph, so it must be the last + // conversion than traverses the structure as a tree. if context.parsed_block_nesting > 0 { return Err(USimpleError::new(1, "unmatched `{'")); } patch_block_endings(result.clone()); - // Link branch commands to the target label commands. - populate_label_map(result.clone(), context)?; - resolve_branch_targets(result.clone(), context)?; - // Comment-out the following to show the compiled script. #[cfg(any())] dbg!(&result); From 416c4a7285ec7c93aebdfa75239e918e9981e8e3 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 22:03:40 +0300 Subject: [PATCH 07/42] Escape dollar and caret in BREs --- src/uu/sed/src/compiler.rs | 43 +++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 0339cb34..3b7c3e2b 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -568,35 +568,58 @@ fn bre_to_ere(pattern: &str) -> String { let mut result = String::with_capacity(pattern.len()); let mut chars = pattern.chars().peekable(); + let mut at_beginning = true; + let mut previous: Option = None; while let Some(c) = chars.next() { if c == '\\' { match chars.peek() { Some('(') => { chars.next(); - result.push('('); // group start + result.push('('); // Group start } Some(')') => { chars.next(); - result.push(')'); // group end + result.push(')'); // Group end } Some(&next) => { + // Preserve other escaped characters. chars.next(); result.push('\\'); - result.push(next); // preserve other escaped characters + result.push(next); } None => { - result.push('\\'); // trailing backslash, keep it + // Trailing backslash; keep it. + result.push('\\'); } } } else { match c { '+' | '?' | '{' | '}' | '|' | '(' | ')' => { - result.push('\\'); // escape unsupported ERE metacharacters + // Escape unsupported ERE metacharacters. + result.push('\\'); + result.push(c); + } + '^' if !at_beginning && previous != Some('[') => { + // In BREs ^ has special meaning at the beginning + // and as bracket negation. This heuristic escapes + // all other uses, which per POSIX are valid in EREs. + // "the ERE "a^b" is valid, but can never match because + // the 'a' prevents the expression "^b" from matching + // starting at the first character." + // POSIX 9.4.9 ERE Expression Anchoring + result.push('\\'); + result.push(c); + } + '$' if chars.peek().is_some() => { + // Similarly for $ appearing not at the end. + result.push('\\'); result.push(c); } _ => result.push(c), } } + at_beginning = false; + previous = Some(c); } result @@ -1973,6 +1996,16 @@ mod tests { assert_eq!(bre_to_ere(r"abc\"), r"abc\"); } + #[test] + fn test_caret_escaped_in_middle() { + assert_eq!(bre_to_ere(r"^a^[^x]c"), r"^a\^[^x]c"); + } + + #[test] + fn test_dollar_escaped_in_middle() { + assert_eq!(bre_to_ere(r"a$c$"), r"a\$c$"); + } + // patch_block_endings // Create a command with the specified code. From f844dddfe41f132e44620862984a80d2fd5d2820 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 23:20:24 +0300 Subject: [PATCH 08/42] Fix handling of escaped delimiter --- src/uu/sed/src/compiler.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 3b7c3e2b..9e4b2065 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -706,12 +706,18 @@ pub fn compile_replacement( line.advance(); } - // literal \ and & + // Literal \ and & '\\' | '&' => { literal.push(line.current()); line.advance(); } + // Literal delimiter + v @ _ if v == delimiter => { + literal.push(line.current()); + line.advance(); + } + // other escape sequences _ => match parse_char_escape(line) { Some(decoded) => literal.push(decoded), @@ -1745,6 +1751,15 @@ mod tests { assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "hello")); } + #[test] + fn test_compile_replacement_escaped_delimiter() { + let (mut lines, mut chars) = make_providers(r"/hell\/o/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 1); + assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "hell/o")); + } + #[test] fn test_compile_replacement_backrefs_and_literal() { let (mut lines, mut chars) = make_providers("/prefix \\1 and \\2/"); From 136017431f78d95b1896d1e1df5114abf52a1e18 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 18 May 2025 23:27:13 +0300 Subject: [PATCH 09/42] Add math torture test --- tests/by-util/test_sed.rs | 2 ++ tests/fixtures/sed/input/expression1 | 1 + tests/fixtures/sed/output/math1 | 1 + 3 files changed, 4 insertions(+) create mode 100644 tests/fixtures/sed/input/expression1 create mode 100644 tests/fixtures/sed/output/math1 diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index d5f9c148..233d01f6 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -601,3 +601,5 @@ p LINES1 ] ); + +check_output!(math1, ["-f", "script/math.sed", "input/expression1"]); diff --git a/tests/fixtures/sed/input/expression1 b/tests/fixtures/sed/input/expression1 new file mode 100644 index 00000000..4378eeb8 --- /dev/null +++ b/tests/fixtures/sed/input/expression1 @@ -0,0 +1 @@ +(2^10 + 5! - (12 % 5)) * 3 diff --git a/tests/fixtures/sed/output/math1 b/tests/fixtures/sed/output/math1 new file mode 100644 index 00000000..767be6bb --- /dev/null +++ b/tests/fixtures/sed/output/math1 @@ -0,0 +1 @@ +3426 From fddd6c18a2d76d509851a102daf35f41e1ae5bdd Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 10:22:46 +0300 Subject: [PATCH 10/42] Support regular expression back-references regex::Regex EREs don't support back-references, so switch to fancy_regex. While at it remove test_compile_re_reuse_saved test case, which is no longer valid, as REs are reused at runtime. --- Cargo.lock | 28 +++++++++++++ Cargo.toml | 2 + src/uu/sed/Cargo.toml | 2 + src/uu/sed/src/command.rs | 13 +++--- src/uu/sed/src/compiler.rs | 83 +++++++++++++++++++------------------ src/uu/sed/src/processor.rs | 17 +++++++- 6 files changed, 97 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0d3db1c4..e87e1f3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,6 +82,21 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.9.1" @@ -275,6 +290,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -675,6 +701,7 @@ dependencies = [ "clap_complete", "clap_mangen", "ctor", + "fancy-regex", "libc", "memmap2", "phf", @@ -868,6 +895,7 @@ name = "uu_sed" version = "0.0.1" dependencies = [ "clap", + "fancy-regex", "memmap2", "regex", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index f756bc7a..871f72ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ chrono = { version = "0.4.37", default-features = false, features = [ clap = { version = "4.4", features = ["wrap_help", "cargo"] } clap_complete = "4.5" clap_mangen = "0.2" +fancy-regex = "0.14.0" libc = "0.2.153" memmap2 = "0.9" phf = "0.11.2" @@ -54,6 +55,7 @@ clap = { workspace = true } clap_complete = { workspace = true } clap_mangen = { workspace = true } ctor = "0.4.1" +fancy-regex = { workspace = true } memmap2.workspace = true phf = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 4faf2d9b..219ceb37 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -14,6 +14,8 @@ categories = ["command-line-utilities"] [dependencies] clap = { workspace = true } +fancy-regex = { workspace = true } +once_cell = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index c9a75bc3..e26e793c 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -13,8 +13,7 @@ use crate::named_writer::NamedWriter; -use regex::Captures; -use regex::Regex; +use fancy_regex::{Captures, Regex}; use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -137,7 +136,7 @@ impl Default for ReplacementTemplate { impl ReplacementTemplate { /// Apply the template to the given RE captures. /// Example: - /// let result = regex.replace_all(input, |caps: ®ex::Captures| { + /// let result = regex.replace_all(input, |caps: &Captures| { /// template.apply(caps) }); /// Returns an error if a backreference in the template was not matched by the RE. pub fn apply(&self, caps: &Captures) -> UResult { @@ -325,8 +324,12 @@ mod tests { use super::*; // Return the captures for the RE applied to the specified string - fn caps_for<'a>(re: &str, input: &'a str) -> regex::Captures<'a> { - Regex::new(re).unwrap().captures(input).unwrap() + fn caps_for<'a>(re: &str, input: &'a str) -> Captures<'a> { + Regex::new(re) + .unwrap() + .captures(input) + .unwrap() + .expect("captures") } #[test] diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 9e4b2065..37254cee 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -18,7 +18,7 @@ use crate::delimited_parser::{ use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; -use regex::Regex; +use fancy_regex::Regex; use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -713,7 +713,7 @@ pub fn compile_replacement( } // Literal delimiter - v @ _ if v == delimiter => { + v if v == delimiter => { literal.push(line.current()); line.advance(); } @@ -1359,8 +1359,8 @@ mod tests { let regex = compile_regex(&lines, &chars, "abc", &ctx(), false) .unwrap() .expect("regex should be present"); - assert!(regex.is_match("abc")); - assert!(!regex.is_match("ABC")); + assert!(regex.is_match("abc").unwrap()); + assert!(!regex.is_match("ABC").unwrap()); } #[test] @@ -1369,9 +1369,9 @@ mod tests { let regex = compile_regex(&lines, &chars, "abc", &ctx(), true) .unwrap() .expect("regex should be present"); - assert!(regex.is_match("abc")); - assert!(regex.is_match("ABC")); - assert!(regex.is_match("AbC")); + assert!(regex.is_match("abc").unwrap()); + assert!(regex.is_match("ABC").unwrap()); + assert!(regex.is_match("AbC").unwrap()); } #[test] @@ -1419,7 +1419,31 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("hello")); + assert!(re.is_match("hello").unwrap()); + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_regex_backref_match() { + let (lines, mut chars) = make_providers(r"/he\(.\)\1o/"); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(re.is_match("hello").unwrap()); + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_regex_backref_no_match() { + let (lines, mut chars) = make_providers(r"/he\(.\)\1o/"); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(Some(re)) = addr.value { + assert!(!re.is_match("helio").unwrap()); } else { panic!("expected Regex address value"); } @@ -1431,7 +1455,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("hello")); + assert!(re.is_match("hello").unwrap()); } else { panic!("expected Regex address value"); } @@ -1443,7 +1467,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("HELLO")); // case-insensitive + assert!(re.is_match("HELLO").unwrap()); // case-insensitive } else { panic!("expected Regex address value"); } @@ -1534,8 +1558,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("foo")); - assert!(!re.is_match("bar")); + assert!(re.is_match("foo").unwrap()); + assert!(!re.is_match("bar").unwrap()); } else { panic!("expected a regex address"); }; @@ -1554,8 +1578,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("foo")); - assert!(!re.is_match("bar")); + assert!(re.is_match("foo").unwrap()); + assert!(!re.is_match("bar").unwrap()); } else { panic!("expected a regex address"); } @@ -1565,8 +1589,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr2.as_ref().unwrap().value { - assert!(re.is_match("bar")); - assert!(!re.is_match("foo")); + assert!(re.is_match("bar").unwrap()); + assert!(!re.is_match("foo").unwrap()); } else { panic!("expected a regex address"); }; @@ -1584,36 +1608,13 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("FOO")); - assert!(re.is_match("foo")); + assert!(re.is_match("FOO").unwrap()); + assert!(re.is_match("foo").unwrap()); } else { panic!("expected a regex address with case-insensitive match"); }; } - #[test] - fn test_compile_re_reuse_saved() { - let context = ctx(); - // First save a regex - let (lines1, mut chars1) = make_providers("/abc/"); - let mut cmd1 = Rc::new(RefCell::new(Command::default())); - compile_address_range(&lines1, &mut chars1, &mut cmd1, &context).unwrap(); - - // Now reuse it - let (lines2, mut chars2) = make_providers("//"); - let mut cmd2 = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2, &context).unwrap(); - - assert_eq!(n_addr, 1); - assert!(matches!( - cmd2.borrow().addr1.as_ref().unwrap().atype, - AddressType::Re - )); - if let AddressValue::Regex(Some(re)) = &cmd2.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("abc")); - }; - } - // compile_sequence fn empty_line() -> ScriptCharProvider { ScriptCharProvider::new("") diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index cdb2743e..6ef5a0c9 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -15,7 +15,7 @@ use crate::command::{ use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; use crate::named_writer; -use regex::Regex; +use fancy_regex::Regex; use std::cell::RefCell; use std::io::{self, IsTerminal}; use std::path::PathBuf; @@ -32,7 +32,10 @@ fn match_address( AddressType::Re => { if let AddressValue::Regex(ref re) = addr.value { let regex = re_or_saved_re(re, context)?; - Ok(regex.is_match(pattern.as_str()?)) + let matched = regex.is_match(pattern.as_str()?).map_err(|e| { + USimpleError::new(2, format!("regular expression match error: {}", e)) + })?; + Ok(matched) } else { Ok(false) } @@ -196,6 +199,16 @@ fn substitute( for caps in regex.captures_iter(text) { count += 1; + let caps = match caps { + Ok(c) => c, + Err(e) => { + return Err(USimpleError::new( + 2, + format!("regular expression capture retrieval error: {}", e), + )); + } + }; + let m = caps.get(0).unwrap(); // Always write the unmatched text before this match. From a0da132aeba233ad5e6da67cf60c6efd60207ccd Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 18:32:43 +0300 Subject: [PATCH 11/42] Fix ERE conversion of BRE back-references --- src/uu/sed/src/compiler.rs | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 37254cee..2bf21ba3 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -561,9 +561,10 @@ fn parse_command_ending( } /// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. -/// - Replacces `\(` and `\)` with `(` and `)` -/// - Escapes ERE-only metacharacters: `+ ? { } | ( )` -/// - Leaves all other characters as-is +/// - Replaces `\(` and `\)` with `(` and `)`. +/// - Puts single-digit back-references in non-capturing groups.. +/// - Escapes ERE-only metacharacters: `+ ? { } | ( )`. +/// - Leaves all other characters as-is. fn bre_to_ere(pattern: &str) -> String { let mut result = String::with_capacity(pattern.len()); let mut chars = pattern.chars().peekable(); @@ -581,6 +582,16 @@ fn bre_to_ere(pattern: &str) -> String { chars.next(); result.push(')'); // Group end } + Some(v) if v.is_ascii_digit() => { + // Back-reference. In sed BREs these are single-digit + // (\1-\9) whereas fancy_regex supports multi-digit + // back-references. Put them in a non-capturing group + // to avoid having the number extend beyond the single + // digit. Example: In sed \11 matches group 1 followed + // by '1', not group 11. + result.push_str(&format!(r"(?:\{})", v)); + chars.next(); + } Some(&next) => { // Preserve other escaped characters. chars.next(); @@ -2022,6 +2033,11 @@ mod tests { assert_eq!(bre_to_ere(r"a$c$"), r"a\$c$"); } + #[test] + fn test_bre_back_reference() { + assert_eq!(bre_to_ere(r"\(.\)\1\(.\)\2"), r"(.)(?:\1)(.)(?:\2)"); + } + // patch_block_endings // Create a command with the specified code. From 84ca66ca8365dac0a6251049ed5c202b6aca4f87 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 18:35:02 +0300 Subject: [PATCH 12/42] Add Towers of Hanoi test case --- tests/by-util/test_sed.rs | 1 + tests/fixtures/sed/input/hanoi | 1 + tests/fixtures/sed/output/hanoi | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 tests/fixtures/sed/input/hanoi create mode 100644 tests/fixtures/sed/output/hanoi diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 233d01f6..c2bdd814 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -603,3 +603,4 @@ p ); check_output!(math1, ["-f", "script/math.sed", "input/expression1"]); +check_output!(hanoi, ["-f", "script/hanoi.sed", "input/hanoi"]); diff --git a/tests/fixtures/sed/input/hanoi b/tests/fixtures/sed/input/hanoi new file mode 100644 index 00000000..1f7fa813 --- /dev/null +++ b/tests/fixtures/sed/input/hanoi @@ -0,0 +1 @@ +:abcd: : : diff --git a/tests/fixtures/sed/output/hanoi b/tests/fixtures/sed/output/hanoi new file mode 100644 index 00000000..ec39ea26 --- /dev/null +++ b/tests/fixtures/sed/output/hanoi @@ -0,0 +1,17 @@ +:abcd: : : +:abc : :d : +:ab :c :d : +:ab :cd : : +:a :cd :b : +:ad :c :b : +:ad : :bc : +:a : :bcd : +: :a :bcd : +: :ad :bc : +:c :ad :b : +:cd :a :b : +:cd :ab : : +:c :ab :d : +: :abc :d : +: :abcd: : +Done! Try another, or end with ^D. From 1a6b078c8aff4b486a1bafc5e2e992ffc1b6356d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 18:49:31 +0300 Subject: [PATCH 13/42] Add Pi calculation test case --- tests/by-util/test_sed.rs | 3 +++ tests/fixtures/sed/input/pi | 1 + tests/fixtures/sed/output/pi | 1 + 3 files changed, 5 insertions(+) create mode 100644 tests/fixtures/sed/input/pi create mode 100644 tests/fixtures/sed/output/pi diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index c2bdd814..34e30141 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -603,4 +603,7 @@ p ); check_output!(math1, ["-f", "script/math.sed", "input/expression1"]); +// Calculate π (scaled) to several decimal places +check_output!(pi, ["-f", "script/math.sed", "input/pi"]); + check_output!(hanoi, ["-f", "script/hanoi.sed", "input/hanoi"]); diff --git a/tests/fixtures/sed/input/pi b/tests/fixtures/sed/input/pi new file mode 100644 index 00000000..445c740a --- /dev/null +++ b/tests/fixtures/sed/input/pi @@ -0,0 +1 @@ +(104348 * 1000000000000000) / 33215 diff --git a/tests/fixtures/sed/output/pi b/tests/fixtures/sed/output/pi new file mode 100644 index 00000000..06817077 --- /dev/null +++ b/tests/fixtures/sed/output/pi @@ -0,0 +1 @@ +3141592653921421 From bb9bfb66e4ac00e560f7b485a01024f677e4866b Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 19:23:59 +0300 Subject: [PATCH 14/42] Tidy up and document integration tests --- tests/by-util/test_sed.rs | 51 +++++++++++++------ ...nt_to_newline => pattern_print_to_newline} | 0 2 files changed, 36 insertions(+), 15 deletions(-) rename tests/fixtures/sed/output/{print_to_newline => pattern_print_to_newline} (100%) diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 34e30141..1e47d5be 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -9,12 +9,13 @@ // file that was distributed with this source code. use std::fs; -use std::io::{Read, Write}; +use std::io::Read; use tempfile::NamedTempFile; use uutests::new_ucmd; use uutests::util::TestScenario; use uutests::util_name; +//////////////////////////////////////////////////////////// // Test application's invocation #[test] fn test_invalid_arg() { @@ -56,13 +57,11 @@ fn test_e_script_ok() { #[test] fn test_f_script_ok() { - let mut temp = NamedTempFile::new().expect("Failed to create temp file"); - writeln!(temp, "l").expect("Failed to write to temp file"); - let path = temp.path(); - - new_ucmd!().arg("-f").arg(path).succeeds(); + new_ucmd!().arg("-f").arg("script/hanoi.sed").succeeds(); } +//////////////////////////////////////////////////////////// +// Test simple I/O processing const INPUT_FILES: &[&str] = &[ "input/two-lines.txt", "input/no-new-line.txt", @@ -126,24 +125,29 @@ macro_rules! check_output { }; } +//////////////////////////////////////////////////////////// +// Individual command tests + // Input files const LINES1: &str = "input/lines1"; const LINES2: &str = "input/lines2"; const NO_NEW_LINE: &str = "input/no-new-line.txt"; -// Test address ranges +//////////////////////////////////////////////////////////// +// Address ranges: One and two, numeric and pattern check_output!(addr_one_line, ["-n", "-e", "4p", LINES1]); check_output!(addr_straddle, ["-n", "-e", "20p", LINES1, LINES2]); check_output!(addr_last_one_file, ["-n", "-e", "$p", LINES1]); check_output!(addr_last_two_files, ["-n", "-e", "$p", LINES1, LINES2]); -// TODO: Enable and configure for Unix/Windows, when "a" is implemented. -#[cfg(any())] -check_output!(addr_append_with_empty, ["-e", "$a\nhello", "/dev/null"]); +#[cfg(unix)] +check_output!(addr_append_empty, ["-e", "$a\\\nhello", "/dev/null"]); +#[cfg(windows)] +check_output!(addr_append_empty, ["-e", "$a\\\nhello", "NUL"]); #[cfg(unix)] check_output!( - addr_last_with_empty, + addr_last_empty, ["-n", "-e", "$p", LINES1, "/dev/null", LINES2] ); @@ -196,7 +200,8 @@ check_output!( check_output!(addr_empty_re_reuse, ["-n", "/_2/,//p", LINES1, LINES2]); check_output!(addr_simple_negation, ["-e", r"4,12!s/^/^/", LINES1]); -// Test substitutions +//////////////////////////////////////////////////////////// +// Substitution: s check_output!(subst_any, ["-e", r"s/./X/g", LINES1]); check_output!(subst_any_global, ["-e", r"s,.,X,g", LINES1]); check_output!(subst_escaped_magic_separator, ["-e", r"s.\..X.g", LINES1]); @@ -227,6 +232,8 @@ check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); check_output!(subst_case_insensitive, ["-e", r"s/L/Line/", LINES1]); check_output!(subst_no_new_line, ["-e", r"s/l/L/g", NO_NEW_LINE]); check_output!(subst_re_reuse, ["-e", r"2s//M/;1s/l/L/", LINES1]); +check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); +check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); #[test] fn subst_write_file() -> std::io::Result<()> { @@ -245,6 +252,8 @@ fn subst_write_file() -> std::io::Result<()> { Ok(()) } +//////////////////////////////////////////////////////////// +// Transliteration: y check_output!(trans_simple, ["-e", r"y/0123456789/9876543210/", LINES1]); check_output!( trans_delimiter, @@ -252,10 +261,10 @@ check_output!( ); check_output!(trans_no_new_line, ["-e", r"y/l/L/", NO_NEW_LINE]); check_output!(trans_newline, ["-e", r"1N;2y/\n/X/", LINES1]); -check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); -check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); -check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); +//////////////////////////////////////////////////////////// +// Pattern space manipulation: D, d, H, h, N, n, P, p, q, x +check_output!(pattern_print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); check_output!(pattern_delete_to_newline, ["-n", r"2N;3p;3D;3p", LINES1]); check_output!(pattern_delete_no_newline, ["-e", r"2D", LINES1]); @@ -282,6 +291,8 @@ check_output!(pattern_quit_2, [r"5q", LINES1, LINES2]); check_output!(pattern_re_reuse, ["-n", r"/_1/p;//p", LINES1]); check_output!(pattern_subst_re_reuse, ["-n", r"/_1/p;s//-N/p", LINES1]); +//////////////////////////////////////////////////////////// +// Command blocks: {} check_output!( block_simple_range, [ @@ -382,6 +393,8 @@ b label3 ] ); +//////////////////////////////////////////////////////////// +// Branching: :, b, t check_output!( branch_conditional_simple, [ @@ -469,6 +482,8 @@ tb"#, ] ); +//////////////////////////////////////////////////////////// +// Text: a, c, i check_output!( text_insert_quit, [ @@ -602,8 +617,14 @@ p ] ); +//////////////////////////////////////////////////////////// +// Large complex scripts + +// Math expression evaluation check_output!(math1, ["-f", "script/math.sed", "input/expression1"]); + // Calculate π (scaled) to several decimal places check_output!(pi, ["-f", "script/math.sed", "input/pi"]); +// Solve the Towers of Hanoi puzzle check_output!(hanoi, ["-f", "script/hanoi.sed", "input/hanoi"]); diff --git a/tests/fixtures/sed/output/print_to_newline b/tests/fixtures/sed/output/pattern_print_to_newline similarity index 100% rename from tests/fixtures/sed/output/print_to_newline rename to tests/fixtures/sed/output/pattern_print_to_newline From d74e3a76dcd7be7c5a5ceaffcc2a55dd4d8d44c4 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 20:22:19 +0300 Subject: [PATCH 15/42] Fix CI test failures --- src/uu/sed/Cargo.toml | 1 - tests/by-util/test_sed.rs | 14 ++------------ tests/fixtures/sed/input/empty | 0 .../{addr_last_with_empty => addr_last_empty} | 0 4 files changed, 2 insertions(+), 13 deletions(-) create mode 100644 tests/fixtures/sed/input/empty rename tests/fixtures/sed/output/{addr_last_with_empty => addr_last_empty} (100%) diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 219ceb37..6d94616e 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -15,7 +15,6 @@ categories = ["command-line-utilities"] [dependencies] clap = { workspace = true } fancy-regex = { workspace = true } -once_cell = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 1e47d5be..91b4bad6 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -140,21 +140,11 @@ check_output!(addr_straddle, ["-n", "-e", "20p", LINES1, LINES2]); check_output!(addr_last_one_file, ["-n", "-e", "$p", LINES1]); check_output!(addr_last_two_files, ["-n", "-e", "$p", LINES1, LINES2]); -#[cfg(unix)] -check_output!(addr_append_empty, ["-e", "$a\\\nhello", "/dev/null"]); -#[cfg(windows)] -check_output!(addr_append_empty, ["-e", "$a\\\nhello", "NUL"]); +check_output!(addr_append_empty, ["-e", "$a\\\nhello", "input/empty"]); -#[cfg(unix)] check_output!( addr_last_empty, - ["-n", "-e", "$p", LINES1, "/dev/null", LINES2] -); - -#[cfg(windows)] -check_output!( - addr_last_with_empty, - ["-n", "-e", "$p", LINES1, "NUL", LINES2] + ["-n", "-e", "$p", LINES1, "input/empty", LINES2] ); check_output!(addr_past_last, ["-n", "-e", "20p", LINES1]); diff --git a/tests/fixtures/sed/input/empty b/tests/fixtures/sed/input/empty new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/addr_last_with_empty b/tests/fixtures/sed/output/addr_last_empty similarity index 100% rename from tests/fixtures/sed/output/addr_last_with_empty rename to tests/fixtures/sed/output/addr_last_empty From 7ffb4a450aecb281af92ab552ca09ad008f84355 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 23:42:25 +0300 Subject: [PATCH 16/42] Optimize away needless RE clone Suggested by: sylvestre --- src/uu/sed/src/command.rs | 2 +- src/uu/sed/src/processor.rs | 26 +++++++++++++++----------- src/uu/sed/src/sed.rs | 3 +-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index e26e793c..24a80388 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -52,7 +52,7 @@ pub struct ProcessingContext { /// Stop processing further input. pub stop_processing: bool, /// Previously compiled RE, saved for reuse when specifying an empty RE - pub saved_regex: RefCell>, + pub saved_regex: Option, /// Modification of input processing action // This is required to avoid doubly borrowing the reader in the 'N' // command. diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 6ef5a0c9..4af6bb03 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -166,18 +166,22 @@ fn write_chunk( Ok(()) } -/// Return the RE or the saved RE if the RE is None. +/// Return a reference to the current or the saved RE if the RE is None. /// Update the saved RE to RE. -fn re_or_saved_re(regex: &Option, context: &mut ProcessingContext) -> UResult { - match regex { - Some(re) => { - *context.saved_regex.borrow_mut() = Some(re.clone()); - Ok(re.clone()) - } - None => match &*context.saved_regex.borrow() { - Some(saved_re) => Ok(saved_re.clone()), - None => Err(USimpleError::new(2, "no previous regular expression")), - }, +fn re_or_saved_re<'a>( + regex: &Option, + context: &'a mut ProcessingContext, +) -> UResult<&'a Regex> { + if let Some(re) = regex { + // First time we see this regex: clone it *once* into the context. + context.saved_regex = Some(re.clone()); + // Return a reference into context.saved_regex. + Ok(context.saved_regex.as_ref().unwrap()) + } else if let Some(ref saved_re) = context.saved_regex { + // We already have one: just borrow it. + Ok(saved_re) + } else { + Err(USimpleError::new(2, "no previous regular expression")) } } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index f81daf31..13f3a1b3 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -22,7 +22,6 @@ use crate::command::{ProcessingContext, ScriptValue, StringSpace}; use crate::compiler::compile; use crate::processor::process_all_files; use clap::{Arg, ArgMatches, Command, arg}; -use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; use uucore::error::{UResult, UUsageError}; @@ -202,7 +201,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { last_line: false, last_file: false, stop_processing: false, - saved_regex: const { RefCell::new(None) }, + saved_regex: None, input_action: None, hold: StringSpace::default(), parsed_block_nesting: 0, From fd90cc0ae4a8457eb2dbe9d93195b181fc03fcb2 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 19 May 2025 23:45:14 +0300 Subject: [PATCH 17/42] Shorten error mapping code Suggested by: sylvestre --- src/uu/sed/src/processor.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 4af6bb03..7ac819b8 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -203,15 +203,12 @@ fn substitute( for caps in regex.captures_iter(text) { count += 1; - let caps = match caps { - Ok(c) => c, - Err(e) => { - return Err(USimpleError::new( - 2, - format!("regular expression capture retrieval error: {}", e), - )); - } - }; + let caps = caps.map_err(|e| { + USimpleError::new( + 2, + format!("regular expression capture retrieval error: {}", e), + ) + })?; let m = caps.get(0).unwrap(); From f0cceb76221e8e0084c266455de7140dc1230e27 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 21 May 2025 10:37:47 +0300 Subject: [PATCH 18/42] Utilize shadowing to avoid convoluted names --- src/uu/sed/src/compiler.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 2bf21ba3..cdfc1354 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -651,21 +651,21 @@ fn compile_regex( } // Convert basic to extended regular expression if needed. - let ere_pattern = if context.regex_extended { + let pattern = if context.regex_extended { pattern } else { &bre_to_ere(pattern) }; // Add case-insensitive modifier if needed. - let full_pattern = if icase { - format!("(?i){}", ere_pattern) + let pattern = if icase { + format!("(?i){}", pattern) } else { - ere_pattern.to_string() + pattern.to_string() }; // Compile into engine. - let compiled = Regex::new(&full_pattern).map_err(|e| { + let compiled = Regex::new(&pattern).map_err(|e| { compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) .unwrap_err() })?; From a23361968620facbc83248de82d69c7b5230454f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 23 May 2025 18:30:14 +0300 Subject: [PATCH 19/42] Update status --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 365064d5..c419bab6 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,12 @@ and other extensions. ## Status -It is currently work in progress! +This is still work in progress, but the current version can +already execute complex scripts, +such as solving the Towers of Hanoi puzzle and performing +arbitrary precision integer arithmetic. +The commands `=`, `l`, `r`, and `w`, as well as in-place replacement +are still missing. ## Installation From f7365fa4226a99e6c5bae51c378ffe633723c17b Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 12:46:32 +0300 Subject: [PATCH 20/42] Add benchmark and comparison scripts --- tests/fixtures/sed/script/http-log-redact.sed | 4 + util/bench-compare.sh | 26 ++++ util/benchmark.sh | 137 ++++++++++++++++++ 3 files changed, 167 insertions(+) create mode 100644 tests/fixtures/sed/script/http-log-redact.sed create mode 100755 util/bench-compare.sh create mode 100755 util/benchmark.sh diff --git a/tests/fixtures/sed/script/http-log-redact.sed b/tests/fixtures/sed/script/http-log-redact.sed new file mode 100644 index 00000000..600ae9f4 --- /dev/null +++ b/tests/fixtures/sed/script/http-log-redact.sed @@ -0,0 +1,4 @@ +#!/ +s/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/[REDACTED_IP]/; +s/\[[0-9]{2}\/[A-Za-z]{3}\/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2} [+-][0-9]{4}\]/[TIMESTAMP]/; +s/"Mozilla[^"]*"/"[UA]"/ diff --git a/util/bench-compare.sh b/util/bench-compare.sh new file mode 100755 index 00000000..da0fe27a --- /dev/null +++ b/util/bench-compare.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# +# Compare the results of two benchmarks reporting fastest for each task +# + + +set -eu + +if [ $# -ne 2 ] ; then + echo "Usage: $(basename $0) results-1 results-2" 1>&2 + + exit 1 +fi + +paste -d , "$1" "$2" | + sed "s/^/$1,$2,/" | + + awk -F, 'FNR != 1 { + printf("%22s ", $3); + if ($4 / $12 > 1.1) + printf("%12s is %5.1f times faster than %s\n", $2, $4 / $12, $1); + else if ($12 / $4 > 1.1) + printf("%12s is %5.1f times faster than %s\n", $1, $12 / $4, $2); + else + printf("%12s is similarly fast as %s\n", $1, $2); + }' diff --git a/util/benchmark.sh b/util/benchmark.sh new file mode 100755 index 00000000..ddade413 --- /dev/null +++ b/util/benchmark.sh @@ -0,0 +1,137 @@ +#!/bin/sh + +set -eu + +if [ $# -ne 2 ] ; then + cat <&2 +Usage: $(basename $0) sed-command output-file" + +Examples: +$(basename $0) 'target/release/sedapp sed' rust-sed +$(basename $0) /usr/bin/sed gnu-sed +EOF + exit 1 +fi + +PROG="$1" +OUT="$2" + +SCRIPTS=tests/fixtures/sed/script + + +# Run hyperfine with the specified name and command and collect the results. +bench_run() +{ + hyperfine --command-name "$1" --warmup 2 --export-csv out.csv "$2" + + # Append the results sans-heading + sed 1d out.csv >>"$OUT" + + rm out.csv +} + +echo 'command,mean,stddev,median,user,system,min,max' >"$OUT" + +# Short line processing +awk 'BEGIN { for (i = 0; i < 50000000; i++) { print i } }' > lines.txt + +# No operation +bench_run no-op-short "$PROG '' lines.txt" + +# Log file processing + +# Create an access.log file with the specified number of lines +create_access_log() +{ + awk 'BEGIN { + for (i = 0; i < 5000000; i++) { + printf("192.168.%d.%d - - [01/Jan/2024:00:00:00 +0000] \"GET /index.html HTTP/1.1\" 200 1234 \"-\" \"Mozilla/5.0\"\n", int(i/256)%256, i%256); + } + }' > access.log +} + +# Create long file for simple commands +create_access_log 5000000 + +# No operation +bench_run access-log-no-op "$PROG '' access.log" + +# No substitution +bench_run access-log-no-subst "$PROG s/Chrome/Chromium/ access.log" + +# Substitution +bench_run access-log-subst "$PROG s/Mozilla/Chromium/ access.log" + +# No deletion +bench_run access-log-no-del "$PROG /Chrome/d access.log" + +# Full deletion +bench_run access-log-all-del "$PROG /Mozilla/d access.log" + +# Create shorter file for more complex commands +create_access_log 50000 + +# Transliteration +bench_run access-log-translit "$PROG y/0123456789/9876543210/ access.log" + +# Multiple substitutions +bench_run access-log-complex-sub "$PROG -f $SCRIPTS/http-log-redact.sed access.log" + +rm access.log + +# Remove \r +awk 'BEGIN { + for (i = 0; i < 1500000; i++) { + printf("line %d with windows endings\r\n", i); + } +}' > legacy_input.txt + +bench_run remove-cr "$PROG 's/\r$//' legacy_input.txt" + +rm legacy_input.txt + +# Genomic data cleanup + +awk 'BEGIN { + for (i = 0; i < 1000000; i++) { + chr = "chr" (1 + i % 22); + printf("%s\t%d\t%s\t.\t%s\t.\t.\n", chr, i * 100, (i % 2 ? "A" : "T"), (i % 2 ? "G" : "C")); + } +}' > genome.tsv + +CMD='/^#/d; s/\t\./\tNA/g; s/\.$/NA/' +bench_run genome-subst "$PROG '$CMD' genome.tsv" + +rm -f genome.tsv + +# Number fixups: remove thousands separator, change , into . +awk 'BEGIN { + for (i = 0; i < 700000; i++) { + euros = int(i % 10000); + cents = int(i % 100); + thousands = int(euros / 1000); + remainder = euros % 1000; + printf("%d.%03d,%02d\n", thousands, remainder, cents); + } +}' > finance.csv + +CMD='s/\([0-9]\)\.\([0-9]\)/\1\2/g;s/\([0-9]\),\([0-9]\)/\1.\2/g' +bench_run number-fix "$PROG '$CMD' finance.csv" + +rm -f finance.csv + +# Long script compilation +for i in $(seq 1 99) ; do + awk -v tag="$i" '{ gsub(/[tb:] [[:alnum:]_]+/, "&" tag); print }' $SCRIPTS/math.sed +done >long_script.sed + +bench_run long-script "echo -n '' | $PROG -f long_script.sed " + +rm long_script.sed + + +# Towers of Hanoi +bench_run hanoi "echo ':abcdefghijkl: : :' | $PROG -f $SCRIPTS/hanoi.sed" + +# Arbitrary precision arithmetic +bench_run factorial "echo 30\! | $PROG -f $SCRIPTS/math.sed" From b31d9d15dbcf5d26cd29726dd6fd66fe28f9ca26 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 12:56:18 +0300 Subject: [PATCH 21/42] Use memchr to speed up line splitting This improves performance up to 1.5 times, with no downside. no-op-short rust-sed is similarly fast as rust-sed-f0 access-log-no-op rust-sed is 1.5 times faster than rust-sed-f0 access-log-no-subst rust-sed is similarly fast as rust-sed-f0 access-log-subst rust-sed is similarly fast as rust-sed-f0 access-log-no-del rust-sed is 1.1 times faster than rust-sed-f0 access-log-all-del rust-sed is similarly fast as rust-sed-f0 access-log-translit rust-sed is 1.2 times faster than rust-sed-f0 access-log-complex-sub rust-sed is similarly fast as rust-sed-f0 remove-cr rust-sed is similarly fast as rust-sed-f0 genome-subst rust-sed is similarly fast as rust-sed-f0 number-fix rust-sed is 1.1 times faster than rust-sed-f0 long-script rust-sed is similarly fast as rust-sed-f0 hanoi rust-sed is similarly fast as rust-sed-f0 factorial rust-sed is similarly fast as rust-sed-f0 --- Cargo.lock | 2 ++ Cargo.toml | 2 ++ src/uu/sed/Cargo.toml | 1 + src/uu/sed/src/fast_io.rs | 11 +++++++---- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e87e1f3d..15c84739 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -703,6 +703,7 @@ dependencies = [ "ctor", "fancy-regex", "libc", + "memchr", "memmap2", "phf", "phf_codegen", @@ -896,6 +897,7 @@ version = "0.0.1" dependencies = [ "clap", "fancy-regex", + "memchr", "memmap2", "regex", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 871f72ca..61a7d8cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,7 @@ clap_complete = "4.5" clap_mangen = "0.2" fancy-regex = "0.14.0" libc = "0.2.153" +memchr = "2.7.4" memmap2 = "0.9" phf = "0.11.2" phf_codegen = "0.11.2" @@ -56,6 +57,7 @@ clap_complete = { workspace = true } clap_mangen = { workspace = true } ctor = "0.4.1" fancy-regex = { workspace = true } +memchr = { workspace = true } memmap2.workspace = true phf = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 6d94616e..8487ab22 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -15,6 +15,7 @@ categories = ["command-line-utilities"] [dependencies] clap = { workspace = true } fancy-regex = { workspace = true } +memchr = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 36bb53e9..1cc7e12d 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -14,6 +14,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use memchr::memchr; #[cfg(unix)] use memmap2::Mmap; @@ -69,10 +70,12 @@ impl<'a> MmapLineCursor<'a> { } let start = self.pos; - let mut end = start; - while end < self.data.len() && self.data[end] != b'\n' { - end += 1; - } + + let mut end = if let Some(pos) = memchr(b'\n', &self.data[start..]) { + pos + start + } else { + self.data.len() + }; if end < self.data.len() { end += 1; // include \n in full span From a4cee5127d91eeb5aab24bb23c04cdb451639982 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 16:15:24 +0300 Subject: [PATCH 22/42] Handle failed executions --- util/bench-compare.sh | 6 +++++- util/benchmark.sh | 12 ++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/util/bench-compare.sh b/util/bench-compare.sh index da0fe27a..0ee61b8d 100755 --- a/util/bench-compare.sh +++ b/util/bench-compare.sh @@ -17,7 +17,11 @@ paste -d , "$1" "$2" | awk -F, 'FNR != 1 { printf("%22s ", $3); - if ($4 / $12 > 1.1) + if (!$4) + printf("%12s encountered an error\n", $1); + else if (!$12) + printf("%12s encountered an error\n", $2); + else if ($4 / $12 > 1.1) printf("%12s is %5.1f times faster than %s\n", $2, $4 / $12, $1); else if ($12 / $4 > 1.1) printf("%12s is %5.1f times faster than %s\n", $1, $12 / $4, $2); diff --git a/util/benchmark.sh b/util/benchmark.sh index ddade413..ebbb02aa 100755 --- a/util/benchmark.sh +++ b/util/benchmark.sh @@ -22,14 +22,18 @@ SCRIPTS=tests/fixtures/sed/script # Run hyperfine with the specified name and command and collect the results. bench_run() { - hyperfine --command-name "$1" --warmup 2 --export-csv out.csv "$2" - - # Append the results sans-heading - sed 1d out.csv >>"$OUT" + if hyperfine --command-name "$1" --warmup 2 --export-csv out.csv "$2" ; then + # Output the results sans-heading. + sed 1d out.csv >>"$OUT" + else + # Unable to run; output a named empty record. + echo "$1,,,,,,," >>"$OUT" + fi rm out.csv } +# Shared heading echo 'command,mean,stddev,median,user,system,min,max' >"$OUT" # Short line processing From 25b84bda239d04e0b2261415df2740e06257876f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 18:38:26 +0300 Subject: [PATCH 23/42] Check at runtime for invalid s/// group regerences --- src/uu/sed/src/command.rs | 138 +++++++++++++++++++++---------------- src/uu/sed/src/compiler.rs | 2 +- 2 files changed, 80 insertions(+), 60 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 24a80388..d5643eac 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -19,7 +19,7 @@ use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; // For file descriptors and equivalent use std::rc::Rc; -use uucore::error::UResult; +use uucore::error::{UResult, USimpleError}; #[derive(Debug, Default, Clone)] /// Compilation and processing options provided mostly through the @@ -124,16 +124,34 @@ pub enum ReplacementPart { /// All specified replacements for an RE pub struct ReplacementTemplate { pub parts: Vec, + max_group_number: usize, // Maximum referenced group number (e.g. 8 for \8) } impl Default for ReplacementTemplate { /// Create an empty template. fn default() -> Self { - ReplacementTemplate { parts: Vec::new() } + ReplacementTemplate::new(Vec::new()) } } impl ReplacementTemplate { + /// Construct from the parts + pub fn new(parts: Vec) -> Self { + let max_group_number = parts + .iter() + .filter_map(|part| match part { + ReplacementPart::Group(n) => Some(*n), + _ => None, + }) + .max() + .unwrap_or(0); + + Self { + parts, + max_group_number: max_group_number.try_into().unwrap(), + } + } + /// Apply the template to the given RE captures. /// Example: /// let result = regex.replace_all(input, |caps: &Captures| { @@ -142,6 +160,18 @@ impl ReplacementTemplate { pub fn apply(&self, caps: &Captures) -> UResult { let mut result = String::new(); + // Invalid group numbers may end here through (unkown at compile time) + // reused REs. + if self.max_group_number > caps.len() - 1 { + return Err(USimpleError::new( + 2, + format!( + "invalid reference \\{} on `s' command's RHS", + self.max_group_number + ), + )); + } + for part in &self.parts { match part { ReplacementPart::Literal(s) => result.push_str(s), @@ -151,7 +181,6 @@ impl ReplacementTemplate { } ReplacementPart::Group(n) => { - // Compilation guarantees we only get valid group numbers result.push_str( caps.get((*n).try_into().unwrap()) .map_or("", |m| m.as_str()), @@ -162,21 +191,6 @@ impl ReplacementTemplate { Ok(result) } - - /// Returns the highest capture group number referenced in this template. - pub fn max_group_number(&self) -> u32 { - self.parts - .iter() - .filter_map(|part| { - if let ReplacementPart::Group(n) = part { - Some(*n) - } else { - None - } - }) - .max() - .unwrap_or(0) - } } #[derive(Debug, Default)] @@ -345,9 +359,7 @@ mod tests { #[test] // s/abc/hello/ fn test_literal_only() { - let template = ReplacementTemplate { - parts: vec![ReplacementPart::Literal("hello".into())], - }; + let template = ReplacementTemplate::new(vec![ReplacementPart::Literal("hello".into())]); let caps = caps_for("abc", "abc"); let result = template.apply(&caps).unwrap(); @@ -357,12 +369,10 @@ mod tests { #[test] // s/foo\d+/got: &/ fn test_whole_match() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("got: ".into()), - ReplacementPart::WholeMatch, - ], - }; + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("got: ".into()), + ReplacementPart::WholeMatch, + ]); let caps = caps_for(r"foo\d+", "foo42"); let result = template.apply(&caps).unwrap(); @@ -372,12 +382,10 @@ mod tests { #[test] // s/foo(\d+)/number: \1/ fn test_backreference() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("number: ".into()), - ReplacementPart::Group(1), - ], - }; + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("number: ".into()), + ReplacementPart::Group(1), + ]); let caps = caps_for(r"foo(\d+)", "foo42"); let result = template.apply(&caps).unwrap(); @@ -387,45 +395,57 @@ mod tests { #[test] // s/(\w+):(\d+)/key: \1, value: \2/ fn test_multiple_parts() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("key: ".into()), - ReplacementPart::Group(1), - ReplacementPart::Literal(", value: ".into()), - ReplacementPart::Group(2), - ], - }; + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("key: ".into()), + ReplacementPart::Group(1), + ReplacementPart::Literal(", value: ".into()), + ReplacementPart::Group(2), + ]); let caps = caps_for(r"(\w+):(\d+)", "x:123"); let result = template.apply(&caps).unwrap(); assert_eq!(result, "key: x, value: 123"); } + #[test] + // s/(\w+):(\d+)/key: \1, value: \3/ + fn test_invalid_group() { + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("key: ".into()), + ReplacementPart::Group(1), + ReplacementPart::Literal(", value: ".into()), + ReplacementPart::Group(3), + ]); + let caps = caps_for(r"(\w+):(\d+)", "x:123"); + + let result = template.apply(&caps); + assert!(result.is_err()); + + let msg = result.unwrap_err().to_string(); + assert!(msg.contains("invalid reference \\3")); + } + // max_group_number #[test] fn test_max_group_number_with_groups() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("a".into()), - ReplacementPart::Group(2), - ReplacementPart::WholeMatch, - ReplacementPart::Group(5), - ReplacementPart::Literal("z".into()), - ], - }; - assert_eq!(template.max_group_number(), 5); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("a".into()), + ReplacementPart::Group(2), + ReplacementPart::WholeMatch, + ReplacementPart::Group(5), + ReplacementPart::Literal("z".into()), + ]); + assert_eq!(template.max_group_number, 5); } #[test] fn test_max_group_number_without_groups() { - let template = ReplacementTemplate { - parts: vec![ - ReplacementPart::Literal("no".into()), - ReplacementPart::WholeMatch, - ReplacementPart::Literal("groups".into()), - ], - }; - assert_eq!(template.max_group_number(), 0); + let template = ReplacementTemplate::new(vec![ + ReplacementPart::Literal("no".into()), + ReplacementPart::WholeMatch, + ReplacementPart::Literal("groups".into()), + ]); + assert_eq!(template.max_group_number, 0); } // Transliteration diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index cdfc1354..0dd70039 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -762,7 +762,7 @@ pub fn compile_replacement( if !literal.is_empty() { parts.push(ReplacementPart::Literal(literal)); } - return Ok(ReplacementTemplate { parts }); + return Ok(ReplacementTemplate::new(parts)); } c => { From cbe314d01b2731ad54a42c0bddf4495a8bceb03f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 19:00:54 +0300 Subject: [PATCH 24/42] Catch invalid group references at compile time --- src/uu/sed/src/command.rs | 2 +- src/uu/sed/src/compiler.rs | 29 ++++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index d5643eac..6e06f30d 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -124,7 +124,7 @@ pub enum ReplacementPart { /// All specified replacements for an RE pub struct ReplacementTemplate { pub parts: Vec, - max_group_number: usize, // Maximum referenced group number (e.g. 8 for \8) + pub max_group_number: usize, // Highest used group number (e.g. 8 for \8) } impl Default for ReplacementTemplate { diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 0dd70039..ad21e76e 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -816,11 +816,21 @@ fn compile_subst_command( ); } - if pattern.is_empty() { - subst.regex = None; // Will be reuse saved RE at runtime. - } else { - // Compile regex with now known ignore_case flag. - subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; + // Compile regex with now known ignore_case flag. + subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; + + // Catch invalid group references at compile time, if possible. + if let Some(regex) = &subst.regex { + if subst.replacement.max_group_number > regex.captures_len() - 1 { + return compilation_error( + lines, + line, + format!( + "invalid reference \\{} on `s' command's RHS", + subst.replacement.max_group_number + ), + ); + } } cmd.data = CommandData::Substitution(subst); @@ -1989,6 +1999,15 @@ mod tests { } } + #[test] + fn test_compile_subst_invalid_group_reference() { + let (mut lines, mut chars) = make_providers(r"s/f(o)o/\2/"); + let mut cmd = Command::default(); + + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); + assert!(err.to_string().contains("invalid reference \\2")); + } + // bre_to_ere #[test] fn test_bre_group_translation() { From eb64104a21093f18c987f0c97fd73272bf4deed1 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 19:12:49 +0300 Subject: [PATCH 25/42] Support replacement group \0 as synonym for & --- README.md | 1 + src/uu/sed/src/compiler.rs | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 365064d5..8cbd8c9a 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ cargo run --release * In addition to `\n`, other escape sequences (octal, hex, C) are supported in the strings of the `y` command. Under POSIX these yield undefined behavior. +* The substitution command replacement group `\0` is a synonym for &. ### Supported BSD and GNU extensions * The second address in a range can be specified as a relative address with +N. diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index ad21e76e..5fdcfc30 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -706,14 +706,18 @@ pub fn compile_replacement( } match line.current() { - // \1 - \9 - c @ '1'..='9' => { + // \0 - \9 + c @ '0'..='9' => { let ref_num = c.to_digit(10).unwrap(); if !literal.is_empty() { parts.push(ReplacementPart::Literal(std::mem::take(&mut literal))); } - parts.push(ReplacementPart::Group(ref_num)); + if ref_num == 0 { + parts.push(ReplacementPart::WholeMatch); + } else { + parts.push(ReplacementPart::Group(ref_num)); + } line.advance(); } @@ -1806,6 +1810,18 @@ mod tests { assert!(matches!(&template.parts[1], ReplacementPart::WholeMatch)); } + #[test] + fn test_compile_replacement_whole_match_synonym() { + let (mut lines, mut chars) = make_providers(r"/The match was: \0/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 2); + assert!( + matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "The match was: ") + ); + assert!(matches!(&template.parts[1], ReplacementPart::WholeMatch)); + } + #[test] fn test_compile_replacement_ampersand() { let (mut lines, mut chars) = make_providers("/Simon \\& Garfunkel/"); From 7868f4f4572fead68756ae8d064317220251aa6a Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 20:26:34 +0300 Subject: [PATCH 26/42] Early exit in substitution This improves performance by 10% in the following benchmark case. Performance in all others remains the same. access-log-subst rust-sed is 1.1 times faster than rust-sed-eb64104 --- src/uu/sed/src/processor.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 7ac819b8..bd2522a5 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -225,6 +225,11 @@ fn substitute( } last_end = m.end(); + + // Early exit if only a specific occurrence (likely 1) needed replacing. + if count == sub.occurrence { + break; + } } // Handle substitution success. From f18f772f31b21660df54474dc3f0872ce53c8612 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 22:51:35 +0300 Subject: [PATCH 27/42] Try to specialize regex retrieval In theory, find, find_iter, captures, and captures_iter according to the values of sub.occurrence, sub.replacement.max_group_number could allow for better performance. In practice, specializing for the easiest case to use find did not show any improvement. no-op-short rust-sed is similarly fast as rust-sed-7868f4f access-log-no-op rust-sed is similarly fast as rust-sed-7868f4f access-log-no-subst rust-sed is similarly fast as rust-sed-7868f4f access-log-subst rust-sed is similarly fast as rust-sed-7868f4f access-log-no-del rust-sed is similarly fast as rust-sed-7868f4f access-log-all-del rust-sed is similarly fast as rust-sed-7868f4f access-log-translit rust-sed is similarly fast as rust-sed-7868f4f access-log-complex-sub rust-sed is similarly fast as rust-sed-7868f4f remove-cr rust-sed is similarly fast as rust-sed-7868f4f genome-subst rust-sed is similarly fast as rust-sed-7868f4f number-fix rust-sed is similarly fast as rust-sed-7868f4f long-script rust-sed is similarly fast as rust-sed-7868f4f hanoi rust-sed is similarly fast as rust-sed-7868f4f factorial rust-sed is similarly fast as rust-sed-7868f4f Consequently, this commit just documents the change and its results, and will be reverted. --- src/uu/sed/src/command.rs | 36 +++++++++++++++----- src/uu/sed/src/processor.rs | 67 ++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 32 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 6e06f30d..e2fb1620 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -13,7 +13,7 @@ use crate::named_writer::NamedWriter; -use fancy_regex::{Captures, Regex}; +use fancy_regex::{Captures, Match, Regex}; use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -155,9 +155,9 @@ impl ReplacementTemplate { /// Apply the template to the given RE captures. /// Example: /// let result = regex.replace_all(input, |caps: &Captures| { - /// template.apply(caps) }); + /// template.apply_captures(caps) }); /// Returns an error if a backreference in the template was not matched by the RE. - pub fn apply(&self, caps: &Captures) -> UResult { + pub fn apply_captures(&self, caps: &Captures) -> UResult { let mut result = String::new(); // Invalid group numbers may end here through (unkown at compile time) @@ -191,6 +191,24 @@ impl ReplacementTemplate { Ok(result) } + + /// Apply the template to the given RE single match. + pub fn apply_match(&self, m: &Match) -> String { + let mut result = String::new(); + + for part in &self.parts { + match part { + ReplacementPart::Literal(s) => result.push_str(s), + + ReplacementPart::WholeMatch => result.push_str(m.as_str()), + + ReplacementPart::Group(_) => { + panic!("unexpected Regex group replacement") + } + } + } + result + } } #[derive(Debug, Default)] @@ -352,7 +370,7 @@ mod tests { let template = ReplacementTemplate::default(); let caps = caps_for("foo", "foo"); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, ""); } @@ -362,7 +380,7 @@ mod tests { let template = ReplacementTemplate::new(vec![ReplacementPart::Literal("hello".into())]); let caps = caps_for("abc", "abc"); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "hello"); } @@ -375,7 +393,7 @@ mod tests { ]); let caps = caps_for(r"foo\d+", "foo42"); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "got: foo42"); } @@ -388,7 +406,7 @@ mod tests { ]); let caps = caps_for(r"foo(\d+)", "foo42"); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "number: 42"); } @@ -403,7 +421,7 @@ mod tests { ]); let caps = caps_for(r"(\w+):(\d+)", "x:123"); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "key: x, value: 123"); } @@ -418,7 +436,7 @@ mod tests { ]); let caps = caps_for(r"(\w+):(\d+)", "x:123"); - let result = template.apply(&caps); + let result = template.apply_captures(&caps); assert!(result.is_err()); let msg = result.unwrap_err().to_string(); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index bd2522a5..41dfc0ab 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -201,34 +201,55 @@ fn substitute( let regex = re_or_saved_re(&sub.regex, context)?; - for caps in regex.captures_iter(text) { - count += 1; - let caps = caps.map_err(|e| { - USimpleError::new( - 2, - format!("regular expression capture retrieval error: {}", e), - ) - })?; + match (sub.occurrence, sub.replacement.max_group_number) { + (1, 0) => { + let m = regex.find(text); + let m = m.map_err(|e| { + USimpleError::new( + 2, + format!("regular expression match retrieval error: {}", e), + ) + })?; + if let Some(m) = m { + result.push_str(&text[last_end..m.start()]); + + let replacement = sub.replacement.apply_match(&m); + result.push_str(&replacement); + replaced = true; + last_end = m.end(); + } + } + (_, _) => { + for caps in regex.captures_iter(text) { + count += 1; + let caps = caps.map_err(|e| { + USimpleError::new( + 2, + format!("regular expression capture retrieval error: {}", e), + ) + })?; - let m = caps.get(0).unwrap(); + let m = caps.get(0).unwrap(); - // Always write the unmatched text before this match. - result.push_str(&text[last_end..m.start()]); + // Always write the unmatched text before this match. + result.push_str(&text[last_end..m.start()]); - if sub.occurrence == 0 || count == sub.occurrence { - let replacement = sub.replacement.apply(&caps)?; - result.push_str(&replacement); - replaced = true; - } else { - // Not the target match — leave the match unchanged. - result.push_str(m.as_str()); - } + if sub.occurrence == 0 || count == sub.occurrence { + let replacement = sub.replacement.apply_captures(&caps)?; + result.push_str(&replacement); + replaced = true; + } else { + // Not the target match — leave the match unchanged. + result.push_str(m.as_str()); + } - last_end = m.end(); + last_end = m.end(); - // Early exit if only a specific occurrence (likely 1) needed replacing. - if count == sub.occurrence { - break; + // Early exit if only a specific occurrence (likely 1) needed replacing. + if count == sub.occurrence { + break; + } + } } } From a6010ae54be658d5b400935a3b279b6dca31864a Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 22:54:18 +0300 Subject: [PATCH 28/42] Revert "Try to specialize regex retrieval" This reverts commit f18f772f31b21660df54474dc3f0872ce53c8612. No performance improvement was seen. --- src/uu/sed/src/command.rs | 36 +++++--------------- src/uu/sed/src/processor.rs | 67 +++++++++++++------------------------ 2 files changed, 32 insertions(+), 71 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index e2fb1620..6e06f30d 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -13,7 +13,7 @@ use crate::named_writer::NamedWriter; -use fancy_regex::{Captures, Match, Regex}; +use fancy_regex::{Captures, Regex}; use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -155,9 +155,9 @@ impl ReplacementTemplate { /// Apply the template to the given RE captures. /// Example: /// let result = regex.replace_all(input, |caps: &Captures| { - /// template.apply_captures(caps) }); + /// template.apply(caps) }); /// Returns an error if a backreference in the template was not matched by the RE. - pub fn apply_captures(&self, caps: &Captures) -> UResult { + pub fn apply(&self, caps: &Captures) -> UResult { let mut result = String::new(); // Invalid group numbers may end here through (unkown at compile time) @@ -191,24 +191,6 @@ impl ReplacementTemplate { Ok(result) } - - /// Apply the template to the given RE single match. - pub fn apply_match(&self, m: &Match) -> String { - let mut result = String::new(); - - for part in &self.parts { - match part { - ReplacementPart::Literal(s) => result.push_str(s), - - ReplacementPart::WholeMatch => result.push_str(m.as_str()), - - ReplacementPart::Group(_) => { - panic!("unexpected Regex group replacement") - } - } - } - result - } } #[derive(Debug, Default)] @@ -370,7 +352,7 @@ mod tests { let template = ReplacementTemplate::default(); let caps = caps_for("foo", "foo"); - let result = template.apply_captures(&caps).unwrap(); + let result = template.apply(&caps).unwrap(); assert_eq!(result, ""); } @@ -380,7 +362,7 @@ mod tests { let template = ReplacementTemplate::new(vec![ReplacementPart::Literal("hello".into())]); let caps = caps_for("abc", "abc"); - let result = template.apply_captures(&caps).unwrap(); + let result = template.apply(&caps).unwrap(); assert_eq!(result, "hello"); } @@ -393,7 +375,7 @@ mod tests { ]); let caps = caps_for(r"foo\d+", "foo42"); - let result = template.apply_captures(&caps).unwrap(); + let result = template.apply(&caps).unwrap(); assert_eq!(result, "got: foo42"); } @@ -406,7 +388,7 @@ mod tests { ]); let caps = caps_for(r"foo(\d+)", "foo42"); - let result = template.apply_captures(&caps).unwrap(); + let result = template.apply(&caps).unwrap(); assert_eq!(result, "number: 42"); } @@ -421,7 +403,7 @@ mod tests { ]); let caps = caps_for(r"(\w+):(\d+)", "x:123"); - let result = template.apply_captures(&caps).unwrap(); + let result = template.apply(&caps).unwrap(); assert_eq!(result, "key: x, value: 123"); } @@ -436,7 +418,7 @@ mod tests { ]); let caps = caps_for(r"(\w+):(\d+)", "x:123"); - let result = template.apply_captures(&caps); + let result = template.apply(&caps); assert!(result.is_err()); let msg = result.unwrap_err().to_string(); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 41dfc0ab..bd2522a5 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -201,55 +201,34 @@ fn substitute( let regex = re_or_saved_re(&sub.regex, context)?; - match (sub.occurrence, sub.replacement.max_group_number) { - (1, 0) => { - let m = regex.find(text); - let m = m.map_err(|e| { - USimpleError::new( - 2, - format!("regular expression match retrieval error: {}", e), - ) - })?; - if let Some(m) = m { - result.push_str(&text[last_end..m.start()]); - - let replacement = sub.replacement.apply_match(&m); - result.push_str(&replacement); - replaced = true; - last_end = m.end(); - } - } - (_, _) => { - for caps in regex.captures_iter(text) { - count += 1; - let caps = caps.map_err(|e| { - USimpleError::new( - 2, - format!("regular expression capture retrieval error: {}", e), - ) - })?; + for caps in regex.captures_iter(text) { + count += 1; + let caps = caps.map_err(|e| { + USimpleError::new( + 2, + format!("regular expression capture retrieval error: {}", e), + ) + })?; - let m = caps.get(0).unwrap(); + let m = caps.get(0).unwrap(); - // Always write the unmatched text before this match. - result.push_str(&text[last_end..m.start()]); + // Always write the unmatched text before this match. + result.push_str(&text[last_end..m.start()]); - if sub.occurrence == 0 || count == sub.occurrence { - let replacement = sub.replacement.apply_captures(&caps)?; - result.push_str(&replacement); - replaced = true; - } else { - // Not the target match — leave the match unchanged. - result.push_str(m.as_str()); - } + if sub.occurrence == 0 || count == sub.occurrence { + let replacement = sub.replacement.apply(&caps)?; + result.push_str(&replacement); + replaced = true; + } else { + // Not the target match — leave the match unchanged. + result.push_str(m.as_str()); + } - last_end = m.end(); + last_end = m.end(); - // Early exit if only a specific occurrence (likely 1) needed replacing. - if count == sub.occurrence { - break; - } - } + // Early exit if only a specific occurrence (likely 1) needed replacing. + if count == sub.occurrence { + break; } } From 5fad204a09b72a82fa30af59ac93fed8bdb7024a Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 25 May 2025 11:19:02 +0300 Subject: [PATCH 29/42] Use regex::bytes whenever possible This improves one benchmark case by 20%: access-log-all-del rust-sed is 1.2 times faster than rust-sed-f18f772 Unfortunatelly, all other cases remain unaffected. --- Cargo.lock | 2 + Cargo.toml | 2 + src/uu/sed/Cargo.toml | 1 + src/uu/sed/src/command.rs | 33 ++-- src/uu/sed/src/compiler.rs | 39 ++--- src/uu/sed/src/fast_io.rs | 48 ++++-- src/uu/sed/src/fast_regex.rs | 292 +++++++++++++++++++++++++++++++++++ src/uu/sed/src/processor.rs | 29 ++-- src/uu/sed/src/sed.rs | 1 + 9 files changed, 384 insertions(+), 63 deletions(-) create mode 100644 src/uu/sed/src/fast_regex.rs diff --git a/Cargo.lock b/Cargo.lock index 15c84739..10be09f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -705,6 +705,7 @@ dependencies = [ "libc", "memchr", "memmap2", + "once_cell", "phf", "phf_codegen", "pretty_assertions", @@ -899,6 +900,7 @@ dependencies = [ "fancy-regex", "memchr", "memmap2", + "once_cell", "regex", "tempfile", "uucore", diff --git a/Cargo.toml b/Cargo.toml index 61a7d8cd..ca184677 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ fancy-regex = "0.14.0" libc = "0.2.153" memchr = "2.7.4" memmap2 = "0.9" +once_cell = "1.21.3" phf = "0.11.2" phf_codegen = "0.11.2" rand = { version = "0.9", features = ["small_rng"] } @@ -59,6 +60,7 @@ ctor = "0.4.1" fancy-regex = { workspace = true } memchr = { workspace = true } memmap2.workspace = true +once_cell = { workspace = true } phf = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } sysinfo = { workspace = true } diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 8487ab22..acf27186 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -19,6 +19,7 @@ memchr = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } +once_cell = { workspace = true } uucore = { workspace = true } [lib] diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 6e06f30d..8f8dccd7 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -11,9 +11,9 @@ // TODO: remove when compile is implemented #![allow(dead_code)] +use crate::fast_regex::{Captures, Regex}; use crate::named_writer::NamedWriter; -use fancy_regex::{Captures, Regex}; use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -177,14 +177,12 @@ impl ReplacementTemplate { ReplacementPart::Literal(s) => result.push_str(s), ReplacementPart::WholeMatch => { - result.push_str(caps.get(0).map_or("", |m| m.as_str())); + result.push_str(caps.get(0)?.map(|m| m.as_str()).unwrap_or("")); } ReplacementPart::Group(n) => { - result.push_str( - caps.get((*n).try_into().unwrap()) - .map_or("", |m| m.as_str()), - ); + let i: usize = (*n).try_into().unwrap(); + result.push_str(caps.get(i)?.map(|m| m.as_str()).unwrap_or("")); } } } @@ -336,12 +334,13 @@ pub struct InputAction { #[cfg(test)] mod tests { use super::*; + use crate::fast_io::IOChunk; // Return the captures for the RE applied to the specified string - fn caps_for<'a>(re: &str, input: &'a str) -> Captures<'a> { + fn caps_for<'a>(re: &str, chunk: &'a mut IOChunk) -> Captures<'a> { Regex::new(re) .unwrap() - .captures(input) + .captures(chunk) .unwrap() .expect("captures") } @@ -350,7 +349,8 @@ mod tests { // s/foo// fn test_empty_template() { let template = ReplacementTemplate::default(); - let caps = caps_for("foo", "foo"); + let input = &mut IOChunk::from_str("foo"); + let caps = caps_for("foo", input); let result = template.apply(&caps).unwrap(); assert_eq!(result, ""); @@ -360,7 +360,8 @@ mod tests { // s/abc/hello/ fn test_literal_only() { let template = ReplacementTemplate::new(vec![ReplacementPart::Literal("hello".into())]); - let caps = caps_for("abc", "abc"); + let input = &mut IOChunk::from_str("abc"); + let caps = caps_for("abc", input); let result = template.apply(&caps).unwrap(); assert_eq!(result, "hello"); @@ -373,7 +374,8 @@ mod tests { ReplacementPart::Literal("got: ".into()), ReplacementPart::WholeMatch, ]); - let caps = caps_for(r"foo\d+", "foo42"); + let input = &mut IOChunk::from_str("foo42"); + let caps = caps_for(r"foo\d+", input); let result = template.apply(&caps).unwrap(); assert_eq!(result, "got: foo42"); @@ -386,7 +388,8 @@ mod tests { ReplacementPart::Literal("number: ".into()), ReplacementPart::Group(1), ]); - let caps = caps_for(r"foo(\d+)", "foo42"); + let input = &mut IOChunk::from_str("foo42"); + let caps = caps_for(r"foo(\d+)", input); let result = template.apply(&caps).unwrap(); assert_eq!(result, "number: 42"); @@ -401,7 +404,8 @@ mod tests { ReplacementPart::Literal(", value: ".into()), ReplacementPart::Group(2), ]); - let caps = caps_for(r"(\w+):(\d+)", "x:123"); + let input = &mut IOChunk::from_str("x:123"); + let caps = caps_for(r"(\w+):(\d+)", input); let result = template.apply(&caps).unwrap(); assert_eq!(result, "key: x, value: 123"); @@ -416,7 +420,8 @@ mod tests { ReplacementPart::Literal(", value: ".into()), ReplacementPart::Group(3), ]); - let caps = caps_for(r"(\w+):(\d+)", "x:123"); + let input = &mut IOChunk::from_str("x:123"); + let caps = caps_for(r"(\w+):(\d+)", input); let result = template.apply(&caps); assert!(result.is_err()); diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 5fdcfc30..d3a7c28d 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -15,10 +15,10 @@ use crate::command::{ use crate::delimited_parser::{ compilation_error, parse_char_escape, parse_regex, parse_transliteration, }; +use crate::fast_regex::Regex; use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; -use fancy_regex::Regex; use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -1167,6 +1167,7 @@ fn lookup_command(cmd: char) -> Option<&'static CommandSpec> { #[cfg(test)] mod tests { use super::*; + use crate::fast_io::IOChunk; // Return an empty line provider and a char provider for the specified str. fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { @@ -1384,8 +1385,8 @@ mod tests { let regex = compile_regex(&lines, &chars, "abc", &ctx(), false) .unwrap() .expect("regex should be present"); - assert!(regex.is_match("abc").unwrap()); - assert!(!regex.is_match("ABC").unwrap()); + assert!(regex.is_match(&mut IOChunk::from_str("abc")).unwrap()); + assert!(!regex.is_match(&mut IOChunk::from_str("ABC")).unwrap()); } #[test] @@ -1394,9 +1395,9 @@ mod tests { let regex = compile_regex(&lines, &chars, "abc", &ctx(), true) .unwrap() .expect("regex should be present"); - assert!(regex.is_match("abc").unwrap()); - assert!(regex.is_match("ABC").unwrap()); - assert!(regex.is_match("AbC").unwrap()); + assert!(regex.is_match(&mut IOChunk::from_str("abc")).unwrap()); + assert!(regex.is_match(&mut IOChunk::from_str("ABC")).unwrap()); + assert!(regex.is_match(&mut IOChunk::from_str("AbC")).unwrap()); } #[test] @@ -1444,7 +1445,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("hello").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("hello")).unwrap()); } else { panic!("expected Regex address value"); } @@ -1456,7 +1457,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("hello").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("hello")).unwrap()); } else { panic!("expected Regex address value"); } @@ -1468,7 +1469,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(!re.is_match("helio").unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("helio")).unwrap()); } else { panic!("expected Regex address value"); } @@ -1480,7 +1481,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("hello").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("hello")).unwrap()); } else { panic!("expected Regex address value"); } @@ -1492,7 +1493,7 @@ mod tests { let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(Some(re)) = addr.value { - assert!(re.is_match("HELLO").unwrap()); // case-insensitive + assert!(re.is_match(&mut IOChunk::from_str("HELLO")).unwrap()); // case-insensitive } else { panic!("expected Regex address value"); } @@ -1583,8 +1584,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("foo").unwrap()); - assert!(!re.is_match("bar").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("foo")).unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("bar")).unwrap()); } else { panic!("expected a regex address"); }; @@ -1603,8 +1604,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("foo").unwrap()); - assert!(!re.is_match("bar").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("foo")).unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("bar")).unwrap()); } else { panic!("expected a regex address"); } @@ -1614,8 +1615,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr2.as_ref().unwrap().value { - assert!(re.is_match("bar").unwrap()); - assert!(!re.is_match("foo").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("bar")).unwrap()); + assert!(!re.is_match(&mut IOChunk::from_str("foo")).unwrap()); } else { panic!("expected a regex address"); }; @@ -1633,8 +1634,8 @@ mod tests { AddressType::Re )); if let AddressValue::Regex(Some(re)) = &cmd.borrow().addr1.as_ref().unwrap().value { - assert!(re.is_match("FOO").unwrap()); - assert!(re.is_match("foo").unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("FOO")).unwrap()); + assert!(re.is_match(&mut IOChunk::from_str("foo")).unwrap()); } else { panic!("expected a regex address with case-insensitive match"); }; diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 1cc7e12d..2bc678d3 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -18,6 +18,7 @@ use memchr::memchr; #[cfg(unix)] use memmap2::Mmap; +use std::cell::Cell; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; @@ -138,7 +139,7 @@ impl ReadLineCursor { /// As chunk of data that is input and can be output, often very efficiently #[derive(Debug, PartialEq, Eq)] pub struct IOChunk<'a> { - utf8_verified: bool, // True if the contents are valid UTF-8 + utf8_verified: Cell, // True if the contents are valid UTF-8 content: IOChunkContent<'a>, } @@ -146,14 +147,14 @@ impl<'a> IOChunk<'a> { /// Construct an IOChunk from the given content fn from_content(content: IOChunkContent<'a>) -> Self { Self { - utf8_verified: false, + utf8_verified: Cell::new(false), content, } } /// Clear the object's contents, converting it into Owned if needed. pub fn clear(&mut self) { - self.utf8_verified = true; + self.utf8_verified.set(true); match &mut self.content { IOChunkContent::Owned { content, @@ -185,10 +186,19 @@ impl<'a> IOChunk<'a> { } } + #[cfg(test)] + /// Create an Owned newline-terminated IOChunk from a string. + pub fn from_str(s: &str) -> Self { + IOChunk { + content: IOChunkContent::new_owned(s.to_string(), true), + utf8_verified: Cell::new(false), + } + } + /// Set the object's contents to the specified string. /// Convert it into Owned if needed. pub fn set_to_string(&mut self, new_content: String, add_newline: bool) { - self.utf8_verified = true; + self.utf8_verified.set(true); match &mut self.content { IOChunkContent::Owned { content, @@ -206,16 +216,16 @@ impl<'a> IOChunk<'a> { } /// Return the content as a str. - pub fn as_str(&mut self) -> Result<&str, Box> { + pub fn as_str(&self) -> Result<&str, Box> { match &self.content { #[cfg(unix)] IOChunkContent::MmapInput { content, .. } => { - if self.utf8_verified { + if self.utf8_verified.get() { // Use cached result Ok(unsafe { self.content.as_str_unchecked() }) } else { let result = str::from_utf8(content); - self.utf8_verified = true; + self.utf8_verified.set(true); result.map_err(|e| USimpleError::new(1, e.to_string())) } } @@ -223,6 +233,15 @@ impl<'a> IOChunk<'a> { } } + /// Return the raw byte content (always safe). + pub fn as_bytes(&self) -> &[u8] { + match &self.content { + #[cfg(unix)] + IOChunkContent::MmapInput { content, .. } => content, + IOChunkContent::Owned { content, .. } => content.as_bytes(), + } + } + /// Convert content to the Owned variant if it's not already. /// Fails if the conversion to UTF-8 fails. pub fn ensure_owned(&mut self) -> Result<(), Box> { @@ -235,7 +254,7 @@ impl<'a> IOChunk<'a> { let has_newline = full_span.last().copied() == Some(b'\n'); self.content = IOChunkContent::new_owned(valid_str.to_string(), has_newline); - self.utf8_verified = true; + self.utf8_verified.set(true); Ok(()) } Err(e) => Err(USimpleError::new(1, e.to_string())), @@ -937,7 +956,7 @@ mod tests { { assert_eq!(content, "first line"); assert!(has_newline); - assert!(!utf8_verified); + assert!(!utf8_verified.get()); assert!(!last_line); } else { panic!("Expected IOChunkContent::Owned"); @@ -963,7 +982,7 @@ mod tests { panic!("Expected IOChunkContent::Owned"); } - if let Some((mut content, last_line)) = reader.get_line()? { + if let Some((content, last_line)) = reader.get_line()? { assert_eq!(content.as_str().unwrap(), "last line"); assert!(last_line); } else { @@ -1001,7 +1020,7 @@ mod tests { { assert_eq!(content, b"first line"); assert_eq!(full_span, b"first line\n"); - assert!(!utf8_verified); + assert!(!utf8_verified.get()); assert!(!last_line); } else { panic!("Expected IOChunkContent::MapInput"); @@ -1021,15 +1040,16 @@ mod tests { { assert_eq!(content, b"second line"); assert_eq!(full_span, b"second line\n"); - assert!(!utf8_verified); + assert!(!utf8_verified.get()); assert!(!last_line); } else { panic!("Expected IOChunkContent::MapInput"); } - if let Some((mut content, last_line)) = reader.get_line()? { + if let Some((content, last_line)) = reader.get_line()? { + assert_eq!(content.as_bytes(), b"last line"); assert_eq!(content.as_str().unwrap(), "last line"); - assert!(content.utf8_verified); + assert!(content.utf8_verified.get()); assert!(last_line); // Cached version assert_eq!(content.as_str().unwrap(), "last line"); diff --git a/src/uu/sed/src/fast_regex.rs b/src/uu/sed/src/fast_regex.rs new file mode 100644 index 00000000..3375bd65 --- /dev/null +++ b/src/uu/sed/src/fast_regex.rs @@ -0,0 +1,292 @@ +// A unified interface to byte and fancy Regex +// +// This allows using byte Regex when possible, resorting to the +// slower fancy_regex crate when needed. +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use fancy_regex::{ + CaptureMatches as FancyCaptureMatches, Captures as FancyCaptures, Regex as FancyRegex, +}; +use once_cell::sync::Lazy; +use regex::bytes::{ + CaptureMatches as ByteCaptureMatches, Captures as ByteCaptures, Regex as ByteRegex, +}; +use std::error::Error; +use uucore::error::{UResult, USimpleError}; + +use crate::fast_io::IOChunk; + +#[derive(Clone, Debug)] +/// A regular expression that can be implemented through Byte or Fancy-Regex +// This enables a fast bytes path where possible. +pub enum Regex { + Byte(ByteRegex), + Fancy(FancyRegex), +} + +impl Regex { + /// Construct the most efficient engine possible + pub fn new(pattern: &str) -> Result> { + static NEEDS_FANCY_RE: Lazy = Lazy::new(|| { + regex::Regex::new( + r"(?x) # Turn on verbose mode + ( # An ASCII-incompatible RE + ( ^ | [^\\] ) # Non-escaped, so BOL or not \ + ( # A potentially incompatible match + \. # . matches any Unicode character + | \[\^ # Bracketed negative character class + | \(\?i # Case insensitive is done on Unicode + | \\[WwDdSsBbPp] # Unicode classes + | \\[0-9] # Only fancy supports back-references + ) + ) + | [^\x01-\x7f] # Or any non-ASCII character in the RE + ", + ) + .unwrap() + }); + + if NEEDS_FANCY_RE.is_match(pattern) { + Ok(Self::Fancy(FancyRegex::new(pattern)?)) + } else { + Ok(Self::Byte(ByteRegex::new(pattern)?)) + } + } + + /// Return true if this is a byte-based regex. + pub fn is_byte(&self) -> bool { + matches!(self, Regex::Byte(_)) + } + + /// Check if the regex matches the content of the IOChunk. + pub fn is_match(&self, chunk: &mut IOChunk) -> UResult { + match self { + Regex::Byte(re) => Ok(re.is_match(chunk.as_bytes())), + Regex::Fancy(re) => { + let text = chunk.as_str()?; + re.is_match(text) + .map_err(|e| USimpleError::new(2, e.to_string())) + } + } + } + + /// Return an iterator over capture groups. + pub fn captures_iter<'t>(&'t self, chunk: &'t IOChunk) -> UResult> { + match self { + Regex::Byte(re) => Ok(CaptureMatches::Byte(re.captures_iter(chunk.as_bytes()))), + Regex::Fancy(re) => { + let text = chunk.as_str()?; + Ok(CaptureMatches::Fancy(re.captures_iter(text))) + } + } + } + + /// Return the number of capture groups, including group 0. + pub fn captures_len(&self) -> usize { + match self { + Regex::Byte(re) => re.captures_len(), + Regex::Fancy(re) => re.captures_len(), + } + } + + /// Return the elements of the first capture. + pub fn captures<'t>(&self, chunk: &'t IOChunk) -> UResult>> { + match self { + Regex::Byte(re) => { + let bytes = chunk.as_bytes(); + Ok(re.captures(bytes).map(Captures::Byte)) + } + + Regex::Fancy(re) => { + let text = chunk.as_str()?; + match re.captures(text) { + Ok(Some(caps)) => Ok(Some(Captures::Fancy(caps))), + Ok(None) => Ok(None), + Err(e) => Err(USimpleError::new(2, e.to_string())), + } + } + } + } +} + +/// Unified enum for holding either byte or fancy capture iterators. +pub enum CaptureMatches<'t> { + Byte(ByteCaptureMatches<'t, 't>), + Fancy(FancyCaptureMatches<'t, 't>), +} + +impl<'t> Iterator for CaptureMatches<'t> { + type Item = UResult>; + + fn next(&mut self) -> Option { + match self { + CaptureMatches::Byte(iter) => iter.next().map(|caps| Ok(Captures::Byte(caps))), + CaptureMatches::Fancy(iter) => match iter.next() { + Some(Ok(caps)) => Some(Ok(Captures::Fancy(caps))), + Some(Err(e)) => Some(Err(USimpleError::new( + 2, + format!("error retrieving RE captures: {}", e), + ))), + None => None, + }, + } + } +} + +#[derive(Debug)] +/// Result type for RE capture get(n) +pub struct Match<'t> { + start: usize, // Match start + end: usize, // Match end + text: &'t str, // Actual match +} + +/// Provide interface compatible with Regex::Match. +impl<'t> Match<'t> { + pub fn start(&self) -> usize { + self.start + } + + pub fn end(&self) -> usize { + self.end + } + + pub fn as_str(&self) -> &'t str { + self.text + } +} + +/// Provide interface compatible with Regex::Captures. +pub enum Captures<'t> { + Byte(ByteCaptures<'t>), + Fancy(FancyCaptures<'t>), +} + +impl<'t> Captures<'t> { + /// Get capture group at index `i` + /// Returns Ok(None) if the group didn't match. + /// Returns Err if UTF-8 conversion fails (in Byte variant). + pub fn get(&self, i: usize) -> UResult>> { + match self { + Captures::Byte(caps) => match caps.get(i) { + Some(m) => Ok(Some(Match { + start: m.start(), + end: m.end(), + text: std::str::from_utf8(m.as_bytes()) + .map_err(|e| USimpleError::new(1, e.to_string()))?, + })), + None => Ok(None), + }, + Captures::Fancy(caps) => match caps.get(i) { + Some(m) => Ok(Some(Match { + start: m.start(), + end: m.end(), + text: m.as_str(), + })), + None => Ok(None), + }, + } + } + + /// Return the number of capture groups (including group 0). + pub fn len(&self) -> usize { + match self { + Captures::Byte(caps) => caps.len(), + Captures::Fancy(caps) => caps.len(), + } + } + + /// Return true if there are no captures. + // Unused, but provided for completeness. + pub fn is_empty(&self) -> bool { + match self { + Captures::Byte(caps) => caps.len() == 0, + Captures::Fancy(caps) => caps.len() == 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn assert_byte(pattern: &str) { + let re = Regex::new(pattern).unwrap(); + assert!(re.is_byte(), "Pattern {:?} should use Byte engine", pattern); + } + + fn assert_fancy(pattern: &str) { + let re = Regex::new(pattern).unwrap(); + assert!( + !re.is_byte(), + "Pattern {:?} should use Fancy engine", + pattern + ); + } + + #[test] + fn selects_byte_regex_for_simple_ascii() { + assert_byte(r"foo"); + assert_byte(r"foo|bar"); + assert_byte(r"^foo[0-9]+bar$"); + // Escaped Unicode triggers shouldn't trigger RE. + assert_byte(r"\."); + assert_byte(r"\[^x]"); + assert_byte(r"\\(?i)"); + assert_byte(r"\\w"); + } + + #[test] + fn selects_fancy_for_unicode_class_bol() { + assert_fancy(r"\p{L}+"); // Unicode letter class + assert_fancy(r"\W"); // \W is Unicode-aware. + assert_fancy(r"\S+"); // \S is Unicode-aware. + assert_fancy(r"\d"); // \d includes all Unicode digits. + } + + #[test] + fn selects_fancy_for_unicode_class_non_bol() { + assert_fancy(r"x\p{L}+"); // Unicode letter class + assert_fancy(r"x\W"); // \W is Unicode-aware. + assert_fancy(r"x\S+"); // \S is Unicode-aware. + assert_fancy(r"x\d"); // \d includes all Unicode digits. + } + + #[test] + fn selects_fancy_for_dot() { + assert_fancy(r"."); // Dot matches any Unicode char. + } + + #[test] + fn selects_fancy_for_inline_flags() { + assert_fancy(r"(?i)abc"); // Unicode case-insensitive + } + + #[test] + fn selects_fancy_for_backrefs() { + assert_fancy(r"(\w+):\1"); // back-reference \1 + } + + #[test] + fn selects_fancy_for_non_ascii_literals() { + assert_fancy("naïve"); // Contains literal non-ASCII. + assert_fancy("café"); // Contains literal non-ASCII. + } + + #[test] + fn handles_invalid_regex_gracefully() { + let err = Regex::new("(").unwrap_err().to_string(); + assert!( + err.contains("unclosed group") || err.contains("error parsing"), + "Unexpected error: {}", + err + ); + } +} diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index bd2522a5..83a574bb 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -13,9 +13,10 @@ use crate::command::{ ProcessingContext, Substitution, Transliteration, }; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; +use crate::fast_regex::Regex; use crate::in_place::InPlace; use crate::named_writer; -use fancy_regex::Regex; + use std::cell::RefCell; use std::io::{self, IsTerminal}; use std::path::PathBuf; @@ -32,10 +33,7 @@ fn match_address( AddressType::Re => { if let AddressValue::Regex(ref re) = addr.value { let regex = re_or_saved_re(re, context)?; - let matched = regex.is_match(pattern.as_str()?).map_err(|e| { - USimpleError::new(2, format!("regular expression match error: {}", e)) - })?; - Ok(matched) + regex.is_match(pattern) } else { Ok(false) } @@ -197,23 +195,22 @@ fn substitute( let mut result = String::new(); let mut replaced = false; - let text = pattern.as_str()?; + let mut text: Option<&str> = None; let regex = re_or_saved_re(&sub.regex, context)?; - for caps in regex.captures_iter(text) { + // Iterate over multiple captures of the RE in the pattern. + for caps in regex.captures_iter(pattern)? { count += 1; - let caps = caps.map_err(|e| { - USimpleError::new( - 2, - format!("regular expression capture retrieval error: {}", e), - ) - })?; + let caps = caps?; - let m = caps.get(0).unwrap(); + let m = caps.get(0)?.unwrap(); // Always write the unmatched text before this match. - result.push_str(&text[last_end..m.start()]); + if text.is_none() { + text = Some(pattern.as_str()?); + } + result.push_str(&text.unwrap()[last_end..m.start()]); if sub.occurrence == 0 || count == sub.occurrence { let replacement = sub.replacement.apply(&caps)?; @@ -234,7 +231,7 @@ fn substitute( // Handle substitution success. if replaced { - result.push_str(&text[last_end..]); + result.push_str(&text.unwrap()[last_end..]); pattern.set_to_string(result, pattern.is_newline_terminated()); diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 13f3a1b3..d61bc96c 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -12,6 +12,7 @@ pub mod command; pub mod compiler; pub mod delimited_parser; pub mod fast_io; +pub mod fast_regex; pub mod in_place; pub mod named_writer; pub mod processor; From 90d60b87a69b3d4107b25184fafc9ea5fc3cbd09 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 24 May 2025 22:51:35 +0300 Subject: [PATCH 30/42] Try again to specialize regex retrieval The hope is that the now available regex::bytes will run find() faster than captures_iter(). Indeed, using find() increases the performance of several benchmark cases by 2-12%. no-op-short rust-sed is 1.02 times faster than rust-sed-5fad204 access-log-no-op rust-sed is 1.01 times faster than rust-sed-5fad204 access-log-no-subst rust-sed is 1.08 times faster than rust-sed-5fad204 access-log-subst rust-sed is 1.12 times faster than rust-sed-5fad204 access-log-translit rust-sed is 1.02 times faster than rust-sed-5fad204 access-log-complex-sub rust-sed is 1.04 times faster than rust-sed-5fad204 remove-cr rust-sed is 1.12 times faster than rust-sed-5fad204 genome-subst rust-sed is 1.02 times faster than rust-sed-5fad204 hanoi rust-sed is 1.01 times faster than rust-sed-5fad204 All other cases remain the same. --- src/uu/sed/src/command.rs | 36 +++++++++++++++------ src/uu/sed/src/fast_regex.rs | 33 +++++++++++++++++++ src/uu/sed/src/processor.rs | 62 +++++++++++++++++++++++------------- 3 files changed, 99 insertions(+), 32 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 8f8dccd7..28e30e0b 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -11,7 +11,7 @@ // TODO: remove when compile is implemented #![allow(dead_code)] -use crate::fast_regex::{Captures, Regex}; +use crate::fast_regex::{Captures, Match, Regex}; use crate::named_writer::NamedWriter; use std::borrow::Cow; @@ -155,9 +155,9 @@ impl ReplacementTemplate { /// Apply the template to the given RE captures. /// Example: /// let result = regex.replace_all(input, |caps: &Captures| { - /// template.apply(caps) }); + /// template.apply_captures(caps) }); /// Returns an error if a backreference in the template was not matched by the RE. - pub fn apply(&self, caps: &Captures) -> UResult { + pub fn apply_captures(&self, caps: &Captures) -> UResult { let mut result = String::new(); // Invalid group numbers may end here through (unkown at compile time) @@ -189,6 +189,24 @@ impl ReplacementTemplate { Ok(result) } + + /// Apply the template to the given RE single match. + pub fn apply_match(&self, m: &Match) -> String { + let mut result = String::new(); + + for part in &self.parts { + match part { + ReplacementPart::Literal(s) => result.push_str(s), + + ReplacementPart::WholeMatch => result.push_str(m.as_str()), + + ReplacementPart::Group(_) => { + panic!("unexpected Regex group replacement") + } + } + } + result + } } #[derive(Debug, Default)] @@ -352,7 +370,7 @@ mod tests { let input = &mut IOChunk::from_str("foo"); let caps = caps_for("foo", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, ""); } @@ -363,7 +381,7 @@ mod tests { let input = &mut IOChunk::from_str("abc"); let caps = caps_for("abc", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "hello"); } @@ -377,7 +395,7 @@ mod tests { let input = &mut IOChunk::from_str("foo42"); let caps = caps_for(r"foo\d+", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "got: foo42"); } @@ -391,7 +409,7 @@ mod tests { let input = &mut IOChunk::from_str("foo42"); let caps = caps_for(r"foo(\d+)", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "number: 42"); } @@ -407,7 +425,7 @@ mod tests { let input = &mut IOChunk::from_str("x:123"); let caps = caps_for(r"(\w+):(\d+)", input); - let result = template.apply(&caps).unwrap(); + let result = template.apply_captures(&caps).unwrap(); assert_eq!(result, "key: x, value: 123"); } @@ -423,7 +441,7 @@ mod tests { let input = &mut IOChunk::from_str("x:123"); let caps = caps_for(r"(\w+):(\d+)", input); - let result = template.apply(&caps); + let result = template.apply_captures(&caps); assert!(result.is_err()); let msg = result.unwrap_err().to_string(); diff --git a/src/uu/sed/src/fast_regex.rs b/src/uu/sed/src/fast_regex.rs index 3375bd65..e8e52edc 100644 --- a/src/uu/sed/src/fast_regex.rs +++ b/src/uu/sed/src/fast_regex.rs @@ -114,6 +114,39 @@ impl Regex { } } } + + /// Return a non-capturing result for a single match. + pub fn find<'t>(&self, chunk: &'t IOChunk) -> UResult>> { + match self { + Regex::Byte(re) => { + let haystack = chunk.as_bytes(); + if let Some(m) = re.find(haystack) { + // Attempt UTF-8 decode for the match region only + let text = std::str::from_utf8(&haystack[m.start()..m.end()]) + .map_err(|e| USimpleError::new(2, e.to_string()))?; + Ok(Some(Match { + start: m.start(), + end: m.end(), + text, + })) + } else { + Ok(None) + } + } + Regex::Fancy(re) => { + let text = chunk.as_str()?; + match re.find(text) { + Ok(Some(m)) => Ok(Some(Match { + start: m.start(), + end: m.end(), + text: m.as_str(), + })), + Ok(None) => Ok(None), + Err(e) => Err(USimpleError::new(2, e.to_string())), + } + } + } + } } /// Unified enum for holding either byte or fancy capture iterators. diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 83a574bb..a9263cec 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -199,33 +199,49 @@ fn substitute( let regex = re_or_saved_re(&sub.regex, context)?; - // Iterate over multiple captures of the RE in the pattern. - for caps in regex.captures_iter(pattern)? { - count += 1; - let caps = caps?; - - let m = caps.get(0)?.unwrap(); - - // Always write the unmatched text before this match. - if text.is_none() { - text = Some(pattern.as_str()?); + match (sub.occurrence, sub.replacement.max_group_number) { + (1, 0) => { + let m = regex.find(pattern)?; + if let Some(m) = m { + text = Some(pattern.as_str()?); + result.push_str(&text.unwrap()[last_end..m.start()]); + + let replacement = sub.replacement.apply_match(&m); + result.push_str(&replacement); + replaced = true; + last_end = m.end(); + } } - result.push_str(&text.unwrap()[last_end..m.start()]); + (_, _) => { + // Iterate over multiple captures of the RE in the pattern. + for caps in regex.captures_iter(pattern)? { + count += 1; + let caps = caps?; - if sub.occurrence == 0 || count == sub.occurrence { - let replacement = sub.replacement.apply(&caps)?; - result.push_str(&replacement); - replaced = true; - } else { - // Not the target match — leave the match unchanged. - result.push_str(m.as_str()); - } + let m = caps.get(0)?.unwrap(); - last_end = m.end(); + // Always write the unmatched text before this match. + if text.is_none() { + text = Some(pattern.as_str()?); + } + result.push_str(&text.unwrap()[last_end..m.start()]); + + if sub.occurrence == 0 || count == sub.occurrence { + let replacement = sub.replacement.apply_captures(&caps)?; + result.push_str(&replacement); + replaced = true; + } else { + // Not the target match — leave the match unchanged. + result.push_str(m.as_str()); + } - // Early exit if only a specific occurrence (likely 1) needed replacing. - if count == sub.occurrence { - break; + last_end = m.end(); + + // Early exit if only a specific occurrence (likely 1) needed replacing. + if count == sub.occurrence { + break; + } + } } } From 28b5eb0b722e285d3a8820ac56ec40f3b9e91a0d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 25 May 2025 15:16:21 +0300 Subject: [PATCH 31/42] Report performance changes up to 1% --- util/bench-compare.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util/bench-compare.sh b/util/bench-compare.sh index 0ee61b8d..7e829c60 100755 --- a/util/bench-compare.sh +++ b/util/bench-compare.sh @@ -21,10 +21,10 @@ paste -d , "$1" "$2" | printf("%12s encountered an error\n", $1); else if (!$12) printf("%12s encountered an error\n", $2); - else if ($4 / $12 > 1.1) - printf("%12s is %5.1f times faster than %s\n", $2, $4 / $12, $1); - else if ($12 / $4 > 1.1) - printf("%12s is %5.1f times faster than %s\n", $1, $12 / $4, $2); + else if ($4 / $12 > 1.01) + printf("%12s is %5.2f times faster than %s\n", $2, $4 / $12, $1); + else if ($12 / $4 > 1.01) + printf("%12s is %5.2f times faster than %s\n", $1, $12 / $4, $2); else printf("%12s is similarly fast as %s\n", $1, $2); }' From a67be7460466feb4063a6cfe13781c73b731918c Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 25 May 2025 16:06:22 +0300 Subject: [PATCH 32/42] Specialize single replacement multiple groups Performance improves significantly in the following case: number-fix rust-sed is 1.07 times faster than rust-sed-90d60b8 Other changes <5% are likely to be noise. --- src/uu/sed/src/processor.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index a9263cec..24253a2d 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -201,6 +201,7 @@ fn substitute( match (sub.occurrence, sub.replacement.max_group_number) { (1, 0) => { + // Example: s/foo/bar/: find() is enough. let m = regex.find(pattern)?; if let Some(m) = m { text = Some(pattern.as_str()?); @@ -212,7 +213,24 @@ fn substitute( last_end = m.end(); } } + + (1, _) => { + // Example: s/\(.\)\(.\)/\2\1/: captures() is enough. + let caps = regex.captures(pattern)?; + if let Some(caps) = caps { + let m = caps.get(0)?.unwrap(); + text = Some(pattern.as_str()?); + result.push_str(&text.unwrap()[last_end..m.start()]); + + let replacement = sub.replacement.apply_captures(&caps)?; + result.push_str(&replacement); + replaced = true; + last_end = m.end(); + } + } + (_, _) => { + // Example: s/(.)(.)/\2\1/3: captures_iter() is needed. // Iterate over multiple captures of the RE in the pattern. for caps in regex.captures_iter(pattern)? { count += 1; @@ -237,7 +255,8 @@ fn substitute( last_end = m.end(); - // Early exit if only a specific occurrence (likely 1) needed replacing. + // Early exit if only a specific occurrence, + // (likely 1) needed replacing. if count == sub.occurrence { break; } From aa12a127336bb8b92d403ca511a21941419ef1b0 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 26 May 2025 20:23:08 +0300 Subject: [PATCH 33/42] Improve fancy_regex detection RE and tests --- src/uu/sed/src/fast_regex.rs | 79 +++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/src/uu/sed/src/fast_regex.rs b/src/uu/sed/src/fast_regex.rs index e8e52edc..c0239151 100644 --- a/src/uu/sed/src/fast_regex.rs +++ b/src/uu/sed/src/fast_regex.rs @@ -34,20 +34,42 @@ pub enum Regex { impl Regex { /// Construct the most efficient engine possible pub fn new(pattern: &str) -> Result> { + // REs requiring the fancy_regex capabilities rather than the + // regex::bytes engine + // Consider . as one character that requires fancy_regex, + // because it can match more than one byte when matching a + // two or more byte Unicode UTF-8 representation. + // It is an RE . rather than a literal one in the following + // example sitations. + // . First character of the line + // [^\\]. Second character after non \ + // + // \*. A consumed backslash anywhere on the line + // \\. An escaped backslash anywhere on the line + // xx. A non-escaped sequence anywhere on the line + // But the following are literal dots and can be captured by bytes: + // \. escaped at the beginning of the line + // x\. escaped after a non escaped \ anywhere on the line + // + // The following RE captures these situations. static NEEDS_FANCY_RE: Lazy = Lazy::new(|| { regex::Regex::new( r"(?x) # Turn on verbose mode ( # An ASCII-incompatible RE - ( ^ | [^\\] ) # Non-escaped, so BOL or not \ + ( ^ # Non-escaped: i.e. at BOL + | ^[^\\] # or after a BOL non \ + | [^\\] {2} # or after two non \ characters + | \\. # or after a consumed or escaped \ + ) ( # A potentially incompatible match \. # . matches any Unicode character - | \[\^ # Bracketed negative character class - | \(\?i # Case insensitive is done on Unicode + | \[\^ # Bracketed -ve character class + | \(\?i # (Unicode) case insensitive | \\[WwDdSsBbPp] # Unicode classes - | \\[0-9] # Only fancy supports back-references + | \\[0-9] # Back-references need fancy ) ) - | [^\x01-\x7f] # Or any non-ASCII character in the RE + | [^\x01-\x7f] # Any non-ASCII character ", ) .unwrap() @@ -264,16 +286,36 @@ mod tests { ); } + #[test] + fn selects_byte_regex_for_escpaped_dot_bol() { + assert_byte(r"\."); + } + + #[test] + fn selects_byte_regex_for_escpaped_dot_non_bol() { + assert_byte(r"x\."); + } + + #[test] + fn selects_byte_regex_for_escaped_class() { + assert_byte(r"\[^x]"); + } + + #[test] + fn selects_byte_regex_for_escaped_case_insensitive_flag() { + assert_byte(r"\(?i\)"); + } + + #[test] + fn selects_byte_regex_for_escaped_unicode_class() { + assert_byte(r"\\w"); + } + #[test] fn selects_byte_regex_for_simple_ascii() { assert_byte(r"foo"); assert_byte(r"foo|bar"); assert_byte(r"^foo[0-9]+bar$"); - // Escaped Unicode triggers shouldn't trigger RE. - assert_byte(r"\."); - assert_byte(r"\[^x]"); - assert_byte(r"\\(?i)"); - assert_byte(r"\\w"); } #[test] @@ -294,12 +336,27 @@ mod tests { #[test] fn selects_fancy_for_dot() { - assert_fancy(r"."); // Dot matches any Unicode char. + assert_fancy(r"."); + assert_fancy(r"x."); + assert_fancy(r"xx."); + } + + #[test] + fn selects_fancy_for_consumed_backslash() { + assert_fancy(r"\*."); + assert_fancy(r"x\*."); + } + + #[test] + fn selects_fancy_for_escaped_backslash() { + assert_fancy(r"\\."); + assert_fancy(r"x\\."); } #[test] fn selects_fancy_for_inline_flags() { assert_fancy(r"(?i)abc"); // Unicode case-insensitive + assert_fancy(r"x(?i)abc"); // Unicode case-insensitive } #[test] From ea3df65f5c88ef55af8138f77b45453667d9f223 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 26 May 2025 20:25:48 +0300 Subject: [PATCH 34/42] Fix Windows warning --- src/uu/sed/src/fast_io.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 2bc678d3..273e667a 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -14,6 +14,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +#[cfg(unix)] use memchr::memchr; #[cfg(unix)] use memmap2::Mmap; From a069605d7e5a2d17ef1690fb34a88532f151f18b Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 27 May 2025 20:30:57 +0300 Subject: [PATCH 35/42] Add RE detection and refactor unit tests - Add RE to detect specifications that require an RE engine, rather than a literal string match. - Change RE unit tests to test directly the RE rather than the constructor. - Use arrays for similar tests to avoid repetition. --- src/uu/sed/src/fast_regex.rs | 300 +++++++++++++++++++++-------------- 1 file changed, 179 insertions(+), 121 deletions(-) diff --git a/src/uu/sed/src/fast_regex.rs b/src/uu/sed/src/fast_regex.rs index c0239151..ff7760ef 100644 --- a/src/uu/sed/src/fast_regex.rs +++ b/src/uu/sed/src/fast_regex.rs @@ -15,6 +15,7 @@ use fancy_regex::{ CaptureMatches as FancyCaptureMatches, Captures as FancyCaptures, Regex as FancyRegex, }; use once_cell::sync::Lazy; +use regex::Regex as RustRegex; use regex::bytes::{ CaptureMatches as ByteCaptureMatches, Captures as ByteCaptures, Regex as ByteRegex, }; @@ -23,6 +24,76 @@ use uucore::error::{UResult, USimpleError}; use crate::fast_io::IOChunk; +/// REs requiring the fancy_regex capabilities rather than the +/// faster regex::bytes engine +// Consider . as one character that requires fancy_regex, +// because it can match more than one byte when matching a +// two or more byte Unicode UTF-8 representation. +// It is an RE . rather than a literal one in the following +// example sitations. +// . First character of the line +// [^\\]. Second character after non \ +// +// \*. A consumed backslash anywhere on the line +// \\. An escaped backslash anywhere on the line +// xx. A non-escaped sequence anywhere on the line +// But the following are literal dots and can be captured by bytes: +// \. escaped at the beginning of the line +// x\. escaped after a non escaped \ anywhere on the line +// +// The following RE captures these situations. +static NEEDS_FANCY_RE: Lazy = Lazy::new(|| { + regex::Regex::new( + r"(?x) # Turn on verbose mode + ( # An ASCII-incompatible RE + ( ^ # Non-escaped: i.e. at BOL + | ^[^\\] # or after a BOL non \ + | [^\\] {2} # or after two non \ characters + | \\. # or after a consumed or escaped \ + ) + ( # A potentially incompatible match + \. # . matches any Unicode character + | \[\^ # Bracketed -ve character class + | \(\?i # (Unicode) case insensitive + | \\[WwDdSsBbPp] # Unicode classes + | \\[0-9] # Back-references need fancy + ) + ) + | [^\x01-\x7f] # Any non-ASCII character + ", + ) + .unwrap() +}); + +/// All characters signifying that the match must be handled by an RE +/// rather than by plain string pattern matching. +// These do not include the ^$ metacharacters, which we can easily handle. +// Plain string fixed-string matching is currently faster than Regex +// matching, because Regex always constructs an automaton and needs +// to handle state transitions, whereas plain string matching can +// use tailored CPU string or vectored instructions. +static NEEDS_RE: Lazy = Lazy::new(|| { + regex::Regex::new( + r"(?x) # Turn on verbose mode + ( ^ # Non-escaped: i.e. at BOL + | ^[^\\] # or after a BOL non \ + | [^\\] {2} # or after two non \ characters + | \\. # or after a consumed or escaped \ + ) + ( # A potentially incompatible match + [.?|+(\[{*] # Any magic RE character + # Some are operators so illegal at + # BOL but they should error there, + # not use them as literals. + | \\[WwDdSsPp] # Unicode classes + | \\[AzBb] # Empty matches + | \\[0-9] # Back-references + ) + ", + ) + .unwrap() +}); + #[derive(Clone, Debug)] /// A regular expression that can be implemented through Byte or Fancy-Regex // This enables a fast bytes path where possible. @@ -34,47 +105,6 @@ pub enum Regex { impl Regex { /// Construct the most efficient engine possible pub fn new(pattern: &str) -> Result> { - // REs requiring the fancy_regex capabilities rather than the - // regex::bytes engine - // Consider . as one character that requires fancy_regex, - // because it can match more than one byte when matching a - // two or more byte Unicode UTF-8 representation. - // It is an RE . rather than a literal one in the following - // example sitations. - // . First character of the line - // [^\\]. Second character after non \ - // - // \*. A consumed backslash anywhere on the line - // \\. An escaped backslash anywhere on the line - // xx. A non-escaped sequence anywhere on the line - // But the following are literal dots and can be captured by bytes: - // \. escaped at the beginning of the line - // x\. escaped after a non escaped \ anywhere on the line - // - // The following RE captures these situations. - static NEEDS_FANCY_RE: Lazy = Lazy::new(|| { - regex::Regex::new( - r"(?x) # Turn on verbose mode - ( # An ASCII-incompatible RE - ( ^ # Non-escaped: i.e. at BOL - | ^[^\\] # or after a BOL non \ - | [^\\] {2} # or after two non \ characters - | \\. # or after a consumed or escaped \ - ) - ( # A potentially incompatible match - \. # . matches any Unicode character - | \[\^ # Bracketed -ve character class - | \(\?i # (Unicode) case insensitive - | \\[WwDdSsBbPp] # Unicode classes - | \\[0-9] # Back-references need fancy - ) - ) - | [^\x01-\x7f] # Any non-ASCII character - ", - ) - .unwrap() - }); - if NEEDS_FANCY_RE.is_match(pattern) { Ok(Self::Fancy(FancyRegex::new(pattern)?)) } else { @@ -272,102 +302,130 @@ impl<'t> Captures<'t> { mod tests { use super::*; - fn assert_byte(pattern: &str) { - let re = Regex::new(pattern).unwrap(); - assert!(re.is_byte(), "Pattern {:?} should use Byte engine", pattern); - } - - fn assert_fancy(pattern: &str) { - let re = Regex::new(pattern).unwrap(); - assert!( - !re.is_byte(), - "Pattern {:?} should use Fancy engine", - pattern - ); - } - - #[test] - fn selects_byte_regex_for_escpaped_dot_bol() { - assert_byte(r"\."); - } - + // FANCY_RE #[test] - fn selects_byte_regex_for_escpaped_dot_non_bol() { - assert_byte(r"x\."); - } - - #[test] - fn selects_byte_regex_for_escaped_class() { - assert_byte(r"\[^x]"); - } - - #[test] - fn selects_byte_regex_for_escaped_case_insensitive_flag() { - assert_byte(r"\(?i\)"); - } - - #[test] - fn selects_byte_regex_for_escaped_unicode_class() { - assert_byte(r"\\w"); - } - - #[test] - fn selects_byte_regex_for_simple_ascii() { - assert_byte(r"foo"); - assert_byte(r"foo|bar"); - assert_byte(r"^foo[0-9]+bar$"); - } - - #[test] - fn selects_fancy_for_unicode_class_bol() { - assert_fancy(r"\p{L}+"); // Unicode letter class - assert_fancy(r"\W"); // \W is Unicode-aware. - assert_fancy(r"\S+"); // \S is Unicode-aware. - assert_fancy(r"\d"); // \d includes all Unicode digits. - } - - #[test] - fn selects_fancy_for_unicode_class_non_bol() { - assert_fancy(r"x\p{L}+"); // Unicode letter class - assert_fancy(r"x\W"); // \W is Unicode-aware. - assert_fancy(r"x\S+"); // \S is Unicode-aware. - assert_fancy(r"x\d"); // \d includes all Unicode digits. - } - - #[test] - fn selects_fancy_for_dot() { - assert_fancy(r"."); - assert_fancy(r"x."); - assert_fancy(r"xx."); + fn test_needs_fancy_re_matches() { + let should_match = [ + // Unicode classes BOL + r"\p{L}+", // Unicode letter class + r"\W", // \W is Unicode-aware. + r"\S+", // \S is Unicode-aware. + r"\d", // \d includes all Unicode digits. + // Unicode classes non-BOL + r"x\p{L}+", // Unicode letter class + r"x\W", // \W is Unicode-aware. + r"x\S+", // \S is Unicode-aware. + r"x\d", // \d includes all Unicode digits. + // . + r".", + r"x.", + r"xx.", + // Consumed \ + r"\*.", + r"x\*.", + // Escaped \ + r"\\.", + r"x\\.", + // Inline flags + r"(?i)abc", // Unicode case-insensitive + r"x(?i)abc", // Unicode case-insensitive + r"(\w+):\1", // back-reference \1 + // Non-ASCII literals + "naïve", // Contains literal non-ASCII. + "café", // Contains literal non-ASCII. + ]; + + for pat in &should_match { + assert!( + NEEDS_FANCY_RE.is_match(pat), + "Expected NEEDS_FANCY_RE to match: {:?}", + pat + ); + } } #[test] - fn selects_fancy_for_consumed_backslash() { - assert_fancy(r"\*."); - assert_fancy(r"x\*."); + fn test_needs_fancy_re_does_not_match() { + let should_not_match = [ + r"\.", // Escaped . at BOL + r"x\.", // Escaped . at non BOL + r"\[^x]", // Escaped character class + r"\(?i\)", // Escaped case insesitive flag + r"\\w", // Escaped Unicode class + // Simple ASCII + r"foo", + r"foo|bar", + r"^foo[0-9]+bar$", + ]; + + for pat in &should_not_match { + assert!( + !NEEDS_FANCY_RE.is_match(pat), + "Expected NEEDS_FANCY_RE to NOT match: {:?}", + pat + ); + } } + // NEEDS_RE #[test] - fn selects_fancy_for_escaped_backslash() { - assert_fancy(r"\\."); - assert_fancy(r"x\\."); + fn test_needs_re_matches() { + let should_match = [ + r".", // Single regex wildcard + r"a+b", // Regex + + r"foo|bar", // Regex alternation + r"abc?", // Regex optional + r"a*b", // Regex star + r"[abc]", // Character class + r"(abc)", // Group + r"{1,2}", // Repetition + r"\d", // Class shorthand + r"\S", // Class shorthand + r"\1", // Backreference + r"a\Pb", // Unicode property + ]; + + for pat in &should_match { + assert!( + NEEDS_RE.is_match(pat), + "Expected NEEDS_RE to match: {:?}", + pat + ); + } } #[test] - fn selects_fancy_for_inline_flags() { - assert_fancy(r"(?i)abc"); // Unicode case-insensitive - assert_fancy(r"x(?i)abc"); // Unicode case-insensitive + fn test_needs_re_does_not_match() { + let should_not_match = [ + r"abc", + r"a\.b", // Escaped dot + r"hello world", + r"^abc$", // Anchors alone + r"file\.", // Escaped dot + r"literal123", + r"\\", // Escaped backslash + ]; + + for pat in &should_not_match { + assert!( + !NEEDS_RE.is_match(pat), + "Expected NEEDS_RE to NOT match: {:?}", + pat + ); + } } + // Regex::new #[test] - fn selects_fancy_for_backrefs() { - assert_fancy(r"(\w+):\1"); // back-reference \1 + fn assert_byte_selection() { + let re = Regex::new(r"x\.").unwrap(); + assert!(re.is_byte()); } #[test] - fn selects_fancy_for_non_ascii_literals() { - assert_fancy("naïve"); // Contains literal non-ASCII. - assert_fancy("café"); // Contains literal non-ASCII. + fn assert_fancy() { + let re = Regex::new(r"\d").unwrap(); + assert!(!re.is_byte()); } #[test] From 5af9cf66b317061e67b424be67e6c55fd7ff372f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 28 May 2025 14:35:55 +0300 Subject: [PATCH 36/42] Implement an efficient literal string matcher Regex uses an automaton even for matching literal strings. Moving through its state transitions is suboptimal. This commit introduces literal string matching with the memchr::memmem matcher, which may use SIMD and other specialized features to speed up the search as well as other algorithms for special sizes. This boosts substantially the performance of several benchmark cases. access-log-no-subst rust-86 is 2.84 times faster than rust-sed-a6 access-log-subst rust-86 is 1.87 times faster than rust-sed-a6 access-log-no-del rust-86 is 3.00 times faster than rust-sed-a6 access-log-all-del rust-86 is 2.80 times faster than rust-sed-a6 remove-cr rust-86 is 4.35 times faster than rust-sed-a6 genome-subst rust-86 is 3.65 times faster than rust-sed-a6 It makes makes the performance of Rust better than the GNU and FreeBSD implementations for most benchmark cases. no-op-short rust is 3.04 times faster than gnu access-log-no-op rust is 1.91 times faster than gnu access-log-no-subst rust is 1.59 times faster than gnu access-log-subst rust is 1.32 times faster than gnu access-log-no-del rust is 1.54 times faster than gnu access-log-all-del rust is 1.99 times faster than gnu access-log-translit rust is 12.67 times faster than gnu access-log-complex-sub gnu is 4.24 times faster than rust remove-cr rust is 1.68 times faster than gnu genome-subst rust is 1.15 times faster than gnu number-fix rust is 1.06 times faster than gnu long-script gnu is 2.19 times faster than rust hanoi rust is 1.96 times faster than gnu factorial rust is 1.25 times faster than gnu no-op-short rust is 3.50 times faster than fbsd access-log-no-op rust is 2.45 times faster than fbsd access-log-no-subst rust is 1.38 times faster than fbsd access-log-subst rust is 4.02 times faster than fbsd access-log-no-del rust is 1.35 times faster than fbsd access-log-all-del rust is 4.74 times faster than fbsd access-log-translit fbsd is 1.14 times faster than rust access-log-complex-sub rust is 2.00 times faster than fbsd remove-cr rust is 2.48 times faster than fbsd genome-subst rust is 2.21 times faster than fbsd number-fix fbsd is 1.04 times faster than rust long-script fbsd is 15.46 times faster than rust hanoi rust is 9.33 times faster than fbsd factorial rust is 115.72 times faster than fbsd --- src/uu/sed/src/fast_regex.rs | 335 +++++++++++++++++++++++++++++++++-- 1 file changed, 320 insertions(+), 15 deletions(-) diff --git a/src/uu/sed/src/fast_regex.rs b/src/uu/sed/src/fast_regex.rs index ff7760ef..b45d36c9 100644 --- a/src/uu/sed/src/fast_regex.rs +++ b/src/uu/sed/src/fast_regex.rs @@ -14,6 +14,7 @@ use fancy_regex::{ CaptureMatches as FancyCaptureMatches, Captures as FancyCaptures, Regex as FancyRegex, }; +use memchr::memmem; use once_cell::sync::Lazy; use regex::Regex as RustRegex; use regex::bytes::{ @@ -95,31 +96,166 @@ static NEEDS_RE: Lazy = Lazy::new(|| { }); #[derive(Clone, Debug)] -/// A regular expression that can be implemented through Byte or Fancy-Regex -// This enables a fast bytes path where possible. +/// Types of literal string anchored matches +enum AnchoredMatch { + Begin, // ^... + End, // ...$ + Both, // ^...$ + Free, // ... +} + +#[derive(Clone, Debug)] +/// A fast Regex-like matcher for literal strings using memchr:memmem +pub struct LiteralMatcher { + needle: Vec, // Bytes without any anchors + match_type: AnchoredMatch, // Type of anchoring specified +} + +impl LiteralMatcher { + /// Construct a new matcher based on a needle possible with anchors. + pub fn new(needle: &str) -> Self { + let needle_bytes = needle.as_bytes(); + if needle_bytes[0] == b'^' && needle_bytes[needle_bytes.len() - 1] == b'$' { + LiteralMatcher { + match_type: AnchoredMatch::Both, + needle: needle_bytes[1..needle_bytes.len() - 1].to_vec(), + } + } else if needle_bytes[0] == b'^' { + LiteralMatcher { + match_type: AnchoredMatch::Begin, + needle: needle_bytes[1..needle_bytes.len()].to_vec(), + } + } else if needle_bytes[needle_bytes.len() - 1] == b'$' { + LiteralMatcher { + match_type: AnchoredMatch::End, + needle: needle_bytes[0..needle_bytes.len() - 1].to_vec(), + } + } else { + LiteralMatcher { + match_type: AnchoredMatch::Free, + needle: needle_bytes.to_vec(), + } + } + } + + /// Returns the start index of a match, if any + fn anchored_find(&self, haystack: &[u8]) -> Option { + let nlen = self.needle.len(); + let hlen = haystack.len(); + + match self.match_type { + AnchoredMatch::Both => { + if hlen == nlen && haystack == self.needle.as_slice() { + Some(0) + } else { + None + } + } + AnchoredMatch::Begin => { + if hlen >= nlen && &haystack[..nlen] == self.needle.as_slice() { + Some(0) + } else { + None + } + } + AnchoredMatch::End => { + if hlen >= nlen && &haystack[hlen - nlen..] == self.needle.as_slice() { + Some(hlen - nlen) + } else { + None + } + } + AnchoredMatch::Free => memmem::find(haystack, &self.needle), + } + } + + /// Return true if the needle occurs in the haystack. + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.anchored_find(haystack).is_some() + } + + /// Return the position and contents of the matched needle. + pub fn find<'t>(&self, haystack: &'t [u8]) -> Option<(usize, usize, &'t str)> { + self.anchored_find(haystack).and_then(|start| { + let end = start + self.needle.len(); + std::str::from_utf8(&haystack[start..end]) + .ok() + .map(|s| (start, end, s)) + }) + } + + /// Return all positions and contents of the matched needle. + pub fn iter<'t>( + &'t self, + haystack: &'t [u8], + ) -> Box + 't> { + let needle = &self.needle; + let nlen = needle.len(); + + match self.match_type { + AnchoredMatch::Both | AnchoredMatch::Begin | AnchoredMatch::End => { + // At most one match; yield it if present + Box::new(self.find(haystack).into_iter()) + } + AnchoredMatch::Free => { + // Multiple potential matches + Box::new( + memmem::find_iter(haystack, needle).filter_map(move |start| { + let end = start + nlen; + std::str::from_utf8(&haystack[start..end]) + .ok() + .map(|s| (start, end, s)) + }), + ) + } + } + } +} + +/// Return the passed pattern without any backslash escapes. +pub fn remove_escapes(pattern: &str) -> String { + let mut chars = pattern.chars().peekable(); + let mut result = String::with_capacity(pattern.len()); + + while let Some(c) = chars.next() { + if c == '\\' { + // Look ahead and consume the next character if present + if let Some(&next) = chars.peek() { + result.push(next); + chars.next(); // consume the peeked char + } + } else { + result.push(c); + } + } + + result +} + +#[derive(Clone, Debug)] +/// A regular expression that can be implemented in diverse efficient ways pub enum Regex { - Byte(ByteRegex), - Fancy(FancyRegex), + Literal(LiteralMatcher), // Fastest: literal bytes + Byte(ByteRegex), // Slower: byte-based RE + Fancy(FancyRegex), // Slowest: RE supporting UTF-8 and back-references } impl Regex { - /// Construct the most efficient engine possible + /// Construct the most efficient RE-like matching engine possible. pub fn new(pattern: &str) -> Result> { if NEEDS_FANCY_RE.is_match(pattern) { Ok(Self::Fancy(FancyRegex::new(pattern)?)) - } else { + } else if NEEDS_RE.is_match(pattern) { Ok(Self::Byte(ByteRegex::new(pattern)?)) + } else { + Ok(Self::Literal(LiteralMatcher::new(&remove_escapes(pattern)))) } } - /// Return true if this is a byte-based regex. - pub fn is_byte(&self) -> bool { - matches!(self, Regex::Byte(_)) - } - /// Check if the regex matches the content of the IOChunk. pub fn is_match(&self, chunk: &mut IOChunk) -> UResult { match self { + Regex::Literal(m) => Ok(m.is_match(chunk.as_bytes())), Regex::Byte(re) => Ok(re.is_match(chunk.as_bytes())), Regex::Fancy(re) => { let text = chunk.as_str()?; @@ -132,7 +268,15 @@ impl Regex { /// Return an iterator over capture groups. pub fn captures_iter<'t>(&'t self, chunk: &'t IOChunk) -> UResult> { match self { + Regex::Literal(m) => { + let haystack = chunk.as_bytes(); + Ok(CaptureMatches::Literal(Box::new(m.iter(haystack).map( + |(start, end, text)| Ok(Captures::Literal(Match { start, end, text })), + )))) + } + Regex::Byte(re) => Ok(CaptureMatches::Byte(re.captures_iter(chunk.as_bytes()))), + Regex::Fancy(re) => { let text = chunk.as_str()?; Ok(CaptureMatches::Fancy(re.captures_iter(text))) @@ -143,6 +287,7 @@ impl Regex { /// Return the number of capture groups, including group 0. pub fn captures_len(&self) -> usize { match self { + Regex::Literal(_) => 1, // Only group 0 Regex::Byte(re) => re.captures_len(), Regex::Fancy(re) => re.captures_len(), } @@ -151,6 +296,16 @@ impl Regex { /// Return the elements of the first capture. pub fn captures<'t>(&self, chunk: &'t IOChunk) -> UResult>> { match self { + Regex::Literal(m) => { + let haystack = chunk.as_bytes(); + match m.find(haystack) { + Some((start, end, text)) => { + Ok(Some(Captures::Literal(Match { start, end, text }))) + } + None => Ok(None), + } + } + Regex::Byte(re) => { let bytes = chunk.as_bytes(); Ok(re.captures(bytes).map(Captures::Byte)) @@ -170,6 +325,14 @@ impl Regex { /// Return a non-capturing result for a single match. pub fn find<'t>(&self, chunk: &'t IOChunk) -> UResult>> { match self { + Regex::Literal(m) => { + let haystack = chunk.as_bytes(); + match m.find(haystack) { + Some((start, end, text)) => Ok(Some(Match { start, end, text })), + None => Ok(None), + } + } + Regex::Byte(re) => { let haystack = chunk.as_bytes(); if let Some(m) = re.find(haystack) { @@ -185,6 +348,7 @@ impl Regex { Ok(None) } } + Regex::Fancy(re) => { let text = chunk.as_str()?; match re.find(text) { @@ -203,6 +367,7 @@ impl Regex { /// Unified enum for holding either byte or fancy capture iterators. pub enum CaptureMatches<'t> { + Literal(Box>> + 't>), Byte(ByteCaptureMatches<'t, 't>), Fancy(FancyCaptureMatches<'t, 't>), } @@ -212,6 +377,7 @@ impl<'t> Iterator for CaptureMatches<'t> { fn next(&mut self) -> Option { match self { + CaptureMatches::Literal(iter) => iter.next(), CaptureMatches::Byte(iter) => iter.next().map(|caps| Ok(Captures::Byte(caps))), CaptureMatches::Fancy(iter) => match iter.next() { Some(Ok(caps)) => Some(Ok(Captures::Fancy(caps))), @@ -225,7 +391,7 @@ impl<'t> Iterator for CaptureMatches<'t> { } } -#[derive(Debug)] +#[derive(Clone, Debug)] /// Result type for RE capture get(n) pub struct Match<'t> { start: usize, // Match start @@ -250,6 +416,7 @@ impl<'t> Match<'t> { /// Provide interface compatible with Regex::Captures. pub enum Captures<'t> { + Literal(Match<'t>), // only group 0 Byte(ByteCaptures<'t>), Fancy(FancyCaptures<'t>), } @@ -260,6 +427,7 @@ impl<'t> Captures<'t> { /// Returns Err if UTF-8 conversion fails (in Byte variant). pub fn get(&self, i: usize) -> UResult>> { match self { + Captures::Literal(m) => Ok(if i == 0 { Some(m.clone()) } else { None }), Captures::Byte(caps) => match caps.get(i) { Some(m) => Ok(Some(Match { start: m.start(), @@ -283,6 +451,7 @@ impl<'t> Captures<'t> { /// Return the number of capture groups (including group 0). pub fn len(&self) -> usize { match self { + Captures::Literal(_) => 1, Captures::Byte(caps) => caps.len(), Captures::Fancy(caps) => caps.len(), } @@ -292,6 +461,7 @@ impl<'t> Captures<'t> { // Unused, but provided for completeness. pub fn is_empty(&self) -> bool { match self { + Captures::Literal(_) => false, // A literal match always has group 0 Captures::Byte(caps) => caps.len() == 0, Captures::Fancy(caps) => caps.len() == 0, } @@ -418,14 +588,20 @@ mod tests { // Regex::new #[test] fn assert_byte_selection() { - let re = Regex::new(r"x\.").unwrap(); - assert!(re.is_byte()); + let re = Regex::new(r"x*").unwrap(); + assert!(matches!(re, Regex::Byte(_))); } #[test] fn assert_fancy() { let re = Regex::new(r"\d").unwrap(); - assert!(!re.is_byte()); + assert!(matches!(re, Regex::Fancy(_))); + } + + #[test] + fn assert_literal() { + let re = Regex::new(r"x\.").unwrap(); + assert!(matches!(re, Regex::Literal(_))); } #[test] @@ -437,4 +613,133 @@ mod tests { err ); } + + // remove_escapes + #[test] + fn test_remove_escapes() { + use super::remove_escapes; + + assert_eq!(remove_escapes("abc"), "abc"); + assert_eq!(remove_escapes(r"a\.c"), "a.c"); + assert_eq!(remove_escapes(r"\\d"), r"\d"); + assert_eq!(remove_escapes(r"\.\*\+\?"), ".*+?"); + assert_eq!(remove_escapes(r"escaped\\backslash"), r"escaped\backslash"); + assert_eq!(remove_escapes(r"trailing\\"), r"trailing\"); + } + + // LiteralMatcher + #[test] + fn test_literal_matcher_basic_match() { + let matcher = LiteralMatcher::new("needle"); + assert!(matcher.is_match(b"this is a needle in a haystack")); + assert!(!matcher.is_match(b"no match here")); + } + + #[test] + fn test_literal_matcher_anchor_start_match() { + let matcher = LiteralMatcher::new("^needle"); + assert!(matcher.is_match(b"needle in a haystack")); + assert!(!matcher.is_match(b"no needle match here")); + assert!(!matcher.is_match(b"no")); + } + + #[test] + fn test_literal_matcher_anchor_end_match() { + let matcher = LiteralMatcher::new("needle$"); + assert!(matcher.is_match(b"In a haystack there's a needle")); + assert!(!matcher.is_match(b"no needle match here")); + assert!(!matcher.is_match(b"no")); + } + + #[test] + fn test_literal_matcher_anchor_begin_end_match() { + let matcher = LiteralMatcher::new("^needle$"); + assert!(matcher.is_match(b"needle")); + assert!(!matcher.is_match(b"no needle match")); + assert!(!matcher.is_match(b"needle no match")); + assert!(!matcher.is_match(b"no match needle")); + assert!(!matcher.is_match(b"nada")); + } + + #[test] + fn test_literal_matcher_utf8_match() { + let matcher = LiteralMatcher::new("✓"); // U+2713 CHECK MARK (3 bytes) + let haystack = "contains ✓ unicode".as_bytes(); + assert!(matcher.is_match(haystack)); + let found = matcher.find(haystack).unwrap(); + assert_eq!(found.2, "✓"); + } + + #[test] + fn test_literal_matcher_find_location() { + let matcher = LiteralMatcher::new("abc"); + let haystack = b"___abc___"; + let result = matcher.find(haystack); + assert!(result.is_some()); + let (start, end, text) = result.unwrap(); + assert_eq!((start, end), (3, 6)); + assert_eq!(text, "abc"); + } + + #[test] + fn test_literal_matcher_find_location_end() { + let matcher = LiteralMatcher::new("abc$"); + let haystack = b"012abc"; + let result = matcher.find(haystack); + assert!(result.is_some()); + let (start, end, text) = result.unwrap(); + assert_eq!((start, end), (3, 6)); + assert_eq!(text, "abc"); + } + + #[test] + fn test_literal_matcher_iter_multiple() { + let matcher = LiteralMatcher::new("test"); + let haystack = b"this test is a test of test matching"; + let matches: Vec<_> = matcher.iter(haystack).collect(); + assert_eq!(matches.len(), 3); + + let strings: Vec<_> = matches.iter().map(|(_, _, s)| *s).collect(); + assert_eq!(strings, ["test", "test", "test"]); + } + + #[test] + fn test_literal_matcher_iter_begin() { + let matcher = LiteralMatcher::new("^test"); + let haystack = b"test is a test of test matching"; + let matches: Vec<_> = matcher.iter(haystack).collect(); + assert_eq!(matches.len(), 1); + + let strings: Vec<_> = matches.iter().map(|(_, _, s)| *s).collect(); + assert_eq!(strings, ["test"]); + } + + #[test] + fn test_literal_matcher_iter_end() { + let matcher = LiteralMatcher::new("test$"); + let haystack = b"this test is a test of test"; + let matches: Vec<_> = matcher.iter(haystack).collect(); + assert_eq!(matches.len(), 1); + + let strings: Vec<_> = matches.iter().map(|(_, _, s)| *s).collect(); + assert_eq!(strings, ["test"]); + } + + #[test] + fn test_literal_matcher_no_match() { + let matcher = LiteralMatcher::new("missing"); + let haystack = b"nothing to see here"; + assert!(!matcher.is_match(haystack)); + assert!(matcher.find(haystack).is_none()); + assert_eq!(matcher.iter(haystack).count(), 0); + } + + #[test] + fn test_literal_matcher_anchored_no_match() { + let matcher = LiteralMatcher::new("^see$"); + let haystack = b"nothing to see here"; + assert!(!matcher.is_match(haystack)); + assert!(matcher.find(haystack).is_none()); + assert_eq!(matcher.iter(haystack).count(), 0); + } } From 428fabed555fd102b8fe81a614952ccb3647bd13 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 29 May 2025 12:29:54 +0300 Subject: [PATCH 37/42] Implement the r, w, and = commands --- src/uu/sed/src/command.rs | 7 +- src/uu/sed/src/compiler.rs | 73 ++++++++++++++------- src/uu/sed/src/fast_io.rs | 8 ++- src/uu/sed/src/processor.rs | 21 ++++-- tests/by-util/test_sed.rs | 52 +++++++++++++++ tests/fixtures/sed/output/number_continuous | 32 +++++++++ tests/fixtures/sed/output/number_separate | 25 +++++++ tests/fixtures/sed/output/read_empty | 14 ++++ tests/fixtures/sed/output/read_missing | 14 ++++ tests/fixtures/sed/output/read_ok | 23 +++++++ tests/fixtures/sed/output/write_single_file | 10 +++ tests/fixtures/sed/output/write_two_files_1 | 10 +++ tests/fixtures/sed/output/write_two_files_2 | 2 + 13 files changed, 261 insertions(+), 30 deletions(-) create mode 100644 tests/fixtures/sed/output/number_continuous create mode 100644 tests/fixtures/sed/output/number_separate create mode 100644 tests/fixtures/sed/output/read_empty create mode 100644 tests/fixtures/sed/output/read_missing create mode 100644 tests/fixtures/sed/output/read_ok create mode 100644 tests/fixtures/sed/output/write_single_file create mode 100644 tests/fixtures/sed/output/write_two_files_1 create mode 100644 tests/fixtures/sed/output/write_two_files_2 diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 28e30e0b..7e8bb33d 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -73,7 +73,7 @@ pub struct ProcessingContext { /// Elements that shall be appended at the end of each command processing cycle pub enum AppendElement { Text(String), // The specified text string - File(PathBuf), // The contents of the specified file + Path(PathBuf), // The contents of the specified file path } #[derive(Clone, Debug, Default, PartialEq)] @@ -286,7 +286,7 @@ pub struct Command { pub addr2: Option
, // End address pub non_select: bool, // True if '!' pub start_line: Option, // Start line number (or None) - pub text: Option, // Text for ':', 'a', 'c', 'i', 'r', 'w' + pub text: Option, // Text for 'a', 'c', 'i' pub data: CommandData, // Command-specific data pub next: Option>>, // Pointer to next command } @@ -314,7 +314,8 @@ pub enum CommandData { Block(Option>>), // Commands for '{' BranchTarget(Option>>), // Commands for 'b', 't' Label(Option), // Label name for 'b', 't', ':' - NamedWriter(Box), // File descriptor for 'w' + Path(PathBuf), // File path for 'r' + NamedWriter(Rc>), // File output for 'w' Substitution(Box), // Substitute command 's' Text(Cow<'static, str>), // Text for 'a', 'c', 'i' Transliteration(Box), // Transliteration command 'y' diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index d3a7c28d..8506d492 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -207,6 +207,10 @@ pub fn compile( let mut empty_line = ScriptCharProvider::new(""); let result = compile_sequence(&mut make_providers, &mut empty_line, context)?; + // Comment-out the following to show the compiled script. + #[cfg(any())] + dbg!(&result); + // Link branch commands to the target label commands. populate_label_map(result.clone(), context)?; resolve_branch_targets(result.clone(), context)?; @@ -219,10 +223,6 @@ pub fn compile( } patch_block_endings(result.clone()); - // Comment-out the following to show the compiled script. - #[cfg(any())] - dbg!(&result); - // TODO: setup append & match structures Ok(result) } @@ -462,6 +462,24 @@ fn compile_address_range( Ok(n_addr) } +/// Read the line's remaining characters as a file path and return it. +fn read_file_path(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + line.advance(); // Skip the command/w character + line.eat_spaces(); // Skip any leading whitespace + + let mut path = String::new(); + while !line.eol() { + path.push(line.current()); + line.advance(); + } + + if path.is_empty() { + compilation_error(lines, line, "missing file path") + } else { + Ok(PathBuf::from(path)) + } +} + /// Compile and return a single range address specification. fn compile_address( lines: &ScriptLineProvider, @@ -948,20 +966,8 @@ pub fn compile_subst_flags( } 'w' => { - line.advance(); - line.eat_spaces(); - - let mut path = String::new(); - while !line.eol() && line.current() != ';' { - path.push(line.current()); - line.advance(); - } - - if path.is_empty() { - return compilation_error(lines, line, "missing filename after 'w' flag"); - } - - subst.write_file = Some(NamedWriter::new(PathBuf::from(path))?); + let path = read_file_path(lines, line)?; + subst.write_file = Some(NamedWriter::new(path)?); return Ok(()); // 'w' is the last flag allowed } @@ -1116,10 +1122,15 @@ fn compile_command( // a c i compile_text_command(lines, line, &mut cmd)?; } - // TODO - CommandArgs::ReadFile => { // r + CommandArgs::ReadFile => { + // r + let path = read_file_path(lines, line)?; + cmd.data = CommandData::Path(path); } - CommandArgs::WriteFile => { // w + CommandArgs::WriteFile => { + // w + let path = read_file_path(lines, line)?; + cmd.data = CommandData::NamedWriter(NamedWriter::new(path)?); } } @@ -1940,7 +1951,7 @@ mod tests { let mut subst = Substitution::default(); let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); - assert!(err.to_string().contains("missing filename")); + assert!(err.to_string().contains("missing file path")); } #[test] @@ -1963,6 +1974,7 @@ mod tests { let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); assert!(err.to_string().contains("invalid substitute flag")); } + // compile_subst_command #[test] fn test_compile_subst_invalid_delimiter_backslash() { @@ -2498,4 +2510,21 @@ mod tests { let err = result.unwrap_err().to_string(); assert!(err.contains("extra characters after \\")); } + + // read_file_path + #[test] + fn test_read_existing_file_path() { + let (lines, mut chars) = make_providers("r /etc/motd"); + + let path = read_file_path(&lines, &mut chars).unwrap(); + assert_eq!(path.to_str().unwrap(), "/etc/motd"); + } + + #[test] + fn test_read_missing_file_path() { + let (lines, mut chars) = make_providers("w "); + + let err = read_file_path(&lines, &mut chars).unwrap_err(); + assert!(err.to_string().contains("missing file path")); + } } diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 273e667a..ec680266 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -497,7 +497,13 @@ impl OutputBuffer { pub fn copy_file(&mut self, path: &PathBuf) -> io::Result<()> { #[cfg(unix)] self.flush_mmap()?; // Flush mmap writes, if any. - let file = File::open(path)?; + + let file = match File::open(path) { + Ok(f) => f, + // Per POSIX, if the file can't be read treat it as empty. + Err(_) => return Ok(()), + }; + let mut reader = BufReader::new(file); io::copy(&mut reader, &mut self.out)?; Ok(()) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 24253a2d..2720751e 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -314,7 +314,7 @@ fn flush_appends(output: &mut OutputBuffer, context: &mut ProcessingContext) -> AppendElement::Text(text) => { output.write_str(text.clone())?; } - AppendElement::File(path) => { + AppendElement::Path(path) => { output.copy_file(path)?; } } @@ -498,7 +498,14 @@ fn process_file( break; } 'r' => { - // TODO + // Copy the file to standard output at a later point. + let path = match &command.data { + CommandData::Path(path) => path, + _ => panic!("Expected Path command data"), + }; + context + .append_elements + .push(AppendElement::Path(path.clone())); } 's' => { let subst = match &mut command.data { @@ -526,7 +533,12 @@ fn process_file( } } 'w' => { - // TODO + // Append the pattern space to the specified file. + let writer = match &mut command.data { + CommandData::NamedWriter(writer) => writer, + _ => panic!("Expected NamedWriter command data"), + }; + writer.borrow_mut().write_line(pattern.as_str()?)?; } 'x' => { // Exchange the contents of the pattern and hold spaces. @@ -546,7 +558,8 @@ fn process_file( // Branch target; do nothing. } '=' => { - // TODO + // Output current line number. + output.write_str(format!("{}\n", context.line_number))?; } // The compilation should supply only valid codes. _ => panic!("invalid command code"), diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 91b4bad6..9e33f376 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -607,6 +607,58 @@ p ] ); +//////////////////////////////////////////////////////////// +// r, w, 0 commands +check_output!(read_ok, [format!("4r {}", LINES2), LINES1.to_string()]); +check_output!(read_missing, ["5r /xyzzyxyzy42", LINES1]); +check_output!(read_empty, ["6r input/empty", LINES1]); + +#[test] +fn write_single_file() -> std::io::Result<()> { + let temp = NamedTempFile::new()?; + let cmd = format!("3,12w {}", temp.path().display()); + + new_ucmd!().args(&["-e", &cmd, LINES1]).succeeds(); + + let mut actual = String::new(); + temp.reopen()?.read_to_string(&mut actual)?; + + let expected = fs::read_to_string("tests/fixtures/sed/output/write_single_file")?; + assert_eq!(actual, expected, "Output did not match fixture"); + + Ok(()) +} + +#[test] +fn write_two_files() -> std::io::Result<()> { + let temp1 = NamedTempFile::new()?; + let temp2 = NamedTempFile::new()?; + let cmd = format!( + "3,12w {}\n1,2w {}", + temp1.path().display(), + temp2.path().display() + ); + + new_ucmd!().args(&["-e", &cmd, LINES1]).succeeds(); + + let mut actual = String::new(); + + temp1.reopen()?.read_to_string(&mut actual)?; + let expected = fs::read_to_string("tests/fixtures/sed/output/write_two_files_1")?; + assert_eq!(actual, expected, "Output 1 did not match fixture"); + + actual.clear(); + + temp2.reopen()?.read_to_string(&mut actual)?; + let expected = fs::read_to_string("tests/fixtures/sed/output/write_two_files_2")?; + assert_eq!(actual, expected, "Output 2 did not match fixture"); + + Ok(()) +} + +check_output!(number_continuous, ["/l2_/=", LINES1, LINES2]); +check_output!(number_separate, ["-s", "/l._8/=", LINES1, LINES2]); + //////////////////////////////////////////////////////////// // Large complex scripts diff --git a/tests/fixtures/sed/output/number_continuous b/tests/fixtures/sed/output/number_continuous new file mode 100644 index 00000000..cb8d266f --- /dev/null +++ b/tests/fixtures/sed/output/number_continuous @@ -0,0 +1,32 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +15 +l2_1 +16 +l2_2 +17 +l2_3 +18 +l2_4 +19 +l2_5 +20 +l2_6 +21 +l2_7 +22 +l2_8 +23 +l2_9 diff --git a/tests/fixtures/sed/output/number_separate b/tests/fixtures/sed/output/number_separate new file mode 100644 index 00000000..df6ffdda --- /dev/null +++ b/tests/fixtures/sed/output/number_separate @@ -0,0 +1,25 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +8 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +8 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/read_empty b/tests/fixtures/sed/output/read_empty new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/output/read_empty @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/read_missing b/tests/fixtures/sed/output/read_missing new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/output/read_missing @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/read_ok b/tests/fixtures/sed/output/read_ok new file mode 100644 index 00000000..19a94616 --- /dev/null +++ b/tests/fixtures/sed/output/read_ok @@ -0,0 +1,23 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/write_single_file b/tests/fixtures/sed/output/write_single_file new file mode 100644 index 00000000..d90ffa1e --- /dev/null +++ b/tests/fixtures/sed/output/write_single_file @@ -0,0 +1,10 @@ +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 diff --git a/tests/fixtures/sed/output/write_two_files_1 b/tests/fixtures/sed/output/write_two_files_1 new file mode 100644 index 00000000..d90ffa1e --- /dev/null +++ b/tests/fixtures/sed/output/write_two_files_1 @@ -0,0 +1,10 @@ +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 diff --git a/tests/fixtures/sed/output/write_two_files_2 b/tests/fixtures/sed/output/write_two_files_2 new file mode 100644 index 00000000..4c8acc52 --- /dev/null +++ b/tests/fixtures/sed/output/write_two_files_2 @@ -0,0 +1,2 @@ +l1_1 +l1_2 From c73f88a92823ed9a33a962309869e1ca0fb40c3c Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 29 May 2025 17:50:25 +0300 Subject: [PATCH 38/42] Remove unneeded text field --- src/uu/sed/src/command.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 7e8bb33d..ca9f5930 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -286,7 +286,6 @@ pub struct Command { pub addr2: Option
, // End address pub non_select: bool, // True if '!' pub start_line: Option, // Start line number (or None) - pub text: Option, // Text for 'a', 'c', 'i' pub data: CommandData, // Command-specific data pub next: Option>>, // Pointer to next command } @@ -299,7 +298,6 @@ impl Default for Command { addr2: None, non_select: false, start_line: None, - text: None, data: CommandData::None, next: None, } From 18cf3fc52cd7749ed3cf342ac1f5ba71e53dd4a2 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 30 May 2025 10:39:34 +0300 Subject: [PATCH 39/42] Add l and Q commands, extended with GNU number --- Cargo.lock | 2 + Cargo.toml | 2 + README.md | 5 + src/uu/sed/Cargo.toml | 1 + src/uu/sed/src/command.rs | 1 + src/uu/sed/src/compiler.rs | 96 ++++++++++++++++-- src/uu/sed/src/fast_io.rs | 29 +++++- src/uu/sed/src/processor.rs | 131 +++++++++++++++++-------- src/uu/sed/src/sed.rs | 6 +- tests/by-util/test_sed.rs | 26 ++++- tests/fixtures/sed/input/ascii | Bin 0 -> 130 bytes tests/fixtures/sed/input/unicode | 1 + tests/fixtures/sed/output/list_ascii | 6 ++ tests/fixtures/sed/output/list_empty | 0 tests/fixtures/sed/output/list_unicode | 4 + 15 files changed, 256 insertions(+), 54 deletions(-) create mode 100644 tests/fixtures/sed/input/ascii create mode 100644 tests/fixtures/sed/input/unicode create mode 100644 tests/fixtures/sed/output/list_ascii create mode 100644 tests/fixtures/sed/output/list_empty create mode 100644 tests/fixtures/sed/output/list_unicode diff --git a/Cargo.lock b/Cargo.lock index 10be09f1..0ef70220 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -714,6 +714,7 @@ dependencies = [ "rlimit", "sysinfo", "tempfile", + "terminal_size", "textwrap", "uu_sed", "uucore", @@ -903,6 +904,7 @@ dependencies = [ "once_cell", "regex", "tempfile", + "terminal_size", "uucore", ] diff --git a/Cargo.toml b/Cargo.toml index ca184677..cdf511aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ regex = "1.10.4" sysinfo = "0.35" tempfile = "3.10.1" textwrap = { version = "0.16.1", features = ["terminal_size"] } +terminal_size = "0.4.2" uucore = "0.0.30" xattr = "1.3.1" @@ -64,6 +65,7 @@ once_cell = { workspace = true } phf = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } sysinfo = { workspace = true } +terminal_size = { workspace = true } textwrap = { workspace = true } uucore = { workspace = true } uutests = "0.0.30" diff --git a/README.md b/README.md index 495adcf7..18a10747 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,9 @@ cargo run --release in the strings of the `y` command. Under POSIX these yield undefined behavior. * The substitution command replacement group `\0` is a synonym for &. +* A `Q` command (optionally followed by an exit code) quits immediately. +* The `q` command can be optionally followed by an exit code. +* The `l` command can be optionally followed by the output width. ### Supported BSD and GNU extensions * The second address in a range can be specified as a relative address with +N. @@ -50,6 +53,8 @@ cargo run --release ### New extensions * Unicode characters can be specified in regular expression pattern, replacement and transliteration sequences using `\uXXXX` or `\UXXXXXXXX` sequences. +* The `l` command lists Unicode characters using the `\uXXXX` and `\UXXXXXXXX` + sequences. ### Incompatibilities * The input is assumed to be valid UTF-8 (this includes 7-bit ASCII). diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index acf27186..0aa9f2b9 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -20,6 +20,7 @@ regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } once_cell = { workspace = true } +terminal_size = { workspace = true } uucore = { workspace = true } [lib] diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index ca9f5930..6acfb424 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -314,6 +314,7 @@ pub enum CommandData { Label(Option), // Label name for 'b', 't', ':' Path(PathBuf), // File path for 'r' NamedWriter(Rc>), // File output for 'w' + Number(usize), // Number for 'l', 'q', 'Q' (GNU) Substitution(Box), // Substitute command 's' Text(Cow<'static, str>), // Text for 'a', 'c', 'i' Transliteration(Box), // Transliteration command 'y' diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 8506d492..a7ebd8e8 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -19,6 +19,7 @@ use crate::fast_regex::Regex; use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; + use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; @@ -26,20 +27,24 @@ use std::mem; use std::path::PathBuf; use std::rc::Rc; use std::sync::LazyLock; +use terminal_size::{Width, terminal_size}; use uucore::error::{UResult, USimpleError}; +const DEFAULT_OUTPUT_WIDTH: usize = 60; + // A global, immutable map of command properties, initialized on first access static CMD_MAP: LazyLock> = LazyLock::new(build_command_map); // Types of command arguments recognized by the parser #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum CommandArgs { - Empty, // d D g G h H l n N p P q x = \0 + Empty, // d D g G h H n N p P x = \0 Text, // a c i NonSelect, // ! BeginGroup, // { EndGroup, // } Label, // b t : + Number, // l q Q (GNU extension) ReadFile, // r WriteFile, // w Substitute, // s @@ -120,7 +125,7 @@ fn build_command_map() -> HashMap { CommandSpec { code: 'l', n_addr: 2, - args: CommandArgs::Empty, + args: CommandArgs::Number, }, CommandSpec { code: 'n', @@ -145,7 +150,12 @@ fn build_command_map() -> HashMap { CommandSpec { code: 'q', n_addr: 1, - args: CommandArgs::Empty, + args: CommandArgs::Number, + }, + CommandSpec { + code: 'Q', // GNU extension + n_addr: 1, + args: CommandArgs::Number, }, CommandSpec { code: 'r', @@ -523,14 +533,14 @@ fn compile_address( } '+' => { line.advance(); - let number = parse_number(lines, line)?; + let number = parse_number(lines, line, true)?.unwrap(); Ok(Address { atype: AddressType::RelLine, value: AddressValue::LineNumber(number), }) } c if c.is_ascii_digit() => { - let number = parse_number(lines, line)?; + let number = parse_number(lines, line, true)?.unwrap(); Ok(Address { atype: AddressType::Line, value: AddressValue::LineNumber(number), @@ -542,7 +552,12 @@ fn compile_address( /// Parse and return the decimal number at the current line position. /// Advance the line to first non-digit or EOL. -fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { +/// Issue an error if the number is required. +fn parse_number( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + required: bool, +) -> UResult> { let mut num_str = String::new(); while !line.eol() && line.current().is_ascii_digit() { @@ -550,10 +565,19 @@ fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UR line.advance(); } + if num_str.is_empty() { + if required { + return compilation_error(lines, line, "number expected"); + } else { + return Ok(None); + } + } + num_str .parse::() .map_err(|_| format!("invalid number '{}'", num_str)) .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) + .map(Some) } /// Parse the end of a command, failing with an error on extra characters. @@ -1032,6 +1056,42 @@ fn compile_label_command( parse_command_ending(lines, line, cmd) } +/// Return the width of the command's terminal or a default. +fn output_width() -> usize { + if let Some((Width(w), _)) = terminal_size() { + w as usize + } else { + DEFAULT_OUTPUT_WIDTH + } +} + +fn compile_number_command( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult<()> { + line.advance(); // Skip the command character + line.eat_spaces(); // Skip any leading whitespace + + match parse_number(lines, line, false)? { + Some(n) => { + cmd.data = CommandData::Number(n); + } + None => match cmd.code { + 'q' | 'Q' => { + cmd.data = CommandData::Number(0); + } + 'l' => { + cmd.data = CommandData::Number(output_width()); + } + _ => panic!("invalid number-expecting command"), + }, + } + + line.eat_spaces(); // Skip any trailing whitespace + parse_command_ending(lines, line, cmd) +} + fn compile_text_command( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, @@ -1110,6 +1170,10 @@ fn compile_command( CommandArgs::NonSelect => { // ! // Implemented at a higher level. } + CommandArgs::Number => { + // l q Q + compile_number_command(lines, line, &mut cmd)?; + } CommandArgs::Substitute => { // s return compile_subst_command(lines, line, &mut cmd, context); @@ -1375,14 +1439,28 @@ mod tests { #[test] fn test_parse_number_basic() { let (lines, mut chars) = make_providers("123abc"); - assert_eq!(parse_number(&lines, &mut chars).unwrap(), 123); + assert_eq!(parse_number(&lines, &mut chars, true).unwrap(), Some(123)); assert_eq!(chars.current(), 'a'); // Should stop at first non-digit } + #[test] + fn test_parse_optional_number_missing() { + let (lines, mut chars) = make_providers(" ;"); + assert_eq!(parse_number(&lines, &mut chars, false).unwrap(), None); + } + #[test] fn test_parse_number_invalid() { - let (lines, mut chars) = make_providers("abc"); - assert!(parse_number(&lines, &mut chars).is_err()); + let (lines, mut chars) = make_providers("537654897563495734653453434534534534545"); + let err = parse_number(&lines, &mut chars, true).unwrap_err(); + assert!(err.to_string().contains("invalid number")); + } + + #[test] + fn test_parse_required_number_missing() { + let (lines, mut chars) = make_providers(""); + let err = parse_number(&lines, &mut chars, true).unwrap_err(); + assert!(err.to_string().contains("number expected")); } // compile_re diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index ec680266..1fa50f3f 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -172,6 +172,11 @@ impl<'a> IOChunk<'a> { } } + /// Return true if the content is empty. + pub fn is_empty(&self) -> bool { + self.content.len() == 0 + } + /// Return true if the content ends with a newline. pub fn is_newline_terminated(&self) -> bool { match &self.content { @@ -326,6 +331,16 @@ impl IOChunkContent<'_> { IOChunkContent::Owned { content, .. } => content, } } + + /// Return the content's length (in bytes or characters). + pub fn len(&self) -> usize { + match self { + #[cfg(unix)] + IOChunkContent::MmapInput { content, .. } => content.len(), + + IOChunkContent::Owned { content, .. } => content.len(), + } + } } /// Unified reader that uses mmap when possible, falls back to buffered reading. @@ -962,6 +977,7 @@ mod tests { )) = reader.get_line()? { assert_eq!(content, "first line"); + assert_eq!(content.len(), 10); assert!(has_newline); assert!(!utf8_verified.get()); assert!(!last_line); @@ -1026,6 +1042,7 @@ mod tests { )) = reader.get_line()? { assert_eq!(content, b"first line"); + assert_eq!(content.len(), 10); assert_eq!(full_span, b"first line\n"); assert!(!utf8_verified.get()); assert!(!last_line); @@ -1069,11 +1086,19 @@ mod tests { Ok(()) } - // is_newline_terminated + // is_newline_terminated, is_empty #[test] - fn test_owned_newline_terminated() { + fn test_owned_newline_terminated_non_empty() { let chunk = IOChunk::from_content(IOChunkContent::new_owned("line".to_string(), true)); assert!(chunk.is_newline_terminated()); + assert!(!chunk.is_empty()); + } + + #[test] + fn test_owned_newline_terminated_empty() { + let chunk = IOChunk::from_content(IOChunkContent::new_owned("".to_string(), true)); + assert!(chunk.is_newline_terminated()); + assert!(chunk.is_empty()); } #[test] diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 2720751e..5112aa3e 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -17,11 +17,12 @@ use crate::fast_regex::Regex; use crate::in_place::InPlace; use crate::named_writer; +use std::borrow::Cow; use std::cell::RefCell; use std::io::{self, IsTerminal}; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::{UResult, USimpleError}; +use uucore::error::{UResult, USimpleError, set_exit_code}; /// Return true if the passed address matches the current I/O context. fn match_address( @@ -323,6 +324,74 @@ fn flush_appends(output: &mut OutputBuffer, context: &mut ProcessingContext) -> Ok(()) } +/// Return the specified command variant or panic. +// Example: let path = extract_variant!(command, Path); +macro_rules! extract_variant { + ($cmd:expr, $variant:ident) => { + match &$cmd.data { + CommandData::$variant(inner) => inner, + _ => panic!(concat!("Expected ", stringify!($variant), " command data")), + } + }; +} + +/// List the passed line in unambguous form. +fn list(output: &mut OutputBuffer, line: &IOChunk, max_width: usize) -> UResult<()> { + // Special case for an empty pattern space + if line.is_empty() { + if line.is_newline_terminated() { + output.write_str("$\n")?; + } + return Ok(()); + } + + let line = line.as_str()?; + let mut buff = String::new(); + let mut line_width = 0; + + for ch in line.chars() { + if ch == '\n' { + buff.push_str("$\n"); + output.write_str(&buff)?; + line_width = 0; + continue; + } + + let mut char_buff = [0u8; 1]; + let out_str: Cow = match ch { + '\x07' => Cow::Borrowed(r"\a"), + '\x08' => Cow::Borrowed(r"\b"), + '\x0b' => Cow::Borrowed(r"\v"), + '\x0c' => Cow::Borrowed(r"\f"), + '\\' => Cow::Borrowed(r"\\"), + '\r' => Cow::Borrowed(r"\r"), + '\t' => Cow::Borrowed(r"\t"), + c if c.is_ascii_control() => Cow::Owned(format!("\\{:03o}", ch as u8)), + c if c == ' ' || c.is_ascii_graphic() => Cow::Borrowed(ch.encode_utf8(&mut char_buff)), + c if (c as u32) <= 0xFFFF => Cow::Owned(format!("\\u{:04X}", c as u32)), + _ => Cow::Owned(format!("\\U{:08X}", ch as u32)), + }; + + // See if folding is required before adding out_str and terminator. + let out_len = out_str.len(); + if line_width + out_len + 1 > max_width { + buff.push_str("\\\n"); + output.write_str(&buff)?; + line_width = 0; + buff.clear(); + } + buff.push_str(out_str.as_ref()); + line_width += out_len; + } + + if !buff.is_empty() { + buff.push_str("$\n"); + output.write_str(buff)?; + } + Ok(()) +} + +#[allow(clippy::cognitive_complexity)] /// Process a single input file fn process_file( commands: &Option>>, @@ -364,9 +433,7 @@ fn process_file( match command.code { '{' => { // Block begin; start processing the enclosed ones. - let CommandData::Block(body) = &command.data else { - panic!("Expected Block command data"); - }; + let body = extract_variant!(command, Block); current = body.clone(); continue; } @@ -375,19 +442,14 @@ fn process_file( } 'a' => { // Write the text to standard output at a later point. - let text = match &command.data { - CommandData::Text(text) => text, - _ => panic!("Expected Text command data"), - }; + let text = extract_variant!(command, Text); context .append_elements .push(AppendElement::Text(text.clone().into_owned())); } 'b' => { // Branch to the specified label or end if none is given. - let CommandData::BranchTarget(target) = &command.data else { - panic!("Expected BranchTarget command data"); - }; + let target = extract_variant!(command, BranchTarget); if target.is_some() { // New command to execute current = target.clone(); @@ -402,10 +464,7 @@ fn process_file( // start the next cycle. pattern.clear(); if command.addr2.is_none() || context.last_address || context.last_line { - let text = match &command.data { - CommandData::Text(text) => text, - _ => panic!("Expected Text command data"), - }; + let text = extract_variant!(command, Text); output.write_str(text.as_ref())?; } break; @@ -452,14 +511,12 @@ fn process_file( } 'i' => { // Write text to standard output. - let text = match &command.data { - CommandData::Text(text) => text, - _ => panic!("Expected Text command data"), - }; + let text = extract_variant!(command, Text); output.write_str(text.as_ref())?; } 'l' => { - // TODO + let width = *extract_variant!(command, Number); + list(output, &pattern, width)?; } 'n' => { break; @@ -494,15 +551,21 @@ fn process_file( } } 'q' => { + // Quit after printing the pattern space. + set_exit_code(*extract_variant!(command, Number) as i32); + context.stop_processing = true; + break; + } + 'Q' => { + // Quit immediatelly. + set_exit_code(*extract_variant!(command, Number) as i32); context.stop_processing = true; + context.quiet = true; break; } 'r' => { // Copy the file to standard output at a later point. - let path = match &command.data { - CommandData::Path(path) => path, - _ => panic!("Expected Path command data"), - }; + let path = extract_variant!(command, Path); context .append_elements .push(AppendElement::Path(path.clone())); @@ -512,16 +575,13 @@ fn process_file( CommandData::Substitution(subst) => subst, _ => panic!("Expected Substitution command data"), }; - substitute(&mut pattern, &mut *subst, context, output)?; } 't' if !context.substitution_made => { /* Do nothing. */ } 't' => { // Branch to the specified label or end if none is given // if a substitution was made since last cycle or t. - let CommandData::BranchTarget(target) = &command.data else { - panic!("Expected BranchTarget command data"); - }; + let target = extract_variant!(command, BranchTarget); context.substitution_made = false; if target.is_some() { // New command to execute @@ -534,10 +594,7 @@ fn process_file( } 'w' => { // Append the pattern space to the specified file. - let writer = match &mut command.data { - CommandData::NamedWriter(writer) => writer, - _ => panic!("Expected NamedWriter command data"), - }; + let writer = extract_variant!(command, NamedWriter); writer.borrow_mut().write_line(pattern.as_str()?)?; } 'x' => { @@ -547,11 +604,7 @@ fn process_file( std::mem::swap(pat_has_newline, &mut context.hold.has_newline); } 'y' => { - let trans = match &mut command.data { - CommandData::Transliteration(trans) => trans, - _ => panic!("Expected Transliteration command data"), - }; - + let trans = extract_variant!(command, Transliteration); transliterate(&mut pattern, trans)?; } ':' => { @@ -598,7 +651,7 @@ fn process_file( pub fn process_all_files( commands: Option>>, files: Vec, - mut context: ProcessingContext, + context: &mut ProcessingContext, ) -> UResult<()> { context.unbuffered = context.unbuffered || io::stdout().is_terminal(); @@ -618,7 +671,7 @@ pub fn process_all_files( if context.separate { context.line_number = 0; } - process_file(&commands, &mut reader, output, &mut context)?; + process_file(&commands, &mut reader, output, context)?; // Handle any N command remains. if context.last_file && !context.separate && !context.quiet { diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index d61bc96c..fa34a512 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -35,10 +35,10 @@ const USAGE: &str = "sed [OPTION]... [script] [file]..."; pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; let (scripts, files) = get_scripts_files(&matches)?; - let mut processing_context = build_context(&matches); + let mut context = build_context(&matches); - let executable = compile(scripts, &mut processing_context)?; - process_all_files(executable, files, processing_context)?; + let executable = compile(scripts, &mut context)?; + process_all_files(executable, files, &mut context)?; Ok(()) } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 9e33f376..8030b8bb 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -281,6 +281,24 @@ check_output!(pattern_quit_2, [r"5q", LINES1, LINES2]); check_output!(pattern_re_reuse, ["-n", r"/_1/p;//p", LINES1]); check_output!(pattern_subst_re_reuse, ["-n", r"/_1/p;s//-N/p", LINES1]); +#[test] +fn test_quit_exit_code() { + new_ucmd!() + .args(&["5q 42", LINES1]) + .fails() + .code_is(42) + .stdout_is_fixture("output/pattern_quit"); +} + +#[test] +fn test_quit_now_exit_code() { + new_ucmd!() + .args(&["6Q 12", LINES1]) + .fails() + .code_is(12) + .stdout_is_fixture("output/pattern_quit"); +} + //////////////////////////////////////////////////////////// // Command blocks: {} check_output!( @@ -608,7 +626,7 @@ p ); //////////////////////////////////////////////////////////// -// r, w, 0 commands +// r, w commands check_output!(read_ok, [format!("4r {}", LINES2), LINES1.to_string()]); check_output!(read_missing, ["5r /xyzzyxyzy42", LINES1]); check_output!(read_empty, ["6r input/empty", LINES1]); @@ -656,9 +674,15 @@ fn write_two_files() -> std::io::Result<()> { Ok(()) } +//////////////////////////////////////////////////////////// +// =, l commands check_output!(number_continuous, ["/l2_/=", LINES1, LINES2]); check_output!(number_separate, ["-s", "/l._8/=", LINES1, LINES2]); +check_output!(list_ascii, ["-n", "l 60", "input/ascii"]); +check_output!(list_empty, ["-n", "l 60", "input/empty"]); +check_output!(list_unicode, ["l 60", "input/unicode"]); + //////////////////////////////////////////////////////////// // Large complex scripts diff --git a/tests/fixtures/sed/input/ascii b/tests/fixtures/sed/input/ascii new file mode 100644 index 0000000000000000000000000000000000000000..68219ef6565ce4414682943d414c58dfbd35352f GIT binary patch literal 130 zcmZQzWMXDvWn<^yMC+6cQE@6%&_`l#-T_m6KOcR8m$^Ra4i{)Y8_`)zddH zG%_|ZH8Z!cw6eCbwX=6{baHlab#wRd^z!!c_45x13?@ABCDEF\ +GHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\ +\177$ +$ diff --git a/tests/fixtures/sed/output/list_empty b/tests/fixtures/sed/output/list_empty new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/list_unicode b/tests/fixtures/sed/output/list_unicode new file mode 100644 index 00000000..f15bfa2c --- /dev/null +++ b/tests/fixtures/sed/output/list_unicode @@ -0,0 +1,4 @@ +Hello World or \u039A\u03B1\u03BB\u03B7\u03BC\u03AD\u03C1\ +\u03B1 \u03BA\u03CC\u03C3\u03BC\u03B5 or \u3053\u3093\u306B\ +\u3061\u306F \u4E16\u754C \U0001F600$ +Hello World or Καλημέρα κόσμε or こんにちは 世界 😀 From 4b547f43f6617d9eed351aa0702baa0c153ebad6 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 30 May 2025 12:23:29 +0300 Subject: [PATCH 40/42] Improve error reporting By using uucore's .map_err_context. --- src/uu/sed/src/fast_io.rs | 2 +- src/uu/sed/src/named_writer.rs | 14 +++++--------- src/uu/sed/src/processor.rs | 13 +++++-------- src/uu/sed/src/script_line_provider.rs | 11 ++++------- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 1fa50f3f..6b8a821e 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -232,7 +232,7 @@ impl<'a> IOChunk<'a> { } else { let result = str::from_utf8(content); self.utf8_verified.set(true); - result.map_err(|e| USimpleError::new(1, e.to_string())) + result.map_err(|e| USimpleError::new(2, e.to_string())) } } IOChunkContent::Owned { content, .. } => Ok(content), diff --git a/src/uu/sed/src/named_writer.rs b/src/uu/sed/src/named_writer.rs index 26482acf..eb60db2b 100644 --- a/src/uu/sed/src/named_writer.rs +++ b/src/uu/sed/src/named_writer.rs @@ -14,7 +14,8 @@ use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::{UResult, USimpleError}; +use uucore::display::Quotable; +use uucore::error::{FromIo, UResult}; thread_local! { /// Global list of all writers that should be flushed at shutdown @@ -36,12 +37,7 @@ impl NamedWriter { .write(true) .truncate(true) .open(&path) - .map_err(|e| { - USimpleError::new( - 2, - format!("Error opening output file {}: {}", path.display(), e), - ) - })?; + .map_err_context(|| format!("error opening output file {}", path.quote()))?; let writer = Rc::new(RefCell::new(NamedWriter { path, @@ -55,14 +51,14 @@ impl NamedWriter { /// Write a line to the file with a newline, returning descriptive errors. pub fn write_line(&mut self, line: &str) -> UResult<()> { writeln!(self.writer, "{}", line) - .map_err(|e| USimpleError::new(2, format!("{}: {}", self.path.display(), e))) + .map_err_context(|| format!("error writing to file {}", self.path.quote())) } /// Flush the writer, returning a descriptive error. pub fn flush(&mut self) -> UResult<()> { self.writer .flush() - .map_err(|e| USimpleError::new(2, format!("{}: {}", self.path.display(), e))) + .map_err_context(|| format!("error writing to file {}", self.path.quote())) } } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 5112aa3e..ad8b20bf 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -22,7 +22,8 @@ use std::cell::RefCell; use std::io::{self, IsTerminal}; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::{UResult, USimpleError, set_exit_code}; +use uucore::display::Quotable; +use uucore::error::{FromIo, UResult, USimpleError, set_exit_code}; /// Return true if the passed address matches the current I/O context. fn match_address( @@ -335,7 +336,7 @@ macro_rules! extract_variant { }; } -/// List the passed line in unambguous form. +/// List the passed pattern space in unambiguous form. fn list(output: &mut OutputBuffer, line: &IOChunk, max_width: usize) -> UResult<()> { // Special case for an empty pattern space if line.is_empty() { @@ -660,12 +661,8 @@ pub fn process_all_files( for (index, path) in files.iter().enumerate() { context.last_file = index == last_file_index; - let mut reader = LineReader::open(path).map_err(|e| { - USimpleError::new( - 2, - format!("Error opening input file {}: {}", path.display(), e), - ) - })?; + let mut reader = LineReader::open(path) + .map_err_context(|| format!("error opening input file {}", path.quote()))?; let output = in_place.begin(path)?; if context.separate { diff --git a/src/uu/sed/src/script_line_provider.rs b/src/uu/sed/src/script_line_provider.rs index f287e982..d75eb9b9 100644 --- a/src/uu/sed/src/script_line_provider.rs +++ b/src/uu/sed/src/script_line_provider.rs @@ -12,7 +12,8 @@ use crate::command::ScriptValue; use std::fmt; use std::fs::File; use std::io::{self, BufRead, BufReader}; -use uucore::error::{UResult, USimpleError}; +use uucore::display::Quotable; +use uucore::error::{FromIo, UResult}; #[derive(Debug)] /// The provider of script lines across all specified scripts @@ -131,12 +132,8 @@ impl ScriptLineProvider { line_number: 0, }; } else { - let file = File::open(p).map_err(|e| { - USimpleError::new( - 2, - format!("Error opening script file {}: {}", p.display(), e), - ) - })?; + let file = File::open(p) + .map_err_context(|| format!("error opening script file {}", p.quote()))?; self.state = State::Active { index: next_index, reader: Box::new(BufReader::new(file)), From be0a6a1ee362777c456670b1fe5a40302a2a76a9 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 30 May 2025 12:28:05 +0300 Subject: [PATCH 41/42] Remove and fix dead code --- src/uu/sed/src/command.rs | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 6acfb424..acaca06b 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -8,9 +8,6 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// TODO: remove when compile is implemented -#![allow(dead_code)] - use crate::fast_regex::{Captures, Match, Regex}; use crate::named_writer::NamedWriter; @@ -320,19 +317,6 @@ pub enum CommandData { Transliteration(Box), // Transliteration command 'y' } -#[derive(Debug)] -/// Text to append before a line is read -pub struct AppendBuffer { - append_type: AppendType, - content: String, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum AppendType { - String, - File, -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] /// Flag for space modifications pub enum SpaceFlag { @@ -528,6 +512,7 @@ mod tests { } // from_strings + #[test] fn test_basic_transliteration() { let t = Transliteration::from_strings("abcδ", "1234"); From a2e19695ff4313898307697659b6b02bf1dd9931 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 30 May 2025 18:21:30 +0300 Subject: [PATCH 42/42] Add in-place editing support --- Cargo.lock | 185 +++++++++++++++++++++++ Cargo.toml | 4 + README.md | 2 + src/uu/sed/Cargo.toml | 2 + src/uu/sed/src/fast_io.rs | 14 ++ src/uu/sed/src/in_place.rs | 289 ++++++++++++++++++++++++++++++++++-- src/uu/sed/src/processor.rs | 2 +- tests/by-util/test_sed.rs | 205 ++++++++++++++++++++++++- 8 files changed, 686 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0ef70220..7e85944d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,21 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "assert_fs" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652f6cb1f516886fcfee5e7a5c078b9ade62cfcb889524efe5a64d682dd27a9" +dependencies = [ + "anstyle", + "doc-comment", + "globwalk", + "predicates", + "predicates-core", + "predicates-tree", + "tempfile", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -103,6 +118,16 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.17.0" @@ -222,6 +247,31 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "ctor" version = "0.4.2" @@ -253,6 +303,12 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "dns-lookup" version = "2.0.4" @@ -265,6 +321,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "dtor" version = "0.0.6" @@ -307,6 +369,15 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -325,6 +396,30 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" +[[package]] +name = "globset" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags", + "ignore", + "walkdir", +] + [[package]] name = "iana-time-zone" version = "0.1.63" @@ -349,6 +444,22 @@ dependencies = [ "cc", ] +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -416,6 +527,12 @@ dependencies = [ "libc", ] +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + [[package]] name = "ntapi" version = "0.4.1" @@ -551,6 +668,36 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "predicates" +version = "3.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d19ee57562043d37e82899fade9a22ebab7be9cef5026b07fda9cdd4293573" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa" + +[[package]] +name = "predicates-tree" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" +dependencies = [ + "predicates-core", + "termtree", +] + [[package]] name = "pretty_assertions" version = "1.4.1" @@ -692,10 +839,20 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "sed" version = "0.0.1" dependencies = [ + "assert_fs", "chrono", "clap", "clap_complete", @@ -708,6 +865,7 @@ dependencies = [ "once_cell", "phf", "phf_codegen", + "predicates", "pretty_assertions", "rand 0.9.1", "regex", @@ -824,6 +982,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + [[package]] name = "textwrap" version = "0.16.2" @@ -897,11 +1061,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" name = "uu_sed" version = "0.0.1" dependencies = [ + "assert_fs", "clap", "fancy-regex", "memchr", "memmap2", "once_cell", + "predicates", "regex", "tempfile", "terminal_size", @@ -966,6 +1132,16 @@ dependencies = [ "xattr", ] +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -1058,6 +1234,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index cdf511aa..87d983dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ feat_common_core = [ ] [workspace.dependencies] +assert_fs = "1.1.3" bytesize = "2.0.0" chrono = { version = "0.4.37", default-features = false, features = [ "clock", @@ -43,6 +44,7 @@ memmap2 = "0.9" once_cell = "1.21.3" phf = "0.11.2" phf_codegen = "0.11.2" +predicates = "3.1.3" rand = { version = "0.9", features = ["small_rng"] } regex = "1.10.4" sysinfo = "0.35" @@ -54,6 +56,7 @@ xattr = "1.3.1" [dependencies] +assert_fs = { workspace = true } clap = { workspace = true } clap_complete = { workspace = true } clap_mangen = { workspace = true } @@ -63,6 +66,7 @@ memchr = { workspace = true } memmap2.workspace = true once_cell = { workspace = true } phf = { workspace = true } +predicates = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } sysinfo = { workspace = true } terminal_size = { workspace = true } diff --git a/README.md b/README.md index 18a10747..1aa19134 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,11 @@ cargo run --release * A `Q` command (optionally followed by an exit code) quits immediately. * The `q` command can be optionally followed by an exit code. * The `l` command can be optionally followed by the output width. +* The `--follow-symlinks` flag for in-place editing. ### Supported BSD and GNU extensions * The second address in a range can be specified as a relative address with +N. +* In-place editing of file with the `-i` flag. ### New extensions * Unicode characters can be specified in regular expression pattern, replacement diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 0aa9f2b9..3b0d0cbb 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -13,6 +13,7 @@ categories = ["command-line-utilities"] [dependencies] +assert_fs = { workspace = true } clap = { workspace = true } fancy-regex = { workspace = true } memchr = { workspace = true } @@ -20,6 +21,7 @@ regex = { workspace = true } tempfile = { workspace = true } memmap2 = { workspace = true } once_cell = { workspace = true } +predicates = { workspace = true } terminal_size = { workspace = true } uucore = { workspace = true } diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 6b8a821e..c101163d 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -525,6 +525,20 @@ impl OutputBuffer { } } +/// Implementation of the std::io::Write trait +impl Write for OutputBuffer { + fn write(&mut self, buf: &[u8]) -> io::Result { + let s = + std::str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + self.write_str(s)?; + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + self.flush() + } +} + #[cfg(unix)] impl OutputBuffer { /// Schedule the specified output chunk for eventual output diff --git a/src/uu/sed/src/in_place.rs b/src/uu/sed/src/in_place.rs index eb4d9403..acdf8aee 100644 --- a/src/uu/sed/src/in_place.rs +++ b/src/uu/sed/src/in_place.rs @@ -8,39 +8,298 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use std::fs; +use std::io::stdout; +use std::path::{Path, PathBuf}; + +#[cfg(unix)] +use std::os::unix::fs::MetadataExt; +#[cfg(unix)] +use std::os::unix::fs::PermissionsExt; + +use tempfile::NamedTempFile; +use uucore::display::Quotable; +use uucore::error::{FromIo, UIoError, UResult, USimpleError}; + use crate::command::ProcessingContext; use crate::fast_io::OutputBuffer; -use std::io::stdout; -use std::path::Path; -use uucore::error::UResult; /// Context for in-place editing pub struct InPlace { pub output: OutputBuffer, - pub processing_context: ProcessingContext, + pub in_place: bool, + pub in_place_suffix: Option, + pub follow_symlinks: bool, + pub temp_file: Option, + pub original_path: Option, } impl InPlace { - /// Create a new `ProcessingContext` taking ownership of processing_context - pub fn new(processing_context: ProcessingContext) -> UResult { - let output = OutputBuffer::new(Box::new(stdout())); + /// Create an in-place editing engine based on ProcessingContext. + /// Depending on its settings it may or may not perform in-place + /// editing, backup the original file, or follow symlinks. + pub fn new(context: ProcessingContext) -> Self { + Self { + output: OutputBuffer::new(Box::new(stdout())), + in_place: context.in_place, + in_place_suffix: context.in_place_suffix, + follow_symlinks: context.follow_symlinks, + temp_file: None, + original_path: None, + } + } - Ok(InPlace { - output, - processing_context, - }) + /// Return an OutputBuffer for outputting the edits to the specified file. + /// The file may be a symbolic link, which will be processed according + /// to the context specification. + pub fn begin(&mut self, file_name: &Path) -> UResult<&mut OutputBuffer> { + let resolved = if self.follow_symlinks { + fs::canonicalize(file_name) + .map_err_context(|| format!("resolving symlink {}", file_name.quote()))? + } else { + file_name.to_path_buf() + }; + self.begin_resolved(&resolved) } /// Return an OutputBuffer for outputting the edits to the specified file. - pub fn begin(&mut self, _file_name: &Path) -> UResult<&mut OutputBuffer> { - // TODO: Adjust output for in-place editing, if needed. + /// The passed file name should have resolved symbolic links according + /// to the context settings. + fn begin_resolved(&mut self, file_name: &Path) -> UResult<&mut OutputBuffer> { + if !self.in_place { + self.output = OutputBuffer::new(Box::new(stdout())); + return Ok(&mut self.output); + } + + let metadata = fs::metadata(file_name).map_err_context(|| { + format!( + "error Reading metadata of {} for in-place edit", + file_name.quote() + ) + })?; + + if !metadata.is_file() { + return Err(USimpleError::new( + 2, + format!( + "cannot in-place edit non-regular file {}", + file_name.quote() + ), + )); + } + + let dir = file_name.parent().unwrap_or_else(|| Path::new(".")); + let temp_file = NamedTempFile::new_in(dir) + .map_err_context(|| format!("error creating temporary file in {}", dir.quote()))?; + + // TODO: On Unix use fchown(metadata.{uid,dig}) and fchmod(mode) + // on let fd = temp_file.as_file().as_raw_fd() when uucore::libc + // support them. + #[cfg(unix)] + { + let mode = metadata.mode() & 0o7777; + let perms = fs::Permissions::from_mode(mode); + fs::set_permissions(temp_file.path(), perms)?; + } + + let output = OutputBuffer::new(Box::new( + temp_file.reopen().expect("reopening NamedTempFile"), + )); + self.output = output; + self.temp_file = Some(temp_file); + self.original_path = Some(file_name.to_path_buf()); + Ok(&mut self.output) } - /// Finish in-place editing. + /// Finish (potentially in-place) editing. pub fn end(&mut self) -> UResult<()> { self.output.flush()?; - // TODO: Rename and delete output file, if needed. + + if !self.in_place { + return Ok(()); + } + + let orig = self.original_path.take().expect("original_path unset"); + let temp = self.temp_file.take().expect("temp_file unset"); + + // Backup original if suffix is provided + if let Some(ref suffix) = self.in_place_suffix { + let mut backup_path = orig.clone(); + let file_name = backup_path + .file_name() + .expect("Missing file name for backup") + .to_os_string(); + let mut backup_name = file_name; + backup_name.push(suffix); + backup_path.set_file_name(backup_name); + + #[cfg(windows)] + // Try to remove to ensure the rename won't fail on Windows. + let _ = fs::remove_file(&backup_path); + + fs::rename(&orig, &backup_path).map_err_context(|| { + format!( + "error backing up {} to {}", + orig.quote(), + backup_path.quote() + ) + })?; + } else { + #[cfg(windows)] + // On Windows delete the original file for temp.persist to work + if orig.exists() { + fs::remove_file(&orig).map_err_context(|| { + format!("error removing original input file {}", orig.quote()) + })?; + } + } + + // Atomically replace the original + match temp.persist(&orig) { + Ok(_) => {} + Err(e) => { + return Err(UIoError::new( + e.error.kind(), + format!( + "error persisting temporary file {} to {}", + e.file.path().quote(), + orig.quote() + ), + )); + } + } + Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use assert_fs::TempDir; + use assert_fs::fixture::PathChild; + use std::fs; + use std::io::{Read, Write}; + use std::path::Path; + + fn minimal_context() -> ProcessingContext { + ProcessingContext { + in_place: false, + in_place_suffix: None, + follow_symlinks: false, + // fill in default values for the rest as needed + ..Default::default() + } + } + + fn write_original(file: &Path, content: &str) { + fs::write(file, content).unwrap(); + } + + fn read_file(file: &Path) -> String { + let mut contents = String::new(); + fs::File::open(file) + .unwrap() + .read_to_string(&mut contents) + .unwrap(); + contents + } + + #[test] + fn test_in_place_editing() { + let temp = TempDir::new().unwrap(); + let file = temp.child("file.txt"); + write_original(file.path(), "original\n"); + + let mut ctx = minimal_context(); + ctx.in_place = true; + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(file.path()).unwrap(); + write!(buf, "updated\n").unwrap(); + inplace.end().unwrap(); + + assert_eq!(read_file(file.path()), "updated\n"); + } + + #[test] + fn test_in_place_backup() { + let temp = TempDir::new().unwrap(); + let file = temp.child("file.txt"); + let backup = temp.child("file.txt.bak"); + write_original(file.path(), "original\n"); + + let mut ctx = minimal_context(); + ctx.in_place = true; + ctx.in_place_suffix = Some(".bak".to_string()); + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(file.path()).unwrap(); + write!(buf, "new content\n").unwrap(); + inplace.end().unwrap(); + + assert_eq!(read_file(file.path()), "new content\n"); + assert_eq!(read_file(backup.path()), "original\n"); + } + + #[cfg(unix)] + #[test] + fn test_symlink_follow_true() { + let temp = TempDir::new().unwrap(); + let real = temp.child("target.txt"); + let link = temp.child("link.txt"); + + write_original(real.path(), "real\n"); + std::os::unix::fs::symlink(real.path(), link.path()).unwrap(); + + let mut ctx = minimal_context(); + ctx.in_place = true; + ctx.follow_symlinks = true; + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(link.path()).unwrap(); + write!(buf, "changed\n").unwrap(); + inplace.end().unwrap(); + + assert_eq!(read_file(real.path()), "changed\n"); + assert!(link.path().exists()); // Symlink still exists + } + + #[cfg(unix)] + #[test] + fn test_symlink_follow_false() { + let temp = TempDir::new().unwrap(); + let real = temp.child("target.txt"); + let link = temp.child("link.txt"); + + write_original(real.path(), "real\n"); + std::os::unix::fs::symlink(real.path(), link.path()).unwrap(); + + let mut ctx = minimal_context(); + ctx.in_place = true; + ctx.follow_symlinks = false; + + let mut inplace = InPlace::new(ctx); + let buf = inplace.begin(link.path()).unwrap(); + write!(buf, "linked\n").unwrap(); + inplace.end().unwrap(); + + // real file should remain untouched + assert_eq!(read_file(real.path()), "real\n"); + + // link (symlink path) now contains the new content + let contents = read_file(link.path()); + assert_eq!(contents, "linked\n"); + } + + #[test] + fn test_no_in_place_outputs_to_stdout() { + let mut ctx = minimal_context(); + ctx.in_place = false; + + let mut inplace = InPlace::new(ctx); + let _buf = inplace.begin(Path::new("fake.txt")).unwrap(); + assert!(inplace.end().is_ok()); + } +} diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index ad8b20bf..e1f01764 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -656,7 +656,7 @@ pub fn process_all_files( ) -> UResult<()> { context.unbuffered = context.unbuffered || io::stdout().is_terminal(); - let mut in_place = InPlace::new(context.clone())?; + let mut in_place = InPlace::new(context.clone()); let last_file_index = files.len() - 1; for (index, path) in files.iter().enumerate() { diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 8030b8bb..20dac473 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -9,7 +9,11 @@ // file that was distributed with this source code. use std::fs; -use std::io::Read; +use std::io::{Read, Write}; + +#[cfg(unix)] +use assert_fs::fixture::{FileWriteStr, PathChild}; + use tempfile::NamedTempFile; use uutests::new_ucmd; use uutests::util::TestScenario; @@ -683,6 +687,205 @@ check_output!(list_ascii, ["-n", "l 60", "input/ascii"]); check_output!(list_empty, ["-n", "l 60", "input/empty"]); check_output!(list_unicode, ["l 60", "input/unicode"]); +//////////////////////////////////////////////////////////// +// In-place editing +#[test] +fn in_place_edit_replace() -> std::io::Result<()> { + let mut temp = NamedTempFile::new()?; + writeln!(temp.as_file_mut(), "hello, world")?; + + // Get the file path before converting to TempPath + let path = temp.path().to_path_buf(); + + // Close temp file and preserve path + let temp_path = temp.into_temp_path(); + + // Call your tool on the path + new_ucmd!() + .args(&["-i", "-e", "s/world/universe/", path.to_str().unwrap()]) + .succeeds(); + + // Read the file using standard fs + let actual = std::fs::read_to_string(&path)?; + temp_path.close()?; // Clean up + + assert_eq!(actual, "hello, universe\n"); + Ok(()) +} + +#[test] +fn in_place_edit_backup() -> std::io::Result<()> { + let mut temp = NamedTempFile::new()?; + writeln!(temp.as_file_mut(), "hello, world")?; + + let path = temp.path().to_path_buf(); + let temp_path = temp.into_temp_path(); + + // Run the sed-like command with -i (in-place edit) + new_ucmd!() + .args(&[ + "-i", + ".bak", + "-e", + "s/world/universe/", + path.to_str().unwrap(), + ]) + .succeeds(); + + // Read edited file + let actual = std::fs::read_to_string(&path)?; + assert_eq!(actual, "hello, universe\n"); + + // Read backup file + let backup_path = path.with_file_name(format!( + "{}.bak", + path.file_name().unwrap().to_string_lossy() + )); + let backup = std::fs::read_to_string(&backup_path)?; + assert_eq!(backup, "hello, world\n"); + + temp_path.close()?; // Cleanup + std::fs::remove_file(backup_path)?; // Cleanup backup + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_follow_symlink_edits_target() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + new_ucmd!() + .args(&[ + "--follow-symlinks", + "-i", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + let actual = std::fs::read_to_string(target.path())?; + assert_eq!(actual, "hello, universe\n"); + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_symlink_replaced_when_not_following() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + // Run command without --follow-symlinks + new_ucmd!() + .args(&[ + "-i", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + // The original target should be untouched + let original = std::fs::read_to_string(target.path())?; + assert_eq!(original, "hello, world\n"); + + // The symlink path should now contain the edited content + let edited = std::fs::read_to_string(link.path())?; + assert_eq!(edited, "hello, universe\n"); + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_follow_symlink_with_backup() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + new_ucmd!() + .args(&[ + "--follow-symlinks", + "-i", + ".bak", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + // Verify target was modified + let edited = std::fs::read_to_string(target.path())?; + assert_eq!(edited, "hello, universe\n"); + + // Backup file is created alongside the target + let backup_path = target.path().with_file_name(format!( + "{}.bak", + target.file_name().unwrap().to_string_lossy() + )); + let backup = std::fs::read_to_string(&backup_path)?; + assert_eq!(backup, "hello, world\n"); + + Ok(()) +} + +#[cfg(unix)] +#[test] +fn in_place_edit_symlink_replaced_with_backup() -> Result<(), Box> { + let temp_dir = assert_fs::TempDir::new()?; + let target = temp_dir.child("target.txt"); + let link = temp_dir.child("link.txt"); + + target.write_str("hello, world\n")?; + + std::os::unix::fs::symlink(target.path(), link.path())?; + + new_ucmd!() + .args(&[ + "-i", + ".bak", + "-e", + "s/world/universe/", + link.path().to_str().unwrap(), + ]) + .succeeds(); + + // Target should remain untouched + let unchanged = std::fs::read_to_string(target.path())?; + assert_eq!(unchanged, "hello, world\n"); + + // Symlink path should now contain the updated content + let edited = std::fs::read_to_string(link.path())?; + assert_eq!(edited, "hello, universe\n"); + + // Backup of the symlink file (not the target) should exist + let backup_path = link.path().with_file_name(format!( + "{}.bak", + link.file_name().unwrap().to_string_lossy() + )); + let backup = std::fs::read_to_string(&backup_path)?; + assert_eq!(backup, "hello, world\n"); + + Ok(()) +} + //////////////////////////////////////////////////////////// // Large complex scripts