From e9e5ce744d5558692a172fe8aacac82dd40ca1b6 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 23 Mar 2025 16:12:35 +0200 Subject: [PATCH 01/16] Initialize command map statically --- src/uu/sed/src/compiler.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 5d016299..a9ea21a9 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -18,6 +18,9 @@ use uucore::error::{UResult, USimpleError}; // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); +// A global, immutable map of command properties, initialized on first access +static CMD_MAP: Lazy> = Lazy::new(build_command_map); + // Types of command arguments recognized by the parser #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum CommandArgs { From 543864b0cff150b0060e2589a9a5d21a0376be4b Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 22 Apr 2025 11:20:55 +0300 Subject: [PATCH 02/16] Add compile_char_escape function --- src/uu/sed/src/escape_compiler.rs | 304 ++++++++++++++++++++++++++++++ src/uu/sed/src/sed.rs | 1 + 2 files changed, 305 insertions(+) create mode 100644 src/uu/sed/src/escape_compiler.rs diff --git a/src/uu/sed/src/escape_compiler.rs b/src/uu/sed/src/escape_compiler.rs new file mode 100644 index 00000000..0988f090 --- /dev/null +++ b/src/uu/sed/src/escape_compiler.rs @@ -0,0 +1,304 @@ +// Compile escaped character sequences +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use crate::script_char_provider::ScriptCharProvider; +use std::char; + +/// Return true if c is a valid octal digit +fn is_ascii_octal_digit(c: char) -> bool { + matches!(c, '0'..='7') +} + +/// Compile a numeric character escape and return the corresponding char. +/// Advance line to the first character not part of the escape. +/// ndigits is the maximum number of allowed digits and radix is the value's +/// radix (e.g. 8, 10, 16 for octal, decimal, and hex escapes). +/// Return `None` if no valid character has been specified. +fn compile_numeric_escape( + line: &mut ScriptCharProvider, + is_allowed_char: fn(char) -> bool, + ndigits: u32, + radix: u32, +) -> Option { + let mut valid_chars = Vec::new(); + + for _ in 0..ndigits { + if !line.eol() && is_allowed_char(line.current()) { + valid_chars.push(line.current()); + line.advance(); + } else { + break; + } + } + + if valid_chars.is_empty() { + return None; + } + + let char_string: String = valid_chars.into_iter().collect(); + match u32::from_str_radix(&char_string, radix) + .ok() + .and_then(char::from_u32) + { + Some(decoded) => Some(decoded), + None => panic!("Unable to decode numeric character escape."), + } +} + +/// Transforms the specified character into the corresponding ASCII +/// control character as follows. +/// - Convert lowercase letters to uppercase +/// - XOR the ASCII value with 0x40 (inverts bit 6) +/// +/// Return `None` if the result is not a valid Unicode scalar. +fn create_control_char(x: char) -> Option { + if !x.is_ascii() { + return None; + } + + let mut c = x; + if c.is_ascii_lowercase() { + c = c.to_ascii_uppercase(); + } + + let transformed = (c as u8) ^ 0x40; + char::from_u32(transformed as u32) +} + +/// Compile a character escape valid in all contexts (RE pattern, substitution, +/// transliterarion) and return the corresponding char. +/// Advance line to the first character not part of the escape. +/// Return `None` if an invalid escape has been specified. +pub fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { + match line.current() { + 'a' => { + line.advance(); + Some('\x07') + } + 'f' => { + line.advance(); + Some('\x0c') + } + 'n' => { + line.advance(); + Some('\n') + } + 'r' => { + line.advance(); + Some('\r') + } + 't' => { + line.advance(); + Some('\t') + } + 'v' => { + line.advance(); + Some('\x0b') + } + + 'c' => { + // Control character escape: \cC + line.advance(); // move past 'c' + match create_control_char(line.current()) { + Some(decoded) => { + line.advance(); + Some(decoded) + } + None => Some('c'), + } + } + + 'd' => { + // Decimal escape: \dnnn + line.advance(); // move past 'd' + match compile_numeric_escape(line, |c| c.is_ascii_digit(), 3, 10) { + Some(decoded) => Some(decoded), + None => Some('d'), + } + } + + 'o' => { + // Octal escape: \onnn + line.advance(); // move past 'o' + match compile_numeric_escape(line, is_ascii_octal_digit, 3, 8) { + Some(decoded) => Some(decoded), + None => Some('o'), + } + } + + 'x' => { + // Hexadecimal escape: \xnn + line.advance(); // move past 'x' + match compile_numeric_escape(line, |c| c.is_ascii_hexdigit(), 2, 16) { + Some(decoded) => Some(decoded), + None => Some('x'), + } + } + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // compile_numeric_escape + #[test] + fn test_compile_octal_escape() { + let mut provider = ScriptCharProvider::new("141rest"); + let c = compile_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); + assert_eq!(c, Some('a')); + assert_eq!(provider.current(), 'r'); // "141" was consumed + } + + #[test] + fn test_compile_octal_escape_eol() { + let mut provider = ScriptCharProvider::new("141"); + let c = compile_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); + assert_eq!(c, Some('a')); + assert!(provider.eol()); // "141" was consumed + } + + #[test] + fn test_compile_decimal_escape() { + let mut provider = ScriptCharProvider::new("0659"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + assert_eq!(c, Some('A')); + assert_eq!(provider.current(), '9'); // "65" was consumed + } + + #[test] + fn test_compile_hex_escape() { + let mut provider = ScriptCharProvider::new("3cZ"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); + assert_eq!(c, Some('<')); + assert_eq!(provider.current(), 'Z'); // "41" was consumed + } + + #[test] + fn test_compile_hex_escape_truncated() { + let mut provider = ScriptCharProvider::new("4G"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); + assert_eq!(c, Some('\u{4}')); // Only '4' is valid hex + assert_eq!(provider.current(), 'G'); // "41" was consumed + } + + #[test] + fn test_no_valid_digits() { + let mut provider = ScriptCharProvider::new("xyz"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + assert_eq!(c, None); + assert_eq!(provider.current(), 'x'); // No advancement + } + + // create_control_char + #[test] + fn test_lowercase_letter() { + assert_eq!(create_control_char('z'), Some('\u{1a}')); // 0x5A ^ 0x40 = 0x1A + assert_eq!(create_control_char('a'), Some('\u{01}')); // 0x41 ^ 0x40 = 0x01 + } + + #[test] + fn test_uppercase_letter() { + assert_eq!(create_control_char('Z'), Some('\u{1a}')); + assert_eq!(create_control_char('A'), Some('\u{01}')); + } + + #[test] + fn test_symbol_characters() { + assert_eq!(create_control_char('{'), Some(';')); // 0x7B ^ 0x40 = 0x3B + assert_eq!(create_control_char(';'), Some('{')); // 0x3B ^ 0x40 = 0x7B + } + + #[test] + fn test_non_ascii_char() { + // This will not match any transformation and may panic if it overflows + // But the current function only handles ASCII-safe chars + assert_eq!(create_control_char('é'), None); // outside ASCII + } + + #[test] + fn test_edge_ascii_values() { + assert_eq!(create_control_char('@'), Some('\0')); // 0x40 ^ 0x40 = 0x00 + assert_eq!(create_control_char('\x7F'), Some('\x3F')); // 0x7F ^ 0x40 = 0x3F + } + + // compile_char_escape + fn escape_result_with_current(input: &str) -> (Option, Option) { + let mut provider = ScriptCharProvider::new(input); + let result = compile_char_escape(&mut provider); + let current = if provider.eol() { + None + } else { + Some(provider.current()) + }; + (result, current) + } + #[test] + + fn test_standard_escapes() { + assert_eq!(escape_result_with_current("a"), (Some('\x07'), None)); + assert_eq!(escape_result_with_current("f"), (Some('\x0c'), None)); + assert_eq!(escape_result_with_current("n"), (Some('\n'), None)); + assert_eq!(escape_result_with_current("r"), (Some('\r'), None)); + assert_eq!(escape_result_with_current("t"), (Some('\t'), None)); + assert_eq!(escape_result_with_current("v"), (Some('\x0b'), None)); + } + + #[test] + fn test_escape_invalid() { + assert_eq!(escape_result_with_current("zx"), (None, Some('z'))); + } + + #[test] + fn test_control_escape_valid() { + assert_eq!(escape_result_with_current("cZ"), (Some('\x1A'), None)); + } + + #[test] + fn test_control_escape_invalid() { + assert_eq!(escape_result_with_current("cé"), (Some('c'), Some('é'))); + } + + #[test] + fn test_decimal_escape_valid() { + assert_eq!(escape_result_with_current("d065r"), (Some('A'), Some('r'))); + } + + #[test] + fn test_octal_escape_valid() { + assert_eq!(escape_result_with_current("o141x"), (Some('a'), Some('x'))); + } + + #[test] + fn test_hex_escape_valid() { + assert_eq!(escape_result_with_current("x41;"), (Some('A'), Some(';'))); + } + + #[test] + fn test_decimal_escape_fallback() { + assert_eq!(escape_result_with_current("d;."), (Some('d'), Some(';'))); + } + + #[test] + fn test_octal_escape_fallback() { + assert_eq!(escape_result_with_current("o9x"), (Some('o'), Some('9'))); + } + + #[test] + fn test_hex_escape_fallback() { + assert_eq!(escape_result_with_current("xyz"), (Some('x'), Some('y'))); + } + + #[test] + fn test_unknown_escape() { + assert_eq!(escape_result_with_current("q"), (None, Some('q'))); + } +} diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 2a3dae8b..f2621243 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -10,6 +10,7 @@ pub mod command; pub mod compiler; +pub mod escape_compiler; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; From b320fcbb3c1f9bc7ad4cd702a9b304ad136228f5 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 22 Apr 2025 15:44:09 +0300 Subject: [PATCH 03/16] Support Unicode character escapes --- README.md | 12 +++- src/uu/sed/src/escape_compiler.rs | 99 ++++++++++++++++++++++++-- src/uu/sed/src/script_char_provider.rs | 39 ++++++++++ 3 files changed, 144 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 32dd9fb7..637ab53d 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,9 @@ # sed Rust reimplementation of the [sed utility](https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html) -with some [GNU sed](https://www.gnu.org/software/sed/manual/sed.html) -and [FreeBSD sed](https://man.freebsd.org/cgi/man.cgi?sed(1)) extensions. +with some [GNU sed](https://www.gnu.org/software/sed/manual/sed.html), +[FreeBSD sed](https://man.freebsd.org/cgi/man.cgi?sed(1)), +and other extensions. ## Installation @@ -23,6 +24,13 @@ cd sed cargo build --release cargo run --release ``` +## Extensions +### GNU +* Command-line arguments can be specified in long (`--`) form. + +### Other +* Unicode characters can be specified in regular expression pattern, replacement + and transliteration sequences using `\uXXXX` or `\UXXXXXXXX` sequences. ## License diff --git a/src/uu/sed/src/escape_compiler.rs b/src/uu/sed/src/escape_compiler.rs index 0988f090..63504bc0 100644 --- a/src/uu/sed/src/escape_compiler.rs +++ b/src/uu/sed/src/escape_compiler.rs @@ -18,13 +18,15 @@ fn is_ascii_octal_digit(c: char) -> bool { /// Compile a numeric character escape and return the corresponding char. /// Advance line to the first character not part of the escape. -/// ndigits is the maximum number of allowed digits and radix is the value's +/// ndigits is the number of allowed digits and radix is the value's /// radix (e.g. 8, 10, 16 for octal, decimal, and hex escapes). +/// For values up to 3 ndigits is the maximum number of allowed digits, +/// for values above 3 ndigits is the exact number of allowed digits. /// Return `None` if no valid character has been specified. fn compile_numeric_escape( line: &mut ScriptCharProvider, is_allowed_char: fn(char) -> bool, - ndigits: u32, + ndigits: usize, radix: u32, ) -> Option { let mut valid_chars = Vec::new(); @@ -42,6 +44,11 @@ fn compile_numeric_escape( return None; } + if ndigits > 3 && valid_chars.len() != ndigits { + line.retreat(valid_chars.len()); + return None; + } + let char_string: String = valid_chars.into_iter().collect(); match u32::from_str_radix(&char_string, radix) .ok() @@ -133,6 +140,24 @@ pub fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { } } + 'u' => { + // Short Unicode escape \uXXXX (exactly four hex digits) + line.advance(); // move past 'x' + match compile_numeric_escape(line, |c| c.is_ascii_hexdigit(), 4, 16) { + Some(decoded) => Some(decoded), + None => Some('u'), + } + } + + 'U' => { + // Short Unicode escape \UXXXXXXXX (exactly eight heax digits) + line.advance(); // move past 'x' + match compile_numeric_escape(line, |c| c.is_ascii_hexdigit(), 8, 16) { + Some(decoded) => Some(decoded), + None => Some('U'), + } + } + 'x' => { // Hexadecimal escape: \xnn line.advance(); // move past 'x' @@ -174,6 +199,14 @@ mod tests { assert_eq!(provider.current(), '9'); // "65" was consumed } + #[test] + fn test_compile_decimal_invalid() { + let mut provider = ScriptCharProvider::new("QR"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + assert_eq!(c, None); + assert_eq!(provider.current(), 'Q'); + } + #[test] fn test_compile_hex_escape() { let mut provider = ScriptCharProvider::new("3cZ"); @@ -190,6 +223,41 @@ mod tests { assert_eq!(provider.current(), 'G'); // "41" was consumed } + #[test] + fn test_compile_unicode_escape_short() { + // U+2665 = '♥' + let mut provider = ScriptCharProvider::new("26650"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); + assert_eq!(c, Some('♥')); + assert_eq!(provider.current(), '0'); // "2665" was consumed + } + + #[test] + fn test_compile_unicode_escape_short_invalid() { + let mut provider = ScriptCharProvider::new("123Q"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); + assert_eq!(c, None); + assert_eq!(provider.current(), '1'); + } + + #[test] + fn test_compile_unicode_escape_long_invalid() { + // U+2665 = '♥' + let mut provider = ScriptCharProvider::new("1234567Q"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); + assert_eq!(c, None); + assert_eq!(provider.current(), '1'); + } + + #[test] + fn test_compile_unicode_escape_long() { + // U+1F600 = 😀 + let mut provider = ScriptCharProvider::new("0001F6009"); + let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); + assert_eq!(c, Some('😀')); + assert_eq!(provider.current(), '9'); // "0001F600" was consumed + } + #[test] fn test_no_valid_digits() { let mut provider = ScriptCharProvider::new("xyz"); @@ -241,9 +309,9 @@ mod tests { }; (result, current) } - #[test] - fn test_standard_escapes() { + #[test] + fn test_standard_escapes_eol() { assert_eq!(escape_result_with_current("a"), (Some('\x07'), None)); assert_eq!(escape_result_with_current("f"), (Some('\x0c'), None)); assert_eq!(escape_result_with_current("n"), (Some('\n'), None)); @@ -252,6 +320,16 @@ mod tests { assert_eq!(escape_result_with_current("v"), (Some('\x0b'), None)); } + #[test] + fn test_standard_escapes_more() { + assert_eq!(escape_result_with_current("a."), (Some('\x07'), Some('.'))); + assert_eq!(escape_result_with_current("f."), (Some('\x0c'), Some('.'))); + assert_eq!(escape_result_with_current("n."), (Some('\n'), Some('.'))); + assert_eq!(escape_result_with_current("r."), (Some('\r'), Some('.'))); + assert_eq!(escape_result_with_current("t."), (Some('\t'), Some('.'))); + assert_eq!(escape_result_with_current("v."), (Some('\x0b'), Some('.'))); + } + #[test] fn test_escape_invalid() { assert_eq!(escape_result_with_current("zx"), (None, Some('z'))); @@ -282,6 +360,19 @@ mod tests { assert_eq!(escape_result_with_current("x41;"), (Some('A'), Some(';'))); } + #[test] + fn test_short_unicode_escape_valid() { + assert_eq!(escape_result_with_current("u2665;"), (Some('♥'), Some(';'))); + } + + #[test] + fn test_long_unicode_escape_valid() { + assert_eq!( + escape_result_with_current("U0001F600;"), + (Some('😀'), Some(';')) + ); + } + #[test] fn test_decimal_escape_fallback() { assert_eq!(escape_result_with_current("d;."), (Some('d'), Some(';'))); diff --git a/src/uu/sed/src/script_char_provider.rs b/src/uu/sed/src/script_char_provider.rs index 6f404693..bb537f3c 100644 --- a/src/uu/sed/src/script_char_provider.rs +++ b/src/uu/sed/src/script_char_provider.rs @@ -28,6 +28,15 @@ impl ScriptCharProvider { } } + /// Retreats current position by specified number or to beginning. + pub fn retreat(&mut self, n: usize) { + if n > self.pos { + self.pos = 0; + } else { + self.pos -= n; + } + } + /// Returns the current character. Panics if out of bounds. pub fn current(&self) -> char { self.line[self.pos] @@ -98,4 +107,34 @@ mod tests { provider.eat_spaces(); assert_eq!(provider.current(), 'a'); } + + #[test] + fn test_retreat_normal() { + let mut chars = ScriptCharProvider::new("abcdef"); + chars.pos = 4; // simulate position at 'e' + chars.retreat(2); + + assert_eq!(chars.get_pos(), 2); + assert_eq!(chars.current(), 'c'); + } + + #[test] + fn test_retreat_to_start() { + let mut chars = ScriptCharProvider::new("abcdef"); + chars.pos = 3; // simulate position at 'd' + chars.retreat(5); // retreat more than current pos + + assert_eq!(chars.get_pos(), 0); + assert_eq!(chars.current(), 'a'); + } + + #[test] + fn test_retreat_zero() { + let mut chars = ScriptCharProvider::new("abcdef"); + chars.pos = 2; // at 'c' + chars.retreat(0); // retreat by 0 + + assert_eq!(chars.get_pos(), 2); + assert_eq!(chars.current(), 'c'); + } } From 6db4ff11bfcb5b0c0c01814ab01a47bc308ed759 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 22 Apr 2025 19:20:16 +0300 Subject: [PATCH 04/16] Implement compile_character_class --- src/uu/sed/src/compiler.rs | 21 +- ...cape_compiler.rs => delimited_compiler.rs} | 272 +++++++++++++++++- src/uu/sed/src/sed.rs | 2 +- 3 files changed, 273 insertions(+), 22 deletions(-) rename src/uu/sed/src/{escape_compiler.rs => delimited_compiler.rs} (59%) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index a9ea21a9..5f44c679 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -9,11 +9,12 @@ // file that was distributed with this source code. use crate::command::{CliOptions, Command, CommandData, ScriptValue}; +use crate::delimited_compiler::compile_error; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use once_cell::sync::Lazy; use std::collections::HashMap; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); @@ -339,24 +340,6 @@ fn compile_command( Ok(ContinueAction::NextLine) } -// Fail with msg as a compile error at the current location -fn compile_error( - lines: &ScriptLineProvider, - line: &ScriptCharProvider, - msg: impl ToString, -) -> UResult { - Err(USimpleError::new( - 1, - format!( - "{}:{}:{}: error: {}", - lines.get_input_name(), - lines.get_line_number(), - line.get_pos(), - msg.to_string() - ), - )) -} - // Return the specification for the command letter at the current line position // checking for diverse errors. fn get_cmd_spec( diff --git a/src/uu/sed/src/escape_compiler.rs b/src/uu/sed/src/delimited_compiler.rs similarity index 59% rename from src/uu/sed/src/escape_compiler.rs rename to src/uu/sed/src/delimited_compiler.rs index 63504bc0..675fc88f 100644 --- a/src/uu/sed/src/escape_compiler.rs +++ b/src/uu/sed/src/delimited_compiler.rs @@ -1,4 +1,4 @@ -// Compile escaped character sequences +// Compile delimited character sequences // // SPDX-License-Identifier: MIT // Copyright (c) 2025 Diomidis Spinellis @@ -9,7 +9,27 @@ // file that was distributed with this source code. use crate::script_char_provider::ScriptCharProvider; +use crate::script_line_provider::ScriptLineProvider; use std::char; +use uucore::error::{UResult, USimpleError}; + +// Fail with msg as a compile error at the current location +pub fn compile_error( + lines: &ScriptLineProvider, + line: &ScriptCharProvider, + msg: impl ToString, +) -> UResult { + Err(USimpleError::new( + 1, + format!( + "{}:{}:{}: error: {}", + lines.get_input_name(), + lines.get_line_number(), + line.get_pos(), + msg.to_string() + ), + )) +} /// Return true if c is a valid octal digit fn is_ascii_octal_digit(c: char) -> bool { @@ -81,9 +101,10 @@ fn create_control_char(x: char) -> Option { /// Compile a character escape valid in all contexts (RE pattern, substitution, /// transliterarion) and return the corresponding char. +/// At entry line.current() must have advanced after the `\\`. /// Advance line to the first character not part of the escape. /// Return `None` if an invalid escape has been specified. -pub fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { +fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { match line.current() { 'a' => { line.advance(); @@ -170,6 +191,124 @@ pub fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { } } +/// Compile a POSIX RE character class returning it as a string. +/// This functionality is needed to avoid terminating delimited +/// sequences when a delimiter appears within a character class. +/// While at it, handle escaped characters for the sake of consistency. +pub fn compile_character_class( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult { + let mut result = String::new(); + + if line.eol() || line.current() != '[' { + panic!("Invalid character class."); + } + + line.advance(); + result.push('['); + + // Optional negation + if !line.eol() && line.current() == '^' { + result.push('^'); + line.advance(); + } + + // Optional leading ']' inside the class + if !line.eol() && line.current() == ']' { + result.push(']'); + line.advance(); + } + + while !line.eol() { + let ch = line.current(); + + if ch == ']' { + result.push(']'); + line.advance(); + return Ok(result); + } + + if ch == '[' { + line.advance(); + if !line.eol() { + let marker = line.current(); + // POSIX character class, collating symbol, or equivalence + if marker == ':' || marker == '.' || marker == '=' { + line.advance(); + + result.push('['); + result.push(marker); + + let mut inner = String::new(); + let mut terminated = false; + + while !line.eol() { + let c = line.current(); + if c == marker { + line.advance(); + if !line.eol() && line.current() == ']' { + line.advance(); + result.push_str(&inner); + result.push(marker); + result.push(']'); + terminated = true; + break; + } else { + // False alarm, just part of the inner name + inner.push(marker); + } + } else { + inner.push(c); + line.advance(); + } + } + + if !terminated { + return compile_error( + lines, + line, + "Unterminated POSIX character class, equivalence or collating symbol", + ); + } + + continue; + } else { + // Not a POSIX construct — treat as literal + result.push('['); + result.push(marker); + line.advance(); + continue; + } + } else { + result.push('['); + continue; + } + } + + if ch == '\\' { + // Handle escape sequence + line.advance(); + if line.eol() { + break; + } + match compile_char_escape(line) { + Some(decoded) => result.push(decoded), + None => { + result.push('\\'); + result.push(line.current()); + line.advance(); + } + } + } else { + result.push(ch); + line.advance(); + } + } + + compile_error(lines, line, "Unterminated bracket expression") +} + #[cfg(test)] mod tests { use super::*; @@ -392,4 +531,133 @@ mod tests { fn test_unknown_escape() { assert_eq!(escape_result_with_current("q"), (None, Some('q'))); } + + // compile_character_class + fn char_provider_from(input: &str) -> ScriptCharProvider { + ScriptCharProvider::new(input) + } + + fn test_lines() -> ScriptLineProvider { + ScriptLineProvider::with_active_state("test.sed", 3) + } + + #[test] + fn test_basic_character_class() { + let mut line = char_provider_from("[qr]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[qr]"); + } + + #[test] + fn test_negated_class() { + let mut line = char_provider_from("[^abc]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[^abc]"); + } + + #[test] + fn test_leading_close_bracket() { + let mut line = char_provider_from("[]abc]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[]abc]"); + } + + #[test] + fn test_leading_negated_close_bracket() { + let mut line = char_provider_from("[^]abc]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[^]abc]"); + } + + #[test] + fn test_escaped_character_begin() { + let mut line = char_provider_from("[\\nabc]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[\nabc]"); + } + + #[test] + fn test_escaped_character_middle() { + let mut line = char_provider_from("[a\\nbc]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[a\nbc]"); + } + + #[test] + fn test_escaped_character_end() { + let mut line = char_provider_from("[abc\\n]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[abc\n]"); + } + + #[test] + fn test_escaped_delimiter() { + let mut line = char_provider_from("[a\\]bc]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[a\\]bc]"); + } + + #[test] + fn test_posix_class() { + let mut line = char_provider_from("[[:digit:]]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[:digit:]]"); + } + + #[test] + fn test_equivalence_class() { + let mut line = char_provider_from("[[=a=]]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[=a=]]"); + } + + #[test] + fn test_collating_symbol() { + let mut line = char_provider_from("[[.ch.]]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[.ch.]]"); + } + + #[test] + fn test_unterminated_class_error() { + let mut line = char_provider_from("[abc"); // missing closing ] + let lines = test_lines(); + let err = compile_character_class(&lines, &mut line); + assert!(err.is_err()); + } + + #[test] + fn test_unterminated_posix_class_error() { + let mut line = char_provider_from("[[:digit:]"); + let lines = test_lines(); + let err = compile_character_class(&lines, &mut line); + assert!(err.is_err()); + } + + #[test] + fn test_unterminated_escape_error() { + let mut line = char_provider_from("[abc\\"); // missing closing ] + let lines = test_lines(); + let err = compile_character_class(&lines, &mut line); + assert!(err.is_err()); + } + + #[test] + fn test_malformed_posix_like_pattern_treated_as_literal() { + let mut line = char_provider_from("[[x]yz]"); + let lines = test_lines(); + let result = compile_character_class(&lines, &mut line).unwrap(); + assert_eq!(result, "[[x]"); + } } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index f2621243..d1caf285 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -10,7 +10,7 @@ pub mod command; pub mod compiler; -pub mod escape_compiler; +pub mod delimited_compiler; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; From 1e6012d0e99a528492cadea910b0f7d60ef3fb41 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 22 Apr 2025 23:33:54 +0300 Subject: [PATCH 05/16] Improve naming for parse routines --- src/uu/sed/src/compiler.rs | 20 ++-- ...imited_compiler.rs => delimited_parser.rs} | 92 +++++++++---------- src/uu/sed/src/sed.rs | 2 +- 3 files changed, 57 insertions(+), 57 deletions(-) rename src/uu/sed/src/{delimited_compiler.rs => delimited_parser.rs} (85%) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 5f44c679..709a40b9 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -9,7 +9,7 @@ // file that was distributed with this source code. use crate::command::{CliOptions, Command, CommandData, ScriptValue}; -use crate::delimited_compiler::compile_error; +use crate::delimited_parser::compilation_error; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use once_cell::sync::Lazy; @@ -305,7 +305,7 @@ fn compile_command( return Ok(ContinueAction::NextChar); } if !line.eol() { - return compile_error( + return compilation_error( lines, line, format!("extra characters at the end of the {} command", cmd.code), @@ -348,19 +348,19 @@ fn get_cmd_spec( n_addr: usize, ) -> UResult<&'static CommandSpec> { if line.eol() { - return compile_error(lines, line, "command expected"); + return compilation_error(lines, line, "command expected"); } let ch = line.current(); let opt_cmd_spec = lookup_command(ch); if opt_cmd_spec.is_none() { - return compile_error(lines, line, format!("invalid command code {}", ch)); + return compilation_error(lines, line, format!("invalid command code {}", ch)); } let cmd_spec = opt_cmd_spec.unwrap(); if n_addr > cmd_spec.n_addr { - return compile_error( + return compilation_error( lines, line, format!( @@ -478,9 +478,9 @@ mod tests { ScriptCharProvider::new(s) } - // compile_error + // compilation_error #[test] - fn test_compile_error_message_format() { + fn test_compilation_error_message_format() { let lines = ScriptLineProvider::with_active_state("test.sed", 42); let mut line = char_provider_from("whatever"); line.advance(); // move to position 1 @@ -489,7 +489,7 @@ mod tests { line.advance(); // now at position 4 let msg = "unexpected token"; - let result: UResult<()> = compile_error(&lines, &line, msg); + let result: UResult<()> = compilation_error(&lines, &line, msg); assert!(result.is_err()); @@ -500,13 +500,13 @@ mod tests { } #[test] - fn test_compile_error_with_format_message() { + fn test_compilation_error_with_format_message() { let lines = ScriptLineProvider::with_active_state("input.txt", 3); let line = char_provider_from("x"); // We're at position 0 let result: UResult<()> = - compile_error(&lines, &line, format!("invalid command '{}'", 'x')); + compilation_error(&lines, &line, format!("invalid command '{}'", 'x')); assert!(result.is_err()); diff --git a/src/uu/sed/src/delimited_compiler.rs b/src/uu/sed/src/delimited_parser.rs similarity index 85% rename from src/uu/sed/src/delimited_compiler.rs rename to src/uu/sed/src/delimited_parser.rs index 675fc88f..0020c881 100644 --- a/src/uu/sed/src/delimited_compiler.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -1,4 +1,4 @@ -// Compile delimited character sequences +// Parse delimited character sequences // // SPDX-License-Identifier: MIT // Copyright (c) 2025 Diomidis Spinellis @@ -14,7 +14,7 @@ use std::char; use uucore::error::{UResult, USimpleError}; // Fail with msg as a compile error at the current location -pub fn compile_error( +pub fn compilation_error( lines: &ScriptLineProvider, line: &ScriptCharProvider, msg: impl ToString, @@ -36,14 +36,14 @@ fn is_ascii_octal_digit(c: char) -> bool { matches!(c, '0'..='7') } -/// Compile a numeric character escape and return the corresponding char. +/// Parse a numeric character escape and return the corresponding char. /// Advance line to the first character not part of the escape. /// ndigits is the number of allowed digits and radix is the value's /// radix (e.g. 8, 10, 16 for octal, decimal, and hex escapes). /// For values up to 3 ndigits is the maximum number of allowed digits, /// for values above 3 ndigits is the exact number of allowed digits. /// Return `None` if no valid character has been specified. -fn compile_numeric_escape( +fn parse_numeric_escape( line: &mut ScriptCharProvider, is_allowed_char: fn(char) -> bool, ndigits: usize, @@ -99,12 +99,12 @@ fn create_control_char(x: char) -> Option { char::from_u32(transformed as u32) } -/// Compile a character escape valid in all contexts (RE pattern, substitution, +/// Parse a character escape valid in all contexts (RE pattern, substitution, /// transliterarion) and return the corresponding char. /// At entry line.current() must have advanced after the `\\`. /// Advance line to the first character not part of the escape. /// Return `None` if an invalid escape has been specified. -fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { +fn parse_char_escape(line: &mut ScriptCharProvider) -> Option { match line.current() { 'a' => { line.advance(); @@ -146,7 +146,7 @@ fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { 'd' => { // Decimal escape: \dnnn line.advance(); // move past 'd' - match compile_numeric_escape(line, |c| c.is_ascii_digit(), 3, 10) { + match parse_numeric_escape(line, |c| c.is_ascii_digit(), 3, 10) { Some(decoded) => Some(decoded), None => Some('d'), } @@ -155,7 +155,7 @@ fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { 'o' => { // Octal escape: \onnn line.advance(); // move past 'o' - match compile_numeric_escape(line, is_ascii_octal_digit, 3, 8) { + match parse_numeric_escape(line, is_ascii_octal_digit, 3, 8) { Some(decoded) => Some(decoded), None => Some('o'), } @@ -164,7 +164,7 @@ fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { 'u' => { // Short Unicode escape \uXXXX (exactly four hex digits) line.advance(); // move past 'x' - match compile_numeric_escape(line, |c| c.is_ascii_hexdigit(), 4, 16) { + match parse_numeric_escape(line, |c| c.is_ascii_hexdigit(), 4, 16) { Some(decoded) => Some(decoded), None => Some('u'), } @@ -173,7 +173,7 @@ fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { 'U' => { // Short Unicode escape \UXXXXXXXX (exactly eight heax digits) line.advance(); // move past 'x' - match compile_numeric_escape(line, |c| c.is_ascii_hexdigit(), 8, 16) { + match parse_numeric_escape(line, |c| c.is_ascii_hexdigit(), 8, 16) { Some(decoded) => Some(decoded), None => Some('U'), } @@ -182,7 +182,7 @@ fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { 'x' => { // Hexadecimal escape: \xnn line.advance(); // move past 'x' - match compile_numeric_escape(line, |c| c.is_ascii_hexdigit(), 2, 16) { + match parse_numeric_escape(line, |c| c.is_ascii_hexdigit(), 2, 16) { Some(decoded) => Some(decoded), None => Some('x'), } @@ -191,11 +191,11 @@ fn compile_char_escape(line: &mut ScriptCharProvider) -> Option { } } -/// Compile a POSIX RE character class returning it as a string. +/// Parse a POSIX RE character class returning it as a string. /// This functionality is needed to avoid terminating delimited /// sequences when a delimiter appears within a character class. /// While at it, handle escaped characters for the sake of consistency. -pub fn compile_character_class( +pub fn parse_character_class( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, ) -> UResult { @@ -265,7 +265,7 @@ pub fn compile_character_class( } if !terminated { - return compile_error( + return compilation_error( lines, line, "Unterminated POSIX character class, equivalence or collating symbol", @@ -292,7 +292,7 @@ pub fn compile_character_class( if line.eol() { break; } - match compile_char_escape(line) { + match parse_char_escape(line) { Some(decoded) => result.push(decoded), None => { result.push('\\'); @@ -306,18 +306,18 @@ pub fn compile_character_class( } } - compile_error(lines, line, "Unterminated bracket expression") + compilation_error(lines, line, "Unterminated bracket expression") } #[cfg(test)] mod tests { use super::*; - // compile_numeric_escape + // parse_numeric_escape #[test] fn test_compile_octal_escape() { let mut provider = ScriptCharProvider::new("141rest"); - let c = compile_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); + let c = parse_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); assert_eq!(c, Some('a')); assert_eq!(provider.current(), 'r'); // "141" was consumed } @@ -325,7 +325,7 @@ mod tests { #[test] fn test_compile_octal_escape_eol() { let mut provider = ScriptCharProvider::new("141"); - let c = compile_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); + let c = parse_numeric_escape(&mut provider, is_ascii_octal_digit, 3, 8); assert_eq!(c, Some('a')); assert!(provider.eol()); // "141" was consumed } @@ -333,7 +333,7 @@ mod tests { #[test] fn test_compile_decimal_escape() { let mut provider = ScriptCharProvider::new("0659"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); assert_eq!(c, Some('A')); assert_eq!(provider.current(), '9'); // "65" was consumed } @@ -341,7 +341,7 @@ mod tests { #[test] fn test_compile_decimal_invalid() { let mut provider = ScriptCharProvider::new("QR"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); assert_eq!(c, None); assert_eq!(provider.current(), 'Q'); } @@ -349,7 +349,7 @@ mod tests { #[test] fn test_compile_hex_escape() { let mut provider = ScriptCharProvider::new("3cZ"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); assert_eq!(c, Some('<')); assert_eq!(provider.current(), 'Z'); // "41" was consumed } @@ -357,7 +357,7 @@ mod tests { #[test] fn test_compile_hex_escape_truncated() { let mut provider = ScriptCharProvider::new("4G"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 2, 16); assert_eq!(c, Some('\u{4}')); // Only '4' is valid hex assert_eq!(provider.current(), 'G'); // "41" was consumed } @@ -366,7 +366,7 @@ mod tests { fn test_compile_unicode_escape_short() { // U+2665 = '♥' let mut provider = ScriptCharProvider::new("26650"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); assert_eq!(c, Some('♥')); assert_eq!(provider.current(), '0'); // "2665" was consumed } @@ -374,7 +374,7 @@ mod tests { #[test] fn test_compile_unicode_escape_short_invalid() { let mut provider = ScriptCharProvider::new("123Q"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 4, 16); assert_eq!(c, None); assert_eq!(provider.current(), '1'); } @@ -383,7 +383,7 @@ mod tests { fn test_compile_unicode_escape_long_invalid() { // U+2665 = '♥' let mut provider = ScriptCharProvider::new("1234567Q"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); assert_eq!(c, None); assert_eq!(provider.current(), '1'); } @@ -392,7 +392,7 @@ mod tests { fn test_compile_unicode_escape_long() { // U+1F600 = 😀 let mut provider = ScriptCharProvider::new("0001F6009"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_hexdigit(), 8, 16); assert_eq!(c, Some('😀')); assert_eq!(provider.current(), '9'); // "0001F600" was consumed } @@ -400,7 +400,7 @@ mod tests { #[test] fn test_no_valid_digits() { let mut provider = ScriptCharProvider::new("xyz"); - let c = compile_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); + let c = parse_numeric_escape(&mut provider, |c| c.is_ascii_digit(), 3, 10); assert_eq!(c, None); assert_eq!(provider.current(), 'x'); // No advancement } @@ -437,10 +437,10 @@ mod tests { assert_eq!(create_control_char('\x7F'), Some('\x3F')); // 0x7F ^ 0x40 = 0x3F } - // compile_char_escape + // parse_char_escape fn escape_result_with_current(input: &str) -> (Option, Option) { let mut provider = ScriptCharProvider::new(input); - let result = compile_char_escape(&mut provider); + let result = parse_char_escape(&mut provider); let current = if provider.eol() { None } else { @@ -532,7 +532,7 @@ mod tests { assert_eq!(escape_result_with_current("q"), (None, Some('q'))); } - // compile_character_class + // parse_character_class fn char_provider_from(input: &str) -> ScriptCharProvider { ScriptCharProvider::new(input) } @@ -545,7 +545,7 @@ mod tests { fn test_basic_character_class() { let mut line = char_provider_from("[qr]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[qr]"); } @@ -553,7 +553,7 @@ mod tests { fn test_negated_class() { let mut line = char_provider_from("[^abc]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[^abc]"); } @@ -561,7 +561,7 @@ mod tests { fn test_leading_close_bracket() { let mut line = char_provider_from("[]abc]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[]abc]"); } @@ -569,7 +569,7 @@ mod tests { fn test_leading_negated_close_bracket() { let mut line = char_provider_from("[^]abc]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[^]abc]"); } @@ -577,7 +577,7 @@ mod tests { fn test_escaped_character_begin() { let mut line = char_provider_from("[\\nabc]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[\nabc]"); } @@ -585,7 +585,7 @@ mod tests { fn test_escaped_character_middle() { let mut line = char_provider_from("[a\\nbc]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[a\nbc]"); } @@ -593,7 +593,7 @@ mod tests { fn test_escaped_character_end() { let mut line = char_provider_from("[abc\\n]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[abc\n]"); } @@ -601,7 +601,7 @@ mod tests { fn test_escaped_delimiter() { let mut line = char_provider_from("[a\\]bc]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[a\\]bc]"); } @@ -609,7 +609,7 @@ mod tests { fn test_posix_class() { let mut line = char_provider_from("[[:digit:]]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[[:digit:]]"); } @@ -617,7 +617,7 @@ mod tests { fn test_equivalence_class() { let mut line = char_provider_from("[[=a=]]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[[=a=]]"); } @@ -625,7 +625,7 @@ mod tests { fn test_collating_symbol() { let mut line = char_provider_from("[[.ch.]]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[[.ch.]]"); } @@ -633,7 +633,7 @@ mod tests { fn test_unterminated_class_error() { let mut line = char_provider_from("[abc"); // missing closing ] let lines = test_lines(); - let err = compile_character_class(&lines, &mut line); + let err = parse_character_class(&lines, &mut line); assert!(err.is_err()); } @@ -641,7 +641,7 @@ mod tests { fn test_unterminated_posix_class_error() { let mut line = char_provider_from("[[:digit:]"); let lines = test_lines(); - let err = compile_character_class(&lines, &mut line); + let err = parse_character_class(&lines, &mut line); assert!(err.is_err()); } @@ -649,7 +649,7 @@ mod tests { fn test_unterminated_escape_error() { let mut line = char_provider_from("[abc\\"); // missing closing ] let lines = test_lines(); - let err = compile_character_class(&lines, &mut line); + let err = parse_character_class(&lines, &mut line); assert!(err.is_err()); } @@ -657,7 +657,7 @@ mod tests { fn test_malformed_posix_like_pattern_treated_as_literal() { let mut line = char_provider_from("[[x]yz]"); let lines = test_lines(); - let result = compile_character_class(&lines, &mut line).unwrap(); + let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[[x]"); } } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index d1caf285..1f736296 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -10,7 +10,7 @@ pub mod command; pub mod compiler; -pub mod delimited_compiler; +pub mod delimited_parser; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; From e08086dbdff9add0fc9926f5c816cdfb89c8d366 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 26 Apr 2025 21:00:18 +0300 Subject: [PATCH 06/16] Remove duplicate definition --- src/uu/sed/src/compiler.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 709a40b9..709a277e 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -19,9 +19,6 @@ use uucore::error::UResult; // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); -// A global, immutable map of command properties, initialized on first access -static CMD_MAP: Lazy> = Lazy::new(build_command_map); - // Types of command arguments recognized by the parser #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum CommandArgs { From ee0f9cb8184f0321299e353189a7962d479843d9 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 26 Apr 2025 22:59:48 +0300 Subject: [PATCH 07/16] Add parse_regex function --- src/uu/sed/src/delimited_parser.rs | 149 +++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index 0020c881..2225db4f 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -309,6 +309,60 @@ pub fn parse_character_class( compilation_error(lines, line, "Unterminated bracket expression") } +/// Parse the regular expression delimited by the current line +/// character and return it as a string. +/// On return the line is on the closing delimiter. +pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + // Sanity checks + if line.eol() { + return compilation_error(lines, line, "unexpected end of line".to_string()); + } + + let delimiter = line.current(); + if delimiter == '\\' { + return compilation_error(lines, line, "\\ cannot be used as a string delimiter"); + } + + line.advance(); // skip the opening delimiter + let mut result = String::new(); + + while !line.eol() { + match line.current() { + '[' if delimiter != '[' => { + let cc = parse_character_class(lines, line)?; + result.push_str(&cc); + continue; + } + '\\' => { + line.advance(); + if line.eol() { + return compilation_error(lines, line, "unterminated regular expression"); + } + if line.current() == delimiter { + // Push escaped delimiter + result.push(line.current()); + line.advance(); + continue; + } + match parse_char_escape(line) { + Some(decoded) => result.push(decoded), + None => { + // Pass through \ to RE engine for further treatment + result.push('\\'); + result.push(line.current()); + line.advance(); + } + } + continue; + } + c if c == delimiter => return Ok(result), + c => result.push(c), + } + line.advance(); + } + compilation_error(lines, line, "unterminated regular expression") +} + #[cfg(test)] mod tests { use super::*; @@ -660,4 +714,99 @@ mod tests { let result = parse_character_class(&lines, &mut line).unwrap(); assert_eq!(result, "[[x]"); } + + // parse_regex + fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { + let lines = ScriptLineProvider::new(vec![]); // Empty for tests + let line = ScriptCharProvider::new(input); + (lines, line) + } + + #[test] + fn test_simple_regex() { + let (lines, mut line) = make_providers("/abc/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "abc"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_escaped_delimiter() { + let (lines, mut line) = make_providers("/ab\\/c/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab/c"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_escape_sequence() { + let (lines, mut line) = make_providers("/ab\\n/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab\n"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn errors_on_unterminated_regex() { + let (lines, mut line) = make_providers("/unterminated"); + let err = parse_regex(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("unterminated regular expression")); + } + + #[test] + fn errors_on_backslash_delimiter() { + let (lines, mut line) = make_providers("\\bad"); + let err = parse_regex(&lines, &mut line).unwrap_err(); + assert!(err + .to_string() + .contains("\\ cannot be used as a string delimiter")); + } + + #[test] + fn test_regex_with_character_class() { + let (lines, mut line) = make_providers("/[a-z]/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "[a-z]"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_bracket_delimiter() { + let (lines, mut line) = make_providers("[abc["); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "abc"); + assert_eq!(line.current(), '['); + } + + #[test] + fn test_bracket_regex_with_bracket_delimiter() { + let (lines, mut line) = make_providers("[a\\[0-9]bc["); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "a[0-9]bc"); + assert_eq!(line.current(), '['); + } + + #[test] + fn test_regex_with_escaped_bracket_in_character_class() { + let (lines, mut line) = make_providers("/[a\\]z]/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "[a\\]z]"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_delimiter_inside_character_class() { + let (lines, mut line) = make_providers("/[a/c]/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "[a/c]"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_regex_with_escaped_paren_and_backslash() { + let (lines, mut line) = make_providers("/\\(\\\\/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, "\\(\\\\"); + assert_eq!(line.current(), '/'); + } } From a3956b8c0264c4cd1345b31bb89a0f35a5e3b37a Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 27 Apr 2025 09:49:28 +0300 Subject: [PATCH 08/16] Refactor scan_delimiter into separate function This can also be used by parse_transliteration --- src/uu/sed/src/delimited_parser.rs | 37 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index 2225db4f..43b2fb61 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -309,11 +309,10 @@ pub fn parse_character_class( compilation_error(lines, line, "Unterminated bracket expression") } -/// Parse the regular expression delimited by the current line -/// character and return it as a string. -/// On return the line is on the closing delimiter. -pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { - // Sanity checks +/// Scan and return the opening delimiter of a delimited string +/// Advances the line past the opening delimiter +fn scan_delimiter(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + // Sanity check if line.eol() { return compilation_error(lines, line, "unexpected end of line".to_string()); } @@ -322,8 +321,18 @@ pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> if delimiter == '\\' { return compilation_error(lines, line, "\\ cannot be used as a string delimiter"); } - line.advance(); // skip the opening delimiter + Ok(delimiter) +} + +/// Parse the regular expression delimited by the current line +/// character and return it as a string. +/// On return the line is on the closing delimiter. +pub fn parse_regex( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult { + let delimiter = scan_delimiter(lines, line)?; let mut result = String::new(); while !line.eol() { @@ -336,7 +345,11 @@ pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> '\\' => { line.advance(); if line.eol() { - return compilation_error(lines, line, "unterminated regular expression"); + return compilation_error( + lines, + line, + "unterminated regular expression", + ); } if line.current() == delimiter { // Push escaped delimiter @@ -360,7 +373,11 @@ pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> } line.advance(); } - compilation_error(lines, line, "unterminated regular expression") + compilation_error( + lines, + line, + "unterminated regular expression", + ) } #[cfg(test)] @@ -757,9 +774,7 @@ mod tests { fn errors_on_backslash_delimiter() { let (lines, mut line) = make_providers("\\bad"); let err = parse_regex(&lines, &mut line).unwrap_err(); - assert!(err - .to_string() - .contains("\\ cannot be used as a string delimiter")); + assert!(err.to_string().contains("\\ cannot be used as a string delimiter")); } #[test] From 619df93cc9dcf4df6150b0ca224f343546853a65 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 27 Apr 2025 11:01:24 +0300 Subject: [PATCH 09/16] Implement parse_transliteration To benefit from having recently worked on the regex code. --- src/uu/sed/src/delimited_parser.rs | 123 +++++++++++++++++++++++++---- 1 file changed, 107 insertions(+), 16 deletions(-) diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index 43b2fb61..f24e82b5 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -328,11 +328,8 @@ fn scan_delimiter(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> /// Parse the regular expression delimited by the current line /// character and return it as a string. /// On return the line is on the closing delimiter. -pub fn parse_regex( - lines: &ScriptLineProvider, - line: &mut ScriptCharProvider, -) -> UResult { - let delimiter = scan_delimiter(lines, line)?; +pub fn parse_regex(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + let delimiter = scan_delimiter(lines, line)?; let mut result = String::new(); while !line.eol() { @@ -345,11 +342,7 @@ pub fn parse_regex( '\\' => { line.advance(); if line.eol() { - return compilation_error( - lines, - line, - "unterminated regular expression", - ); + return compilation_error(lines, line, "unterminated regular expression"); } if line.current() == delimiter { // Push escaped delimiter @@ -373,11 +366,49 @@ pub fn parse_regex( } line.advance(); } - compilation_error( - lines, - line, - "unterminated regular expression", - ) + compilation_error(lines, line, "unterminated regular expression") +} + +/// Parse the transliteration string delimited by the current line +/// character and return it as a string. +/// On return the line is on the closing delimiter. +pub fn parse_transliteration( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult { + let delimiter = scan_delimiter(lines, line)?; + let mut result = String::new(); + + while !line.eol() { + match line.current() { + '\\' => { + line.advance(); + if line.eol() { + return compilation_error(lines, line, "unterminated transliteration string"); + } + if line.current() == delimiter || line.current() == '\\' { + // Push only the escaped character + result.push(line.current()); + line.advance(); + continue; + } + match parse_char_escape(line) { + Some(decoded) => result.push(decoded), + None => { + // Pass through \ to tr for literal use + result.push('\\'); + result.push(line.current()); + line.advance(); + } + } + continue; + } + c if c == delimiter => return Ok(result), + c => result.push(c), + } + line.advance(); + } + compilation_error(lines, line, "unterminated transliteration string") } #[cfg(test)] @@ -770,11 +801,20 @@ mod tests { assert!(err.to_string().contains("unterminated regular expression")); } + #[test] + fn errors_on_esc_at_re_eol() { + let (lines, mut line) = make_providers("/foo\\"); + let err = parse_regex(&lines, &mut line).unwrap_err(); + assert!(err.to_string().contains("unterminated regular expression")); + } + #[test] fn errors_on_backslash_delimiter() { let (lines, mut line) = make_providers("\\bad"); let err = parse_regex(&lines, &mut line).unwrap_err(); - assert!(err.to_string().contains("\\ cannot be used as a string delimiter")); + assert!(err + .to_string() + .contains("\\ cannot be used as a string delimiter")); } #[test] @@ -824,4 +864,55 @@ mod tests { assert_eq!(parsed, "\\(\\\\"); assert_eq!(line.current(), '/'); } + + // parse_transliteration + #[test] + fn test_simple_transliteration() { + let (lines, mut line) = make_providers("/abc/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "abc"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_transliteration_with_escaped_delimiter() { + let (lines, mut line) = make_providers("/ab\\/c/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab/c"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_transliteration_with_escaped_backslash() { + let (lines, mut line) = make_providers("/ab\\\\c/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab\\c"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn test_transliteration_with_escape_sequence() { + let (lines, mut line) = make_providers("/ab\\n/"); + let parsed = parse_transliteration(&lines, &mut line).unwrap(); + assert_eq!(parsed, "ab\n"); + assert_eq!(line.current(), '/'); + } + + #[test] + fn errors_on_unterminated_transliteration() { + let (lines, mut line) = make_providers("/unterminated"); + let err = parse_transliteration(&lines, &mut line).unwrap_err(); + assert!(err + .to_string() + .contains("unterminated transliteration string")); + } + + #[test] + fn errors_on_esc_at_tr_eol() { + let (lines, mut line) = make_providers("/foo\\"); + let err = parse_transliteration(&lines, &mut line).unwrap_err(); + assert!(err + .to_string() + .contains("unterminated transliteration string")); + } } From c3a8837f0eecd119c5661af4aee966c06cd38f35 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 27 Apr 2025 15:15:57 +0300 Subject: [PATCH 10/16] Show that make_providers is used by multiple tests --- src/uu/sed/src/delimited_parser.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index f24e82b5..ff62df25 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -195,7 +195,7 @@ fn parse_char_escape(line: &mut ScriptCharProvider) -> Option { /// This functionality is needed to avoid terminating delimited /// sequences when a delimiter appears within a character class. /// While at it, handle escaped characters for the sake of consistency. -pub fn parse_character_class( +fn parse_character_class( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, ) -> UResult { @@ -415,6 +415,12 @@ pub fn parse_transliteration( mod tests { use super::*; + fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { + let lines = ScriptLineProvider::new(vec![]); // Empty for tests + let line = ScriptCharProvider::new(input); + (lines, line) + } + // parse_numeric_escape #[test] fn test_compile_octal_escape() { @@ -764,12 +770,6 @@ mod tests { } // parse_regex - fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { - let lines = ScriptLineProvider::new(vec![]); // Empty for tests - let line = ScriptCharProvider::new(input); - (lines, line) - } - #[test] fn test_simple_regex() { let (lines, mut line) = make_providers("/abc/"); From 207fee2e0b3cb4cf9756ddc34bf2ea9b118f1b7d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 27 Apr 2025 15:16:30 +0300 Subject: [PATCH 11/16] Implement compile_addresses --- README.md | 4 + src/uu/sed/src/command.rs | 15 ++ src/uu/sed/src/compiler.rs | 489 +++++++++++++++++++++++++++++++++++-- 3 files changed, 487 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 637ab53d..5f42a768 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,10 @@ cargo run --release ## Extensions ### GNU * Command-line arguments can be specified in long (`--`) form. +* Spaces can precede a regular expression modifier. + +### BSD and GNU +* The second address in a range can be specified as a relative address with +N. ### Other * Unicode characters can be specified in regular expression pattern, replacement diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index a70f681f..8e8d132b 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -107,6 +107,21 @@ pub struct Command { pub next: Option>, // Pointer to next command } +impl Default for Command { + fn default() -> Self { + Command { + code: '_', + addr1: None, + addr2: None, + non_select: false, + start_line: Some(0), + text: None, + data: CommandData::None, + next: None, + } + } +} + #[derive(Debug)] pub enum CommandData { None, diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 709a277e..d80b2614 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -8,14 +8,21 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{CliOptions, Command, CommandData, ScriptValue}; -use crate::delimited_parser::compilation_error; +use crate::command::{Address, AddressType, AddressValue, CliOptions, Command, ScriptValue}; +use crate::delimited_parser::{compilation_error, parse_regex}; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use once_cell::sync::Lazy; +use regex::Regex; +use std::cell::RefCell; use std::collections::HashMap; use uucore::error::UResult; +thread_local! { + /// The previously saved RE. It is reused when specifying an empty one. + static SAVED_REGEX: RefCell> = const { RefCell::new(None) }; +} + // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); @@ -203,9 +210,9 @@ pub fn compile( scripts: Vec, cli_options: &mut CliOptions, ) -> UResult>> { - let mut line_provider = ScriptLineProvider::new(scripts); + let mut make_providers = ScriptLineProvider::new(scripts); - let result = compile_thread(&mut line_provider, cli_options)?; + let result = compile_thread(&mut make_providers, cli_options)?; // TODO: fix-up labels, check used labels, setup append & match structures Ok(result) } @@ -238,18 +245,8 @@ fn compile_thread( continue 'next_char; } - let mut cmd = Box::new(Command { - next: None, - addr1: None, - addr2: None, - start_line: Some(0), - text: None, - data: CommandData::None, - code: '_', - non_select: false, - }); - - let n_addr = compile_addresses(&mut line, &mut cmd); + let mut cmd = Box::new(Command::default()); + let n_addr = compile_addresses(lines, &mut line, &mut cmd)?; let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; if cmd_spec.args == CommandArgs::NonSelect { @@ -275,11 +272,150 @@ fn compile_thread( } } -// Compile a command's addresses into cmd. -// Return the number of addresses encountered. -fn compile_addresses(_line: &mut ScriptCharProvider, _cmd: &mut Command) -> usize { - // TODO: implement address parsing - 0 +/// Return true if c is a valid character for specifying a context address +fn is_address_char(c: char) -> bool { + matches!(c, '0'..='9' | '/' | '\\' | '$') +} + +/// Compile a command's optional address range into cmd. +/// Return the number of addresses encountered. +fn compile_addresses( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult { + let mut n_addr = 0; + + line.eat_spaces(); + if !line.eol() && is_address_char(line.current()) { + if let Ok(addr1) = compile_address(lines, line) { + cmd.addr1 = Some(addr1); + n_addr += 1; + } + } + + line.eat_spaces(); + if n_addr == 1 && !line.eol() && line.current() == ',' { + line.advance(); + line.eat_spaces(); + if !line.eol() { + if let Ok(addr2) = compile_address(lines, line) { + cmd.addr2 = Some(addr2); + n_addr += 1; + } + } + } + + Ok(n_addr) +} + +/// Compile and return a single range address specification. +fn compile_address(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult
{ + let mut icase = false; + + if line.eol() { + return compilation_error(lines, line, "expected context address"); + } + + match line.current() { + '\\' | '/' => { + // Regular expression + if line.current() == '\\' { + // The next character is an arbitrary delimiter + line.advance(); + } + let re = parse_regex(lines, line)?; + // Skip over delimiter + line.advance(); + + line.eat_spaces(); + if !line.eol() && line.current() == 'I' { + icase = true; + line.advance(); + } + + Ok(Address { + atype: AddressType::Re, + value: AddressValue::Regex(compile_regex(lines, line, &re, icase)?), + }) + } + '$' => { + line.advance(); + Ok(Address { + atype: AddressType::Last, + value: AddressValue::LineNumber(0), + }) + } + '+' => { + line.advance(); + let number = parse_number(lines, line)?; + Ok(Address { + atype: AddressType::RelLine, + value: AddressValue::LineNumber(number), + }) + } + c if c.is_ascii_digit() => { + let number = parse_number(lines, line)?; + Ok(Address { + atype: AddressType::Line, + value: AddressValue::LineNumber(number), + }) + } + _ => panic!("invalid context address"), + } +} + +/// Parse and return the decimal number at the current line position. +/// Advance the line to first non-digit or EOL. +fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult { + let mut num_str = String::new(); + + while !line.eol() && line.current().is_ascii_digit() { + num_str.push(line.current()); + line.advance(); + } + + num_str + .parse::() + .map_err(|_| format!("invalid number '{}'", num_str)) + .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) +} + +/// Compile the provided regular expression string into a corresponding engine. +fn compile_regex( + lines: &ScriptLineProvider, + line: &ScriptCharProvider, + pattern: &str, + icase: bool, +) -> UResult { + if pattern.is_empty() { + SAVED_REGEX.with(|cell| { + if let Some(existing) = &*cell.borrow() { + Ok(existing.clone()) + } else { + compilation_error(lines, line, "no previously compiled regex available") + } + }) + } else { + let full_pattern = if icase { + if pattern.is_empty() { + return compilation_error(lines, line, "cannot specify a modifier on an empty RE"); + } + format!("(?i){}", pattern) + } else { + pattern.to_string() + }; + + let compiled = Regex::new(&full_pattern).map_err(|e| { + compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) + .unwrap_err() + })?; + + SAVED_REGEX.with(|cell| { + *cell.borrow_mut() = Some(compiled.clone()); + }); + Ok(compiled) + } } // Compile the specified command @@ -379,6 +515,12 @@ fn lookup_command(cmd: char) -> Option<&'static CommandSpec> { mod tests { use super::*; + fn make_providers(input: &str) -> (ScriptLineProvider, ScriptCharProvider) { + let lines = ScriptLineProvider::new(vec![]); // Empty for tests + let line = ScriptCharProvider::new(input); + (lines, line) + } + // lookup_command #[test] fn test_lookup_empty_command() { @@ -559,4 +701,309 @@ mod tests { let spec = result.unwrap(); assert_eq!(spec.code, 'a'); } + + // parse_number + #[test] + fn test_parse_number_basic() { + let (lines, mut chars) = make_providers("123abc"); + assert_eq!(parse_number(&lines, &mut chars).unwrap(), 123); + assert_eq!(chars.current(), 'a'); // Should stop at first non-digit + } + + #[test] + fn test_parse_number_invalid() { + let (lines, mut chars) = make_providers("abc"); + assert!(parse_number(&lines, &mut chars).is_err()); + } + + // compile_re + fn dummy_providers() -> (ScriptLineProvider, ScriptCharProvider) { + make_providers("dummy input") + } + + #[test] + fn test_compile_re_basic() { + let (lines, chars) = dummy_providers(); + let regex = compile_regex(&lines, &chars, "abc", false).unwrap(); + assert!(regex.is_match("abc")); + assert!(!regex.is_match("ABC")); + } + + #[test] + fn test_compile_re_case_insensitive() { + let (lines, chars) = dummy_providers(); + let regex = compile_regex(&lines, &chars, "abc", true).unwrap(); + assert!(regex.is_match("abc")); + assert!(regex.is_match("ABC")); + assert!(regex.is_match("AbC")); + } + + #[test] + fn test_compile_re_saved_and_reuse() { + // Save a regex + let (lines1, chars1) = dummy_providers(); + let _ = compile_regex(&lines1, &chars1, "abc", false).unwrap(); + + // Now try to reuse it + let (lines2, chars2) = dummy_providers(); + let reused = compile_regex(&lines2, &chars2, "", false).unwrap(); + + assert!(reused.is_match("abc")); + } + + #[test] + fn test_compile_re_empty_and_not_saved() { + // Clear saved regex + SAVED_REGEX.with(|cell| { + *cell.borrow_mut() = None; + }); + + let (lines, chars) = dummy_providers(); + let result = compile_regex(&lines, &chars, "", false); + assert!(result.is_err()); // Should fail because nothing was saved + } + + #[test] + fn test_compile_re_invalid() { + let (lines, chars) = dummy_providers(); + let result = compile_regex(&lines, &chars, "a[d", false); + assert!(result.is_err()); // Should fail due to open bracketed expression + } + + // compile_address + #[test] + fn test_compile_addr_line_number() { + let (lines, mut chars) = make_providers("42"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Line)); + if let AddressValue::LineNumber(n) = addr.value { + assert_eq!(n, 42); + } else { + panic!("expected LineNumber address value"); + } + } + + #[test] + fn test_compile_addr_relative_line() { + let (lines, mut chars) = make_providers("+7"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::RelLine)); + if let AddressValue::LineNumber(n) = addr.value { + assert_eq!(n, 7); + } else { + panic!("expected LineNumber address value"); + } + } + + #[test] + fn test_compile_addr_last_line() { + let (lines, mut chars) = make_providers("$"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Last)); + } + + #[test] + fn test_compile_addr_regex() { + let (lines, mut chars) = make_providers("/hello/"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("hello")); + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_regex_other_delimiter() { + let (lines, mut chars) = make_providers("\\#hello#"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("hello")); + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_regex_with_modifier() { + let (lines, mut chars) = make_providers("/hello/I"); + let addr = compile_address(&lines, &mut chars).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("HELLO")); // case-insensitive + } else { + panic!("expected Regex address value"); + } + } + + #[test] + fn test_compile_addr_empty_regex_saved() { + // First save a regex + let (lines1, mut chars1) = make_providers("/saved/"); + let _ = compile_address(&lines1, &mut chars1).unwrap(); + + // Then reuse it with empty regex + let (lines2, mut chars2) = make_providers("//"); + let addr = compile_address(&lines2, &mut chars2).unwrap(); + assert!(matches!(addr.atype, AddressType::Re)); + if let AddressValue::Regex(re) = addr.value { + assert!(re.is_match("saved")); + } else { + panic!("expected Regex address value"); + } + } + + // compile_addresses + #[test] + fn test_compile_single_line_address() { + let (lines, mut chars) = make_providers("42"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Line + )); + } + + #[test] + fn test_compile_relative_address_range() { + let (lines, mut chars) = make_providers("2,+3"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 2); + + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Line + )); + let v1 = match &cmd.addr1.as_ref().unwrap().value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(v1, 2); + + assert!(matches!( + cmd.addr2.as_ref().unwrap().atype, + AddressType::RelLine + )); + let v2 = match &cmd.addr2.as_ref().unwrap().value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(v2, 3); + } + + #[test] + fn test_compile_last_address() { + let (lines, mut chars) = make_providers("$"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Last + )); + } + + #[test] + fn test_compile_absolute_address_range() { + let (lines, mut chars) = make_providers("5,10"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 2); + assert!(matches!( + cmd.addr1.as_ref().unwrap().atype, + AddressType::Line + )); + assert!(matches!( + cmd.addr2.as_ref().unwrap().atype, + AddressType::Line + )); + } + + #[test] + fn test_compile_regex_address() { + let (lines, mut chars) = make_providers("/foo/"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(re.is_match("foo")); + assert!(!re.is_match("bar")); + } else { + panic!("expected a regex address"); + } + } + + #[test] + fn test_compile_regex_address_range_other_delimiter() { + let (lines, mut chars) = make_providers("\\#foo# , \\|bar|"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 2); + + assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(re.is_match("foo")); + assert!(!re.is_match("bar")); + } else { + panic!("expected a regex address"); + } + + assert!(matches!(cmd.addr2.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr2.as_ref().unwrap().value { + assert!(re.is_match("bar")); + assert!(!re.is_match("foo")); + } else { + panic!("expected a regex address"); + } + } + + #[test] + fn test_compile_regex_with_modifier() { + let (lines, mut chars) = make_providers("/foo/I"); + let mut cmd = Command::default(); + let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); + if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(re.is_match("FOO")); + assert!(re.is_match("foo")); + } else { + panic!("expected a regex address with case-insensitive match"); + } + } + + #[test] + fn test_compile_re_reuse_saved() { + // First save a regex + let (lines1, mut chars1) = make_providers("/abc/"); + let mut cmd1 = Command::default(); + compile_addresses(&lines1, &mut chars1, &mut cmd1).unwrap(); + + // Now reuse it + let (lines2, mut chars2) = make_providers("//"); + let mut cmd2 = Command::default(); + let n_addr = compile_addresses(&lines2, &mut chars2, &mut cmd2).unwrap(); + + assert_eq!(n_addr, 1); + assert!(matches!( + cmd2.addr1.as_ref().unwrap().atype, + AddressType::Re + )); + if let AddressValue::Regex(re) = &cmd2.addr1.as_ref().unwrap().value { + assert!(re.is_match("abc")); + } + } } From 70d6ef0144bab7fc16e6277b6903199d93f5503a Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Mon, 28 Apr 2025 23:11:25 +0300 Subject: [PATCH 12/16] Improve function name --- src/uu/sed/src/compiler.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index d80b2614..b9aa83c9 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -246,7 +246,7 @@ fn compile_thread( } let mut cmd = Box::new(Command::default()); - let n_addr = compile_addresses(lines, &mut line, &mut cmd)?; + let n_addr = compile_address_range(lines, &mut line, &mut cmd)?; let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; if cmd_spec.args == CommandArgs::NonSelect { @@ -279,7 +279,7 @@ fn is_address_char(c: char) -> bool { /// Compile a command's optional address range into cmd. /// Return the number of addresses encountered. -fn compile_addresses( +fn compile_address_range( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, @@ -855,12 +855,12 @@ mod tests { } } - // compile_addresses + // compile_address_range #[test] fn test_compile_single_line_address() { let (lines, mut chars) = make_providers("42"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -873,7 +873,7 @@ mod tests { fn test_compile_relative_address_range() { let (lines, mut chars) = make_providers("2,+3"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 2); @@ -902,7 +902,7 @@ mod tests { fn test_compile_last_address() { let (lines, mut chars) = make_providers("$"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -915,7 +915,7 @@ mod tests { fn test_compile_absolute_address_range() { let (lines, mut chars) = make_providers("5,10"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 2); assert!(matches!( @@ -932,7 +932,7 @@ mod tests { fn test_compile_regex_address() { let (lines, mut chars) = make_providers("/foo/"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); @@ -948,7 +948,7 @@ mod tests { fn test_compile_regex_address_range_other_delimiter() { let (lines, mut chars) = make_providers("\\#foo# , \\|bar|"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 2); @@ -973,7 +973,7 @@ mod tests { fn test_compile_regex_with_modifier() { let (lines, mut chars) = make_providers("/foo/I"); let mut cmd = Command::default(); - let n_addr = compile_addresses(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); @@ -990,12 +990,12 @@ mod tests { // First save a regex let (lines1, mut chars1) = make_providers("/abc/"); let mut cmd1 = Command::default(); - compile_addresses(&lines1, &mut chars1, &mut cmd1).unwrap(); + compile_address_range(&lines1, &mut chars1, &mut cmd1).unwrap(); // Now reuse it let (lines2, mut chars2) = make_providers("//"); let mut cmd2 = Command::default(); - let n_addr = compile_addresses(&lines2, &mut chars2, &mut cmd2).unwrap(); + let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( From 118d28a98ba0c967fa6611ac01aa20b38200d13c Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 1 May 2025 23:41:00 +0300 Subject: [PATCH 13/16] Add unit tests for compile_thread --- src/uu/sed/src/command.rs | 2 +- src/uu/sed/src/compiler.rs | 81 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 8e8d132b..53761d85 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -18,7 +18,7 @@ use std::path::PathBuf; // For file descriptors and equivalent // Compilation and processing options provided mostly through the // command-line interface -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CliOptions { // Command-line flags with corresponding names pub all_output_files: bool, diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index b9aa83c9..c45a1ad4 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -1006,4 +1006,85 @@ mod tests { assert!(re.is_match("abc")); } } + + // compile_thread + fn make_provider(lines: &[&str]) -> ScriptLineProvider { + let input = lines + .iter() + .map(|s| ScriptValue::StringVal(s.to_string())) + .collect(); + ScriptLineProvider::new(input) + } + + fn make_cli_options() -> CliOptions { + CliOptions::default() + } + + #[test] + fn test_compile_thread_empty_input() { + let mut provider = make_provider(&[]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_compile_thread_comment_only() { + let mut provider = make_provider(&["# comment", " ", ";;"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_compile_thread_single_command() { + let mut provider = make_provider(&["42q"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + let cmd = result.unwrap(); + + assert_eq!(cmd.code, 'q'); + + let addr = cmd.addr1.as_ref().expect("addr1 should be set"); + assert!(matches!(addr.atype, AddressType::Line)); + + let value = match &addr.value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(value, 42); + + assert!(cmd.next.is_none()); + } + + #[test] + fn test_compile_thread_multiple_lines() { + let mut provider = make_provider(&["1q", "2d"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + let first = result.unwrap(); + + assert_eq!(first.code, 'q'); + let second = first.next.unwrap(); + assert_eq!(second.code, 'd'); + assert!(second.next.is_none()); + } + + #[test] + fn test_compile_thread_single_line_multiple_commands() { + let mut provider = make_provider(&["1q;2d"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + let first = result.unwrap(); + + assert_eq!(first.code, 'q'); + let second = first.next.unwrap(); + assert_eq!(second.code, 'd'); + assert!(second.next.is_none()); + } } From 780a0550a5d61750b8e6452481b06d00df3c2357 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 2 May 2025 08:36:55 +0300 Subject: [PATCH 14/16] Add test for non selected lines --- src/uu/sed/src/compiler.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index c45a1ad4..99695093 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -249,6 +249,7 @@ fn compile_thread( let n_addr = compile_address_range(lines, &mut line, &mut cmd)?; let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; + // The ! command shall be followed by another one if cmd_spec.args == CommandArgs::NonSelect { line.advance(); line.eat_spaces(); @@ -445,11 +446,11 @@ fn compile_command( ); } } + CommandArgs::NonSelect => { // ! + } // TODO CommandArgs::Text => { // a c i } - CommandArgs::NonSelect => { // ! - } CommandArgs::Group => { // { } CommandArgs::EndGroup => { // } @@ -1047,6 +1048,30 @@ mod tests { let cmd = result.unwrap(); assert_eq!(cmd.code, 'q'); + assert!(!cmd.non_select); + + let addr = cmd.addr1.as_ref().expect("addr1 should be set"); + assert!(matches!(addr.atype, AddressType::Line)); + + let value = match &addr.value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(value, 42); + + assert!(cmd.next.is_none()); + } + + #[test] + fn test_compile_thread_non_selected_single_command() { + let mut provider = make_provider(&["42!p"]); + let mut opts = make_cli_options(); + + let result = compile_thread(&mut provider, &mut opts).unwrap(); + let cmd = result.unwrap(); + + assert_eq!(cmd.code, 'p'); + assert!(cmd.non_select); let addr = cmd.addr1.as_ref().expect("addr1 should be set"); assert!(matches!(addr.atype, AddressType::Line)); From 76d67ed5840eecc0d2c4a391f81fcd8f30c723a8 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 2 May 2025 08:41:31 +0300 Subject: [PATCH 15/16] Add unit test for compile This is mostly a placeholder for more unit tests as we add functionality to it. --- src/uu/sed/src/compiler.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 99695093..341a8d17 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -1112,4 +1112,27 @@ mod tests { assert_eq!(second.code, 'd'); assert!(second.next.is_none()); } + + // compile + #[test] + fn test_compile_single_command() { + let scripts = vec![ScriptValue::StringVal("1q".to_string())]; + let mut opts = CliOptions::default(); + + let result = compile(scripts, &mut opts).unwrap(); + let cmd = result.unwrap(); + + assert_eq!(cmd.code, 'q'); + + let addr = cmd.addr1.as_ref().unwrap(); + assert!(matches!(addr.atype, AddressType::Line)); + + let line = match &addr.value { + AddressValue::LineNumber(n) => *n, + _ => panic!(), + }; + assert_eq!(line, 1); + + assert!(cmd.next.is_none()); + } } From e791a44f2d7d9ebb7e7bc72d0ea2f5f3def3c811 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 3 May 2025 12:47:06 +0300 Subject: [PATCH 16/16] Address code review comments --- src/uu/sed/src/delimited_parser.rs | 5 +---- src/uu/sed/src/script_char_provider.rs | 6 +----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index ff62df25..ff71653c 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -90,10 +90,7 @@ fn create_control_char(x: char) -> Option { return None; } - let mut c = x; - if c.is_ascii_lowercase() { - c = c.to_ascii_uppercase(); - } + let c = x.to_ascii_uppercase(); let transformed = (c as u8) ^ 0x40; char::from_u32(transformed as u32) diff --git a/src/uu/sed/src/script_char_provider.rs b/src/uu/sed/src/script_char_provider.rs index bb537f3c..52b54e01 100644 --- a/src/uu/sed/src/script_char_provider.rs +++ b/src/uu/sed/src/script_char_provider.rs @@ -30,11 +30,7 @@ impl ScriptCharProvider { /// Retreats current position by specified number or to beginning. pub fn retreat(&mut self, n: usize) { - if n > self.pos { - self.pos = 0; - } else { - self.pos -= n; - } + self.pos = self.pos.saturating_sub(n); } /// Returns the current character. Panics if out of bounds.