diff --git a/Cargo.lock b/Cargo.lock index 1ca9a621..74b316fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -853,7 +853,9 @@ name = "uu_sed" version = "0.0.1" dependencies = [ "clap", + "once_cell", "regex", + "tempfile", "uucore", ] diff --git a/Cargo.toml b/Cargo.toml index 240d9acc..a5a895bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ clap_mangen = "0.2" regex = "1.10.4" sysinfo = "0.34" libc = "0.2.153" +once_cell = "1.21" phf = "0.11.2" phf_codegen = "0.11.2" textwrap = { version = "0.16.1", features = ["terminal_size"] } diff --git a/LICENSE b/LICENSE index 6bd7307b..c66459ce 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 uutils +Copyright (c) 2025 Diomidis Spinellis Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index a62bf28f..32dd9fb7 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ # sed -Rust reimplementation of the [sed utility](https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html). +Rust reimplementation of the [sed utility](https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html) +with some [GNU sed](https://www.gnu.org/software/sed/manual/sed.html) +and [FreeBSD sed](https://man.freebsd.org/cgi/man.cgi?sed(1)) extensions. ## Installation diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 36c6cdca..4a095456 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -15,7 +15,9 @@ categories = ["command-line-utilities"] [dependencies] uucore = { workspace = true } clap = { workspace = true } +once_cell = { workspace = true } regex = { workspace = true } +tempfile = { workspace = true } [lib] path = "src/sed.rs" diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index c395c8fb..a70f681f 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -1,7 +1,10 @@ // Definitions for the compiled code data structures // -// This file is part of the uutils sed package. +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis // +// This file is part of the uutils sed package. +// It is licensed under the MIT License. // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. @@ -13,6 +16,26 @@ use std::collections::HashMap; use std::fs::File; use std::path::PathBuf; // For file descriptors and equivalent +// Compilation and processing options provided mostly through the +// command-line interface +#[derive(Debug)] +pub struct CliOptions { + // Command-line flags with corresponding names + pub all_output_files: bool, + pub debug: bool, + pub regexp_extended: bool, + pub follow_symlinks: bool, + pub in_place: bool, + pub in_place_suffix: Option, + pub length: usize, + pub quiet: bool, + pub posix: bool, + pub separate: bool, + pub sandbox: bool, + pub unbuffered: bool, + pub null_data: bool, +} + // The specification of a script: through a string or a file #[derive(Debug, PartialEq)] pub enum ScriptValue { @@ -24,7 +47,7 @@ pub enum ScriptValue { * Types of address specifications */ #[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum AddressType { +pub enum AddressType { Re, // Line that matches regex Line, // Specific line RelLine, // Relative line @@ -35,14 +58,14 @@ enum AddressType { * Format of an address */ #[derive(Debug)] -struct Address { - atype: AddressType, // Address type - value: AddressValue, // Line number or regex +pub struct Address { + pub atype: AddressType, // Address type + pub value: AddressValue, // Line number or regex } #[derive(Debug)] -enum AddressValue { - LineNumber(u64), +pub enum AddressValue { + LineNumber(usize), Regex(Regex), } @@ -50,25 +73,23 @@ enum AddressValue { * Substitution command */ #[derive(Debug)] -struct Substitution { - occurrence: usize, // Which occurrence to substitute - print_flag: bool, // True if 'p' flag - ignore_case: bool, // True if 'I' flag - write_file: Option, // Path to file if 'w' flag is used - file_descriptor: Option, // Cached file descriptor - regex: Regex, // Regular expression - max_backref: u32, // Largest backreference - line_number: u64, // Line number - replacement: String, // Replacement text +pub struct Substitution { + pub occurrence: usize, // Which occurrence to substitute + pub print_flag: bool, // True if 'p' flag + pub ignore_case: bool, // True if 'I' flag + pub write_file: Option, // Path to file if 'w' flag is used + pub file_descriptor: Option, // Cached file descriptor + pub regex: Regex, // Regular expression + pub max_backref: u32, // Largest backreference + pub line_number: usize, // Line number + pub replacement: String, // Replacement text } -/* - * Translate command. - */ +// Transliteration command (y) #[derive(Debug)] -struct TranslateCommand { - byte_table: [u8; 256], // Byte translation table - multi_map: HashMap, // Direct mapping from one char to another +pub struct Transliteration { + pub byte_table: [u8; 256], // Byte translation table + pub multi_map: HashMap, // Direct mapping from one char to another } /* @@ -76,54 +97,36 @@ struct TranslateCommand { */ #[derive(Debug)] pub struct Command { - next: Option>, // Pointer to next command - addr1: Option
, // Start address - addr2: Option
, // End address - start_line: Option, // Start line number (or None) - text: Option, // Text for ':', 'a', 'c', 'i', 'r', 'w' - data: CommandData, // Union equivalent - code: char, // Command code - non_select: bool, // True if '!' + pub code: char, // Command code + pub addr1: Option
, // Start address + pub addr2: Option
, // End address + pub non_select: bool, // True if '!' + pub start_line: Option, // Start line number (or None) + pub text: Option, // Text for ':', 'a', 'c', 'i', 'r', 'w' + pub data: CommandData, // Command-specific data + pub next: Option>, // Pointer to next command } #[derive(Debug)] -enum CommandData { - SubCommands(Vec), // Commands for 'b', 't', '{' - Substitution(Box), // Substitute command - Translate(Box), // Replace command array - WriteFileDescriptor(File), // File descriptor for 'w' -} - -/* - * Types of command arguments recognized by the parser - */ -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum CommandArgs { - Empty, // d D g G h H l n N p P q x = \0 - Text, // a c i - NonSelect, // ! - Group, // { - EndGroup, // } - Comment, // # - Branch, // b t - Label, // : - ReadFile, // r - WriteFile, // w - Substitute, // s - Translate, // y +pub enum CommandData { + None, + SubCommands(Vec), // Commands for 'b', 't', '{' + Substitution(Box), // Substitute command 's' + Transliteration(Box), // Transliteration command 'y' + WriteFileDescriptor(File), // File descriptor for 'w' } /* * Structure containing things to append before a line is read */ #[derive(Debug)] -struct AppendBuffer { +pub struct AppendBuffer { append_type: AppendType, content: String, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum AppendType { +pub enum AppendType { String, File, } @@ -132,7 +135,7 @@ enum AppendType { * Special flag for space modifications */ #[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum SpaceFlag { +pub enum SpaceFlag { Append, // Append to contents Replace, // Replace contents } @@ -141,9 +144,9 @@ enum SpaceFlag { * Structure for a processing space (process, hold, otherwise). */ #[derive(Debug)] -struct Space { - current: String, // Current space content - deleted: bool, // Whether content was deleted - append_newline: bool, // Whether originally terminated by \n - backup: String, // Backing memory +pub struct Space { + pub current: String, // Current space content + pub deleted: bool, // Whether content was deleted + pub append_newline: bool, // Whether originally terminated by \n + pub backup: String, // Backing memory } diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 590da110..5d016299 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -1,14 +1,579 @@ // Compile the scripts into the internal representation of commands // -// This file is part of the uutils sed package. +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis // +// This file is part of the uutils sed package. +// It is licensed under the MIT License. // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{Command, ScriptValue}; -use uucore::error::UResult; +use crate::command::{CliOptions, Command, CommandData, ScriptValue}; +use crate::script_char_provider::ScriptCharProvider; +use crate::script_line_provider::ScriptLineProvider; +use once_cell::sync::Lazy; +use std::collections::HashMap; +use uucore::error::{UResult, USimpleError}; + +// A global, immutable map of command properties, initialized on first access +static CMD_MAP: Lazy> = Lazy::new(build_command_map); + +// Types of command arguments recognized by the parser +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum CommandArgs { + Empty, // d D g G h H l n N p P q x = \0 + Text, // a c i + NonSelect, // ! + Group, // { + EndGroup, // } + Comment, // # + Branch, // b t + Label, // : + ReadFile, // r + WriteFile, // w + Substitute, // s + Translate, // y +} + +// Command specification +#[derive(Debug, Clone, Copy)] +struct CommandSpec { + code: char, // Command letter used by sed + n_addr: usize, // Number of supported addresses + args: CommandArgs, // Type of command arguments +} + +// Build the command specification map (char -> CommandSpec) +fn build_command_map() -> HashMap { + let formats = [ + CommandSpec { + code: '{', + n_addr: 2, + args: CommandArgs::Group, + }, + CommandSpec { + code: '}', + n_addr: 0, + args: CommandArgs::EndGroup, + }, + CommandSpec { + code: 'a', + n_addr: 1, + args: CommandArgs::Text, + }, + CommandSpec { + code: 'b', + n_addr: 2, + args: CommandArgs::Branch, + }, + CommandSpec { + code: 'c', + n_addr: 2, + args: CommandArgs::Text, + }, + CommandSpec { + code: 'd', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'D', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'g', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'G', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'h', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'H', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'i', + n_addr: 1, + args: CommandArgs::Text, + }, + CommandSpec { + code: 'l', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'n', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'N', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'p', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'P', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'q', + n_addr: 1, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'r', + n_addr: 1, + args: CommandArgs::ReadFile, + }, + CommandSpec { + code: 's', + n_addr: 2, + args: CommandArgs::Substitute, + }, + CommandSpec { + code: 't', + n_addr: 2, + args: CommandArgs::Branch, + }, + CommandSpec { + code: 'w', + n_addr: 2, + args: CommandArgs::WriteFile, + }, + CommandSpec { + code: 'x', + n_addr: 2, + args: CommandArgs::Empty, + }, + CommandSpec { + code: 'y', + n_addr: 2, + args: CommandArgs::Translate, + }, + CommandSpec { + code: '!', + n_addr: 2, + args: CommandArgs::NonSelect, + }, + CommandSpec { + code: ':', + n_addr: 0, + args: CommandArgs::Label, + }, + CommandSpec { + code: '#', + n_addr: 0, + args: CommandArgs::Comment, + }, + CommandSpec { + code: '=', + n_addr: 1, + args: CommandArgs::Empty, + }, + ]; + + formats.into_iter().map(|f| (f.code, f)).collect() +} + +// How to continue after processing a command +#[derive(Debug)] +enum ContinueAction { + NextLine, + NextChar, +} + +pub fn compile( + scripts: Vec, + cli_options: &mut CliOptions, +) -> UResult>> { + let mut line_provider = ScriptLineProvider::new(scripts); + + let result = compile_thread(&mut line_provider, cli_options)?; + // TODO: fix-up labels, check used labels, setup append & match structures + Ok(result) +} + +// Compile provided scripts into a thread of commands +fn compile_thread( + lines: &mut ScriptLineProvider, + _cli_options: &mut CliOptions, +) -> UResult>> { + let mut head: Option> = None; + // A mutable reference to the place we’ll insert next + let mut next_p = &mut head; + + 'next_line: loop { + match lines.next_line().unwrap() { + None => { + // TODO: Error if stack isn't empty + return Ok(head); + } + Some(line_string) => { + let mut line = ScriptCharProvider::new(&line_string); + + // TODO: set cli_options.quiet for StringVal starting with #n + 'next_char: loop { + line.eat_spaces(); + if line.eol() || line.current() == '#' { + continue 'next_line; + } else if line.current() == ';' { + line.advance(); + continue 'next_char; + } + + let mut cmd = Box::new(Command { + next: None, + addr1: None, + addr2: None, + start_line: Some(0), + text: None, + data: CommandData::None, + code: '_', + non_select: false, + }); + + let n_addr = compile_addresses(&mut line, &mut cmd); + let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; + + if cmd_spec.args == CommandArgs::NonSelect { + line.advance(); + line.eat_spaces(); + cmd.non_select = true; + cmd_spec = get_cmd_spec(lines, &line, n_addr)?; + } + + // Move cmd into next_p, transferring its ownership + let action = compile_command(lines, &mut line, &mut cmd, cmd_spec)?; + + *next_p = Some(cmd); + next_p = &mut next_p.as_mut().unwrap().next; + + match action { + ContinueAction::NextLine => continue 'next_line, + ContinueAction::NextChar => continue 'next_char, + } + } + } + } + } +} + +// Compile a command's addresses into cmd. +// Return the number of addresses encountered. +fn compile_addresses(_line: &mut ScriptCharProvider, _cmd: &mut Command) -> usize { + // TODO: implement address parsing + 0 +} + +// Compile the specified command +fn compile_command( + lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, + cmd_spec: &'static CommandSpec, +) -> UResult { + cmd.code = line.current(); + + match cmd_spec.args { + CommandArgs::Empty => { + // d D g G h H l n N p P q x = + line.advance(); + line.eat_spaces(); + if !line.eol() && line.current() == ';' { + line.advance(); + // TODO: update link + return Ok(ContinueAction::NextChar); + } + if !line.eol() { + return compile_error( + lines, + line, + format!("extra characters at the end of the {} command", cmd.code), + ); + } + } + // TODO + CommandArgs::Text => { // a c i + } + CommandArgs::NonSelect => { // ! + } + CommandArgs::Group => { // { + } + CommandArgs::EndGroup => { // } + } + CommandArgs::Comment => { // # + } + CommandArgs::Branch => { // b t + } + CommandArgs::Label => { // : + } + CommandArgs::ReadFile => { // r + } + CommandArgs::WriteFile => { // w + } + CommandArgs::Substitute => { // s + } + CommandArgs::Translate => { // y + } + } + + Ok(ContinueAction::NextLine) +} + +// Fail with msg as a compile error at the current location +fn compile_error( + lines: &ScriptLineProvider, + line: &ScriptCharProvider, + msg: impl ToString, +) -> UResult { + Err(USimpleError::new( + 1, + format!( + "{}:{}:{}: error: {}", + lines.get_input_name(), + lines.get_line_number(), + line.get_pos(), + msg.to_string() + ), + )) +} + +// Return the specification for the command letter at the current line position +// checking for diverse errors. +fn get_cmd_spec( + lines: &ScriptLineProvider, + line: &ScriptCharProvider, + n_addr: usize, +) -> UResult<&'static CommandSpec> { + if line.eol() { + return compile_error(lines, line, "command expected"); + } + + let ch = line.current(); + let opt_cmd_spec = lookup_command(ch); + + if opt_cmd_spec.is_none() { + return compile_error(lines, line, format!("invalid command code {}", ch)); + } + + let cmd_spec = opt_cmd_spec.unwrap(); + if n_addr > cmd_spec.n_addr { + return compile_error( + lines, + line, + format!( + "command {} expects up to {} address(es), found {}", + ch, cmd_spec.n_addr, n_addr + ), + ); + } + + Ok(cmd_spec) +} + +// Look up a command format by its command code. +fn lookup_command(cmd: char) -> Option<&'static CommandSpec> { + CMD_MAP.get(&cmd) +} + +#[cfg(test)] +mod tests { + use super::*; + + // lookup_command + #[test] + fn test_lookup_empty_command() { + let cmd = lookup_command('d').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::Empty); + } + + #[test] + fn test_lookup_text_command() { + let cmd = lookup_command('a').unwrap(); + assert_eq!(cmd.n_addr, 1); + assert_eq!(cmd.args, CommandArgs::Text); + } + + #[test] + fn test_lookup_nonselect_command() { + let cmd = lookup_command('!').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::NonSelect); + } + + #[test] + fn test_lookup_group_command() { + let cmd = lookup_command('{').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::Group); + } + + #[test] + fn test_lookup_endgroup_command() { + let cmd = lookup_command('}').unwrap(); + assert_eq!(cmd.n_addr, 0); + assert_eq!(cmd.args, CommandArgs::EndGroup); + } + + #[test] + fn test_lookup_comment_command() { + let cmd = lookup_command('#').unwrap(); + assert_eq!(cmd.n_addr, 0); + assert_eq!(cmd.args, CommandArgs::Comment); + } + + #[test] + fn test_lookup_branch_command() { + let cmd = lookup_command('b').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::Branch); + } + + #[test] + fn test_lookup_label_command() { + let cmd = lookup_command(':').unwrap(); + assert_eq!(cmd.n_addr, 0); + assert_eq!(cmd.args, CommandArgs::Label); + } + + #[test] + fn test_lookup_readfile_command() { + let cmd = lookup_command('r').unwrap(); + assert_eq!(cmd.n_addr, 1); + assert_eq!(cmd.args, CommandArgs::ReadFile); + } + + #[test] + fn test_lookup_writefile_command() { + let cmd = lookup_command('w').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::WriteFile); + } + + #[test] + fn test_lookup_substitute_command() { + let cmd = lookup_command('s').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::Substitute); + } + + #[test] + fn test_lookup_translate_command() { + let cmd = lookup_command('y').unwrap(); + assert_eq!(cmd.n_addr, 2); + assert_eq!(cmd.args, CommandArgs::Translate); + } + + #[test] + fn test_lookup_invalid_command() { + let result = lookup_command('Z'); + assert!(result.is_none()); + } + + // Utility to create a ScriptCharProvider from a &str + fn char_provider_from(s: &str) -> ScriptCharProvider { + ScriptCharProvider::new(s) + } + + // compile_error + #[test] + fn test_compile_error_message_format() { + let lines = ScriptLineProvider::with_active_state("test.sed", 42); + let mut line = char_provider_from("whatever"); + line.advance(); // move to position 1 + line.advance(); // move to position 2 + line.advance(); // move to position 3 + line.advance(); // now at position 4 + + let msg = "unexpected token"; + let result: UResult<()> = compile_error(&lines, &line, msg); + + assert!(result.is_err()); + + let err = result.unwrap_err(); + let msg = err.to_string(); + + assert!(msg.contains("test.sed:42:4: error: unexpected token")); + } + + #[test] + fn test_compile_error_with_format_message() { + let lines = ScriptLineProvider::with_active_state("input.txt", 3); + let line = char_provider_from("x"); + // We're at position 0 + + let result: UResult<()> = + compile_error(&lines, &line, format!("invalid command '{}'", 'x')); + + assert!(result.is_err()); + + let err = result.unwrap_err(); + let msg = err.to_string(); + + assert_eq!(msg, "input.txt:3:0: error: invalid command 'x'"); + } + + // get_cmd_spec + #[test] + fn test_missing_command_character() { + let lines = ScriptLineProvider::with_active_state("test.sed", 1); + let line = char_provider_from(""); + let result = get_cmd_spec(&lines, &line, 0); + + assert!(result.is_err()); + let msg = result.unwrap_err().to_string(); + assert!(msg.contains("test.sed:1:0: error: command expected")); + } + + #[test] + fn test_invalid_command_character() { + let lines = ScriptLineProvider::with_active_state("script.sed", 2); + let line = char_provider_from("@"); + let result = get_cmd_spec(&lines, &line, 0); + + assert!(result.is_err()); + let msg = result.unwrap_err().to_string(); + assert!(msg.contains("script.sed:2:0: error: invalid command code @")); + } + + #[test] + fn test_too_many_addresses() { + let lines = ScriptLineProvider::with_active_state("input.sed", 3); + let line = char_provider_from("q"); // q takes one address + let result = get_cmd_spec(&lines, &line, 2); + + assert!(result.is_err()); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("input.sed:3:0: error: command q expects up to 1 address(es), found 2") + ); + } + + #[test] + fn test_valid_command_spec() { + let lines = ScriptLineProvider::with_active_state("input.sed", 4); + let line = char_provider_from("a"); // valid command + let result = get_cmd_spec(&lines, &line, 1); -pub fn compile(_scripts: Vec) -> UResult> { - // TODO - Ok(None) + assert!(result.is_ok()); + let spec = result.unwrap(); + assert_eq!(spec.code, 'a'); + } } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index e481c1de..4d5b3c15 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -1,15 +1,23 @@ // Process the files with the compiled scripts // -// This file is part of the uutils sed package. +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis // +// This file is part of the uutils sed package. +// It is licensed under the MIT License. // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use crate::command::CliOptions; use crate::command::Command; use std::path::PathBuf; use uucore::error::UResult; -pub fn process(_code: Option, _files: Vec) -> UResult<()> { +pub fn process( + _code: Option>, + _files: Vec, + _cli_options: &mut CliOptions, +) -> UResult<()> { // TODO Ok(()) } diff --git a/src/uu/sed/src/script_char_provider.rs b/src/uu/sed/src/script_char_provider.rs new file mode 100644 index 00000000..6f404693 --- /dev/null +++ b/src/uu/sed/src/script_char_provider.rs @@ -0,0 +1,101 @@ +// Provide the script contents character by character +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +pub struct ScriptCharProvider { + line: Vec, + pos: usize, +} + +impl ScriptCharProvider { + pub fn new(line_string: &str) -> Self { + Self { + line: line_string.chars().collect(), + pos: 0, + } + } + + /// Advances to the next character, if not at end of line. + pub fn advance(&mut self) { + if self.pos < self.line.len() { + self.pos += 1; + } + } + + /// Returns the current character. Panics if out of bounds. + pub fn current(&self) -> char { + self.line[self.pos] + } + + /// Returns true if at the end of the line. + pub fn eol(&self) -> bool { + self.pos >= self.line.len() + } + + /// Advances the position past any whitespace characters. + pub fn eat_spaces(&mut self) { + while self.pos < self.line.len() && self.line[self.pos].is_whitespace() { + self.pos += 1; + } + } + + /// Return current position + pub fn get_pos(&self) -> usize { + self.pos + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_navigation() { + let mut provider = ScriptCharProvider::new("abc"); + assert_eq!(provider.get_pos(), 0); + assert_eq!(provider.current(), 'a'); + provider.advance(); + assert_eq!(provider.get_pos(), 1); + assert_eq!(provider.current(), 'b'); + provider.advance(); + assert_eq!(provider.get_pos(), 2); + assert_eq!(provider.current(), 'c'); + provider.advance(); + assert_eq!(provider.get_pos(), 3); + assert!(provider.eol()); + } + + #[test] + #[should_panic] + fn test_current_panics_out_of_bounds() { + let mut provider = ScriptCharProvider::new("x"); + provider.advance(); // now at end + provider.current(); // should panic + } + + #[test] + fn test_eat_spaces() { + let mut provider = ScriptCharProvider::new(" xyz"); + provider.eat_spaces(); + assert_eq!(provider.current(), 'x'); + } + + #[test] + fn test_eol_on_empty() { + let provider = ScriptCharProvider::new(""); + assert!(provider.eol()); + } + + #[test] + fn test_eat_spaces_mixed() { + let mut provider = ScriptCharProvider::new(" \t\nabc"); + provider.eat_spaces(); + assert_eq!(provider.current(), 'a'); + } +} diff --git a/src/uu/sed/src/script_line_provider.rs b/src/uu/sed/src/script_line_provider.rs new file mode 100644 index 00000000..24841d5d --- /dev/null +++ b/src/uu/sed/src/script_line_provider.rs @@ -0,0 +1,263 @@ +//! Provide the script contents line by line +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use crate::command::ScriptValue; +use std::fs::File; +use std::io::{self, BufRead, BufReader}; + +/// The provider of script lines across all specified scripts +/// Scripts can be specified to sed as files or as strings. +pub struct ScriptLineProvider { + sources: Vec, + state: State, +} + +// Encapsulation of the script line provider's state +enum State { + NotStarted, // Processing has not yet started + Active { + index: usize, + reader: Box, // Object on which read_line is called + input_name: String, // Input description (path or script string) + line_number: usize, // Current line number + }, + Done, // All scripts have been processed +} + +impl ScriptLineProvider { + /// Construct the script provider from the specified script sources + pub fn new(sources: Vec) -> Self { + Self { + sources, + state: State::NotStarted, + } + } + + /// Return the currently processed script line number. + pub fn get_line_number(&self) -> usize { + match &self.state { + State::Active { line_number, .. } => *line_number, + _ => 0, + } + } + + /// Return the currently processed script descriptive name. + pub fn get_input_name(&self) -> &str { + match &self.state { + State::Active { input_name, .. } => input_name.as_str(), + _ => "", + } + } + + /// Return the next script line to process across all scripts. + pub fn next_line(&mut self) -> io::Result> { + let mut line = String::new(); + + loop { + let advance = match &mut self.state { + State::NotStarted => Some(0), + State::Active { + index, + reader, + line_number, + .. + } => { + line.clear(); + let bytes = reader.read_line(&mut line)?; + if bytes == 0 { + Some(*index + 1) // finished reading this source + } else { + *line_number += 1; + return Ok(Some(line)); + } + } + State::Done => { + return Ok(None); + } + }; + + if let Some(next_index) = advance { + self.advance_source(next_index)?; + } + } + } + + // Move to the next available script source. + fn advance_source(&mut self, next_index: usize) -> io::Result<()> { + if next_index >= self.sources.len() { + self.state = State::Done; + return Ok(()); + } + + fn truncate_with_ellipsis(input: &str) -> String { + const MAX_LEN: usize = 20; + if input.chars().count() <= MAX_LEN { + input.to_string() + } else { + input.chars().take(MAX_LEN).collect::() + "..." + } + } + + match &self.sources[next_index] { + ScriptValue::StringVal(s) => { + let cursor = std::io::Cursor::new(s.clone()); + self.state = State::Active { + index: next_index, + reader: Box::new(BufReader::new(cursor)), + input_name: truncate_with_ellipsis(s), + line_number: 0, + }; + } + ScriptValue::PathVal(p) => { + if p.to_string_lossy() == "-" { + self.state = State::Active { + index: next_index, + reader: Box::new(BufReader::new(io::stdin())), + input_name: "".to_string(), + line_number: 0, + }; + } else { + let file = File::open(p)?; + self.state = State::Active { + index: next_index, + reader: Box::new(BufReader::new(file)), + input_name: p.to_string_lossy().to_string(), + line_number: 0, + }; + } + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + #[test] + fn test_string_source() { + let input = vec![ + ScriptValue::StringVal("line one\nline two\n".to_string()), + ScriptValue::StringVal("line three".to_string()), + ]; + let mut provider = ScriptLineProvider::new(input); + + let mut lines = Vec::new(); + while let Some(line) = provider.next_line().unwrap() { + lines.push(line.trim_end().to_string()); + } + + assert_eq!(lines, vec!["line one", "line two", "line three"]); + } + + #[test] + fn test_file_source() { + let mut temp_file = NamedTempFile::new().unwrap(); + writeln!(temp_file, "file line 1").unwrap(); + writeln!(temp_file, "file line 2").unwrap(); + + let input = vec![ScriptValue::PathVal(temp_file.path().to_path_buf())]; + let mut provider = ScriptLineProvider::new(input); + + let mut lines = Vec::new(); + while let Some(line) = provider.next_line().unwrap() { + lines.push(line.trim_end().to_string()); + } + + assert_eq!(lines, vec!["file line 1", "file line 2"]); + } + + #[test] + fn test_mixed_source() { + let mut temp_file = NamedTempFile::new().unwrap(); + writeln!(temp_file, "file line 1").unwrap(); + writeln!(temp_file, "file line 2").unwrap(); + let temp_file2 = NamedTempFile::new().unwrap(); + + let input = vec![ + ScriptValue::PathVal(temp_file.path().to_path_buf()), + ScriptValue::StringVal("script line 1".to_string()), + ScriptValue::PathVal(temp_file.path().to_path_buf()), + ScriptValue::StringVal("".to_string()), + ScriptValue::PathVal(temp_file2.path().to_path_buf()), + ScriptValue::StringVal("other script line 1".to_string()), + ]; + let mut provider = ScriptLineProvider::new(input); + + let mut lines = Vec::new(); + while let Some(line) = provider.next_line().unwrap() { + lines.push(line.trim_end().to_string()); + } + + assert_eq!( + lines, + vec![ + "file line 1", + "file line 2", + "script line 1", + "file line 1", + "file line 2", + "other script line 1", + ] + ); + } + + #[test] + fn test_getters() { + let input = vec![ + ScriptValue::StringVal("l1\nl2\n".to_string()), + ScriptValue::StringVal("l3".to_string()), + ]; + let mut provider = ScriptLineProvider::new(input); + + if let Some(line) = provider.next_line().unwrap() { + assert_eq!(line.trim(), "l1"); + assert_eq!(provider.get_line_number(), 1); + assert_eq!(provider.get_input_name(), "l1\nl2\n"); + } else { + panic!("Expected a line"); + } + + if let Some(line) = provider.next_line().unwrap() { + assert_eq!(line.trim(), "l2"); + assert_eq!(provider.get_line_number(), 2); + assert_eq!(provider.get_input_name(), "l1\nl2\n"); + } else { + panic!("Expected a line"); + } + + if let Some(line) = provider.next_line().unwrap() { + assert_eq!(line.trim(), "l3"); + assert_eq!(provider.get_line_number(), 1); + assert_eq!(provider.get_input_name(), "l3"); + } else { + panic!("Expected a line"); + } + } +} + +#[cfg(test)] +impl ScriptLineProvider { + pub fn with_active_state(input_name: &str, line_number: usize) -> Self { + Self { + sources: vec![], + state: State::Active { + input_name: input_name.to_string(), + line_number, + index: 0, + reader: Box::new(BufReader::new(io::stdin())), + }, + } + } +} diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 48fa4590..2a3dae8b 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -1,13 +1,20 @@ -// This file is part of the uutils sed package. +// Program entry point and CLI processing +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis // +// This file is part of the uutils sed package. +// It is licensed under the MIT License. // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. pub mod command; pub mod compiler; pub mod processor; +pub mod script_char_provider; +pub mod script_line_provider; -use crate::command::ScriptValue; +use crate::command::{CliOptions, ScriptValue}; use crate::compiler::compile; use crate::processor::process; use clap::{arg, Arg, ArgMatches, Command}; @@ -18,11 +25,80 @@ use uucore::format_usage; const ABOUT: &str = "Stream editor for filtering and transforming text"; const USAGE: &str = "sed [OPTION]... [script] [file]..."; -/* - * Iterate through script and file arguments specified in matches and - * return vectors of all scripts and input files in the specified order. - * If no script is specified fail with "missing script" error. - */ +#[uucore::main] +pub fn uumain(args: impl uucore::Args) -> UResult<()> { + let matches = uu_app().try_get_matches_from(args)?; + let (scripts, files) = get_scripts_files(&matches)?; + let mut cli_options = build_context(&matches); + + let executable = compile(scripts, &mut cli_options)?; + process(executable, files, &mut cli_options)?; + Ok(()) +} + +#[allow(clippy::cognitive_complexity)] +pub fn uu_app() -> Command { + Command::new(uucore::util_name()) + .about(ABOUT) + .override_usage(format_usage(USAGE)) + .infer_long_args(true) + .args([ + arg!([script] "Script to execute if not otherwise provided."), + Arg::new("file") + .help("Input files") + .value_parser(clap::value_parser!(PathBuf)) + .num_args(0..), + Arg::new("all-output-files") + .long("all-output-files") + .short('a') + .help("Create or truncate all output files before processing.") + .action(clap::ArgAction::SetTrue), + arg!(--debug "Annotate program execution."), + Arg::new("regexp-extended") + .short('E') + .long("regexp-extended") + .short_alias('r') + .help("Use extended regular expressions.") + .action(clap::ArgAction::SetTrue), + arg!(-e --expression