From 3a700c1b0cecf194836411f527b7797a3295ef27 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 3 May 2025 21:34:03 +0300 Subject: [PATCH 01/85] Add fast I/O code (WIP) --- Cargo.lock | 11 ++ Cargo.toml | 3 + src/uu/sed/Cargo.toml | 1 + src/uu/sed/src/fast_io.rs | 336 ++++++++++++++++++++++++++++++++++++++ src/uu/sed/src/sed.rs | 1 + 5 files changed, 352 insertions(+) create mode 100644 src/uu/sed/src/fast_io.rs diff --git a/Cargo.lock b/Cargo.lock index 81d2d3eb..208cf4ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -368,6 +368,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + [[package]] name = "nix" version = "0.29.0" @@ -666,6 +675,7 @@ dependencies = [ "clap_mangen", "ctor", "libc", + "memmap2", "phf", "phf_codegen", "pretty_assertions", @@ -863,6 +873,7 @@ name = "uu_sed" version = "0.0.1" dependencies = [ "clap", + "memmap2", "once_cell", "regex", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 0d4b88aa..7989651a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ clap_mangen = "0.2" regex = "1.10.4" sysinfo = "0.35" libc = "0.2.153" +memmap2 = "0.5" once_cell = "1.21" phf = "0.11.2" phf_codegen = "0.11.2" @@ -49,6 +50,7 @@ chrono = { version = "0.4.37", default-features = false, features = [ "clock", ] } + [dependencies] clap = { workspace = true } clap_complete = { workspace = true } @@ -61,6 +63,7 @@ sysinfo = { workspace = true } sed = { optional = true, version = "0.0.1", package = "uu_sed", path = "src/uu/sed" } uutests = "0.0.30" ctor = "0.4.1" +memmap2.workspace = true [dev-dependencies] pretty_assertions = "1" diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 
4a095456..53869c11 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -18,6 +18,7 @@ clap = { workspace = true } once_cell = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } +memmap2.workspace = true [lib] path = "src/sed.rs" diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs new file mode 100644 index 00000000..fdfd0422 --- /dev/null +++ b/src/uu/sed/src/fast_io.rs @@ -0,0 +1,336 @@ +// Zero-copy line-based I/O +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +#[cfg(unix)] +use memmap2::Mmap; +use std::borrow::Cow; +use std::fs::File; +use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; +#[cfg(unix)] +use std::os::unix::io::AsRawFd; +use std::path::PathBuf; +#[cfg(unix)] +use uucore::libc::{c_void, write}; + +/// Cursor for zero-copy iteration over mmap’d file. 
+pub struct MmapLineCursor<'a> { + data: &'a [u8], + pos: usize, +} + +#[derive(Debug)] +#[cfg(unix)] +pub enum LineChunk<'a> { + #[cfg(unix)] + Mmap { + content: &'a [u8], // line without newline + full_span: &'a [u8], // line including original newline + }, + Owned(Vec), // line content without newline +} + +#[cfg(not(unix))] +pub enum LineChunk { + Owned(Vec), // line content without newline +} + +#[cfg(unix)] +type LineChunkRef<'a> = LineChunk<'a>; + +#[cfg(not(unix))] +type LineChunkRef = LineChunk; + +impl<'a> MmapLineCursor<'a> { + pub fn new(data: &'a [u8]) -> Self { + Self { data, pos: 0 } + } + pub fn get_line(&mut self) -> io::Result> { + if self.pos >= self.data.len() { + return Ok(None); + } + + let start = self.pos; + let mut end = start; + while end < self.data.len() && self.data[end] != b'\n' { + end += 1; + } + + if end < self.data.len() { + end += 1; // include \n in full span + } + + self.pos = end; + let full_span = &self.data[start..end]; + let content = if full_span.ends_with(b"\n") { + &full_span[..full_span.len() - 1] + } else { + full_span + }; + + Ok(Some((content, full_span))) + } +} + +/// Buffered line reader from any BufRead input. +pub struct ReadLineCursor { + lines: std::io::Lines, +} + +impl ReadLineCursor { + pub fn new(reader: R) -> Self { + Self { + lines: reader.lines(), + } + } + + pub fn get_line(&mut self) -> io::Result, usize)>> { + match self.lines.next() { + Some(Ok(line)) => Ok(Some((Cow::Owned(line.clone()), line.len()))), + Some(Err(e)) => Err(e), + None => Ok(None), + } + } +} + +/// Unified reader that uses mmap when possible, falls back to buffered reading. 
+pub enum LineReader { + #[cfg(unix)] + Mmap { + mmap: Mmap, + cursor: MmapLineCursor<'static>, + }, + Fallback(ReadLineCursor>>), +} + +fn fallback_reader(file: File) -> io::Result { + let boxed: Box = Box::new(file); + let reader = BufReader::new(boxed); + Ok(LineReader::Fallback(ReadLineCursor::new(reader))) +} + +impl LineReader { + pub fn open(path: &PathBuf) -> io::Result { + if path.as_os_str() == "-" { + let stdin = io::stdin(); + let boxed: Box = Box::new(stdin.lock()); + let reader = BufReader::new(boxed); + return Ok(LineReader::Fallback(ReadLineCursor::new(reader))); + } + + let file = File::open(path)?; + + #[cfg(unix)] + { + match unsafe { Mmap::map(&file) } { + Ok(mmap) => { + // SAFETY: mmap owns the data and lives in the same variant + let slice: &'static [u8] = + unsafe { std::slice::from_raw_parts(mmap.as_ptr(), mmap.len()) }; + let cursor = MmapLineCursor::new(slice); + Ok(LineReader::Mmap { mmap, cursor }) + } + Err(_) => fallback_reader(file), + } + } + + #[cfg(not(unix))] + { + fallback_reader(file) + } + } + + pub fn get_line(&mut self) -> io::Result> { + match self { + #[cfg(unix)] + LineReader::Mmap { cursor, .. } => { + if let Some((content, full_span)) = cursor.get_line()? { + Ok(Some(LineChunk::Mmap { content, full_span })) + } else { + Ok(None) + } + } + LineReader::Fallback(cursor) => { + if let Some((line, _)) = cursor.get_line()? 
{ + Ok(Some(LineChunk::Owned(line.into_owned().into_bytes()))) + } else { + Ok(None) + } + } + } + } +} + +pub struct OutputBuffer { + out: BufWriter, + #[cfg(unix)] + mmap_ptr: Option<(*const u8, usize)>, +} + +/// Type to use for writing +// Example: DynOutputBuffer::new(Box::new(io::stdout().lock()) +pub type DynOutputBuffer = OutputBuffer>; + +#[cfg(unix)] +fn write_syscall(fd: i32, ptr: *const u8, len: usize) -> io::Result<()> { + let ret = unsafe { write(fd, ptr as *const c_void, len) }; + if ret < 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } +} + +/// Threshold to use buffered writes for output +// This is half the size of the BufWriter buffer. +#[cfg(unix)] +const MIN_DIRECT_WRITE: usize = 4096; + +impl OutputBuffer { + pub fn new(w: W) -> Self { + Self { + out: BufWriter::new(w), + #[cfg(unix)] + mmap_ptr: None, + } + } + + pub fn write_chunk(&mut self, chunk: &LineChunk) -> io::Result<()> { + match chunk { + #[cfg(unix)] + LineChunk::Mmap { full_span, .. } => { + let ptr = full_span.as_ptr(); + let len = full_span.len(); + + if let Some((p, l)) = self.mmap_ptr { + // Coalesce if adjacent + if unsafe { p.add(l) } == ptr { + self.mmap_ptr = Some((p, l + len)); + return Ok(()); + } else { + self.flush_mmap()?; // not contiguous + } + } + self.mmap_ptr = Some((ptr, len)); + Ok(()) + } + + LineChunk::Owned(buf) => { + #[cfg(unix)] + { + self.flush_mmap()?; + } + self.out.write_all(buf)?; + self.out.write_all(b"\n")?; + Ok(()) + } + } + } + + // Flush any pending mmap data + #[cfg(unix)] + fn flush_mmap(&mut self) -> io::Result<()> { + if let Some((ptr, len)) = self.mmap_ptr.take() { + if len < MIN_DIRECT_WRITE { + // SAFELY treat as &[u8] and write to buffered writer + let slice = unsafe { std::slice::from_raw_parts(ptr, len) }; + return self.out.write_all(slice); + } else { + // Large enough: write directly using zero-copy + let fd = io::stdout().as_raw_fd(); + self.out.flush()?; // sync any buffered data + return write_syscall(fd, ptr, 
len); + } + } + Ok(()) + } + + /// Flush everything: pending mmap and buffered data. + pub fn flush(&mut self) -> io::Result<()> { + #[cfg(unix)] + { + self.flush_mmap()?; // flush mmap if any + } + self.out.flush() // then flush buffered data + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + impl OutputBuffer>> { + /// Grab the raw bytes written so far. + pub fn test_contents(&self) -> &[u8] { + // 1) get_ref() on BufWriter> → &Cursor<...> + // 2) get_ref() on Cursor> → &Vec + // 3) as_slice() on Vec → &[u8] + self.out.get_ref().get_ref().as_slice() + } + } + + fn make_owned_line(s: &str) -> LineChunk { + LineChunk::Owned(s.as_bytes().to_vec()) + } + + #[test] + fn test_owned_line_output() { + let sink = Cursor::new(Vec::new()); + let mut out = OutputBuffer::new(sink); + + out.write_chunk(&make_owned_line("foo")).unwrap(); + out.write_chunk(&make_owned_line("bar")).unwrap(); + out.flush().unwrap(); + + assert_eq!(out.test_contents(), b"foo\nbar\n"); + } + + #[cfg(unix)] + fn make_mmap_line<'a>(buf: &'a [u8]) -> LineChunk<'a> { + LineChunk::Mmap { + content: &buf[..buf.len() - 1], // exclude \n + full_span: buf, // include \n + } + } + + #[cfg(unix)] + #[test] + fn test_mmap_line_output_single() { + let mmap_data = b"line one\nline two\n"; + let sink = Cursor::new(Vec::new()); + let mut out = OutputBuffer::new(sink); + + // first nine bytes are "line one\n" + out.write_chunk(&make_mmap_line(&mmap_data[..9])).unwrap(); + // the rest are "line two\n" + out.write_chunk(&make_mmap_line(&mmap_data[9..])).unwrap(); + out.flush().unwrap(); + + assert_eq!(out.test_contents(), b"line one\nline two\n"); + } + + #[cfg(unix)] + #[test] + fn test_mixed_output_order_preserved() { + let mmap_data = b"zero\none\n"; + let sink = Cursor::new(Vec::new()); + let mut out = OutputBuffer::new(sink); + + // "zero\n" + out.write_chunk(&make_mmap_line(&mmap_data[..5])).unwrap(); + // now an owned line + out.write_chunk(&make_owned_line("middle")).unwrap(); + // 
then "one\n" + out.write_chunk(&make_mmap_line(&mmap_data[5..])).unwrap(); + out.flush().unwrap(); + + assert_eq!(out.test_contents(), b"zero\nmiddle\none\n"); + } +} diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 1f736296..b4b061fe 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -11,6 +11,7 @@ pub mod command; pub mod compiler; pub mod delimited_parser; +pub mod fast_io; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; From 0388dc4aac24cfc71f1075867284235057dc8758 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 11:40:35 +0300 Subject: [PATCH 02/85] Improve naming and documentation --- src/uu/sed/src/fast_io.rs | 141 +++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 46 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index fdfd0422..3828913e 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -1,5 +1,11 @@ // Zero-copy line-based I/O // +// Abstractions that allow file lines to be processed and output +// in mmapped memory space. By coallescing output requests an +// efficient write(2) system call can be issued for them, bypassing +// the copy required for output through BufWriter. +// Search for "main" to see a usage example. +// // SPDX-License-Identifier: MIT // Copyright (c) 2025 Diomidis Spinellis // @@ -19,38 +25,22 @@ use std::path::PathBuf; #[cfg(unix)] use uucore::libc::{c_void, write}; +// Define two cursors for iterating over lines: +// - MmapLineCursor based on mmap(2), +// - ReadLineCursorbased on BufReader. + /// Cursor for zero-copy iteration over mmap’d file. 
pub struct MmapLineCursor<'a> { data: &'a [u8], pos: usize, } -#[derive(Debug)] -#[cfg(unix)] -pub enum LineChunk<'a> { - #[cfg(unix)] - Mmap { - content: &'a [u8], // line without newline - full_span: &'a [u8], // line including original newline - }, - Owned(Vec), // line content without newline -} - -#[cfg(not(unix))] -pub enum LineChunk { - Owned(Vec), // line content without newline -} - -#[cfg(unix)] -type LineChunkRef<'a> = LineChunk<'a>; - -#[cfg(not(unix))] -type LineChunkRef = LineChunk; - impl<'a> MmapLineCursor<'a> { pub fn new(data: &'a [u8]) -> Self { Self { data, pos: 0 } } + + /// Return the next line, if available, or None. pub fn get_line(&mut self) -> io::Result> { if self.pos >= self.data.len() { return Ok(None); @@ -90,6 +80,7 @@ impl ReadLineCursor { } } + /// Return the next line, if available, or None. pub fn get_line(&mut self) -> io::Result, usize)>> { match self.lines.next() { Some(Ok(line)) => Ok(Some((Cow::Owned(line.clone()), line.len()))), @@ -99,20 +90,47 @@ impl ReadLineCursor { } } +/// Data to be written to a file. It can come from the mmapped +/// memory space, in which case it is tracked to allow coallescing +/// and bypassing BufWriter, or it can be other data from the process's +/// memory space. +#[derive(Debug)] +#[cfg(unix)] +pub enum OutputChunk<'a> { + MmapInput { + content: &'a [u8], // Line without newline + full_span: &'a [u8], // Line including original newline + }, + Owned(Vec), // Line content without newline +} + +#[cfg(unix)] +type OutputChunkRef<'a> = OutputChunk<'a>; + +// The same as above for non-Unix platforms, which lack mmap(2) +#[cfg(not(unix))] +pub enum OutputChunk { + Owned(Vec), // Line content without newline +} + +#[cfg(not(unix))] +type OutputChunkRef = OutputChunk; + /// Unified reader that uses mmap when possible, falls back to buffered reading. 
pub enum LineReader { #[cfg(unix)] - Mmap { - mmap: Mmap, + MmapInput { + mapped_file: Mmap, // A handle that can derive the mapped file slice cursor: MmapLineCursor<'static>, }, - Fallback(ReadLineCursor>>), + ReadInput(ReadLineCursor>>), } -fn fallback_reader(file: File) -> io::Result { +/// Return a LineReader that uses the ReadInput method fot the specified file. +fn line_reader_read_input(file: File) -> io::Result { let boxed: Box = Box::new(file); let reader = BufReader::new(boxed); - Ok(LineReader::Fallback(ReadLineCursor::new(reader))) + Ok(LineReader::ReadInput(ReadLineCursor::new(reader))) } impl LineReader { @@ -121,7 +139,7 @@ impl LineReader { let stdin = io::stdin(); let boxed: Box = Box::new(stdin.lock()); let reader = BufReader::new(boxed); - return Ok(LineReader::Fallback(ReadLineCursor::new(reader))); + return Ok(LineReader::ReadInput(ReadLineCursor::new(reader))); } let file = File::open(path)?; @@ -129,36 +147,42 @@ impl LineReader { #[cfg(unix)] { match unsafe { Mmap::map(&file) } { - Ok(mmap) => { + Ok(mapped_file) => { // SAFETY: mmap owns the data and lives in the same variant - let slice: &'static [u8] = - unsafe { std::slice::from_raw_parts(mmap.as_ptr(), mmap.len()) }; + let slice: &'static [u8] = unsafe { + std::slice::from_raw_parts(mapped_file.as_ptr(), mapped_file.len()) + }; let cursor = MmapLineCursor::new(slice); - Ok(LineReader::Mmap { mmap, cursor }) + Ok(LineReader::MmapInput { + mapped_file, + cursor, + }) } - Err(_) => fallback_reader(file), + // Fallback to ReadInput + Err(_) => line_reader_read_input(file), } } #[cfg(not(unix))] { - fallback_reader(file) + line_reader_read_input(file) } } - pub fn get_line(&mut self) -> io::Result> { + /// Return the next line, if available, or None. + pub fn get_line(&mut self) -> io::Result> { match self { #[cfg(unix)] - LineReader::Mmap { cursor, .. } => { + LineReader::MmapInput { cursor, .. } => { if let Some((content, full_span)) = cursor.get_line()? 
{ - Ok(Some(LineChunk::Mmap { content, full_span })) + Ok(Some(OutputChunk::MmapInput { content, full_span })) } else { Ok(None) } } - LineReader::Fallback(cursor) => { + LineReader::ReadInput(cursor) => { if let Some((line, _)) = cursor.get_line()? { - Ok(Some(LineChunk::Owned(line.into_owned().into_bytes()))) + Ok(Some(OutputChunk::Owned(line.into_owned().into_bytes()))) } else { Ok(None) } @@ -167,6 +191,12 @@ impl LineReader { } } +/// Abstraction for outputting data, potentially from the mmapped file +/// Outputs from mmapped data are coallesced and written via a write(2) +/// system call without any copying if worthwhile. +/// All other output is buffered and writen via BufWriter. +/// The generic argument W is used for obtaining the output when +/// testing. pub struct OutputBuffer { out: BufWriter, #[cfg(unix)] @@ -201,10 +231,11 @@ impl OutputBuffer { } } - pub fn write_chunk(&mut self, chunk: &LineChunk) -> io::Result<()> { + /// Schedule the specified output chunk for eventual output + pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { match chunk { #[cfg(unix)] - LineChunk::Mmap { full_span, .. } => { + OutputChunk::MmapInput { full_span, .. } => { let ptr = full_span.as_ptr(); let len = full_span.len(); @@ -221,7 +252,7 @@ impl OutputBuffer { Ok(()) } - LineChunk::Owned(buf) => { + OutputChunk::Owned(buf) => { #[cfg(unix)] { self.flush_mmap()?; @@ -261,6 +292,24 @@ impl OutputBuffer { } } +// Usage example (never compiled) +#[cfg(any())] +pub fn main() -> io::Result<()> { + let path = std::env::args() + .nth(1) + .map(PathBuf::from) + .unwrap_or_else(|| "-".into()); + let mut reader = LineReader::open(&path)?; + let stdout = Box::new(io::stdout().lock()); + let mut output = OutputBuffer::new(stdout); + + while let Some(chunk) = reader.get_line()? 
{ + output.write_chunk(&chunk)?; + } + + output.flush() +} + #[cfg(test)] mod tests { use super::*; @@ -276,8 +325,8 @@ mod tests { } } - fn make_owned_line(s: &str) -> LineChunk { - LineChunk::Owned(s.as_bytes().to_vec()) + fn make_owned_line(s: &str) -> OutputChunk { + OutputChunk::Owned(s.as_bytes().to_vec()) } #[test] @@ -293,8 +342,8 @@ mod tests { } #[cfg(unix)] - fn make_mmap_line<'a>(buf: &'a [u8]) -> LineChunk<'a> { - LineChunk::Mmap { + fn make_mmap_line<'a>(buf: &'a [u8]) -> OutputChunk<'a> { + OutputChunk::MmapInput { content: &buf[..buf.len() - 1], // exclude \n full_span: buf, // include \n } From 1eec45125d82f3417473b668e50c749f2b5fc9e0 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 11:48:54 +0300 Subject: [PATCH 03/85] Implement write_str convenience method --- src/uu/sed/src/fast_io.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 3828913e..8a740a34 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -264,6 +264,11 @@ impl OutputBuffer { } } + /// Schedule the specified string for eventual output + pub fn write_str(&mut self, s: &str) -> io::Result<()> { + self.write_chunk(&OutputChunk::Owned(s.as_bytes().to_vec())) + } + // Flush any pending mmap data #[cfg(unix)] fn flush_mmap(&mut self) -> io::Result<()> { @@ -325,17 +330,13 @@ mod tests { } } - fn make_owned_line(s: &str) -> OutputChunk { - OutputChunk::Owned(s.as_bytes().to_vec()) - } - #[test] fn test_owned_line_output() { let sink = Cursor::new(Vec::new()); let mut out = OutputBuffer::new(sink); - out.write_chunk(&make_owned_line("foo")).unwrap(); - out.write_chunk(&make_owned_line("bar")).unwrap(); + out.write_str("foo").unwrap(); + out.write_str("bar").unwrap(); out.flush().unwrap(); assert_eq!(out.test_contents(), b"foo\nbar\n"); @@ -375,7 +376,7 @@ mod tests { // "zero\n" out.write_chunk(&make_mmap_line(&mmap_data[..5])).unwrap(); // now an owned 
line - out.write_chunk(&make_owned_line("middle")).unwrap(); + out.write_str("middle").unwrap(); // then "one\n" out.write_chunk(&make_mmap_line(&mmap_data[5..])).unwrap(); out.flush().unwrap(); From f941e263488dcea834f71452563c85bfdd312b0e Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 13:36:39 +0300 Subject: [PATCH 04/85] Improve testing and fix output bug Add test cases for output and zero-copy verification. While at it fix a bug that used a hardcoded stdout for the output. This required introducing RawFd, a stronger separation between the Unix and the fallback implementation, and the use of temporary output files for testing. --- src/uu/sed/src/fast_io.rs | 277 ++++++++++++++++++++++++++++++-------- 1 file changed, 218 insertions(+), 59 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 8a740a34..d648aeb8 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -94,7 +94,7 @@ impl ReadLineCursor { /// memory space, in which case it is tracked to allow coallescing /// and bypassing BufWriter, or it can be other data from the process's /// memory space. -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] #[cfg(unix)] pub enum OutputChunk<'a> { MmapInput { @@ -201,12 +201,15 @@ pub struct OutputBuffer { out: BufWriter, #[cfg(unix)] mmap_ptr: Option<(*const u8, usize)>, + #[cfg(test)] + writes_issued: usize, // Number of issued write(2) calls } /// Type to use for writing // Example: DynOutputBuffer::new(Box::new(io::stdout().lock()) pub type DynOutputBuffer = OutputBuffer>; +/// Wrapper that issues the write(2) system call #[cfg(unix)] fn write_syscall(fd: i32, ptr: *const u8, len: usize) -> io::Result<()> { let ret = unsafe { write(fd, ptr as *const c_void, len) }; @@ -218,7 +221,11 @@ fn write_syscall(fd: i32, ptr: *const u8, len: usize) -> io::Result<()> { } /// Threshold to use buffered writes for output -// This is half the size of the BufWriter buffer. 
+// These 4k are half the 8k size of the BufWriter buffer. +// The constant guarantees that, at worst, mmapped output will +// result in a doubling of the issued write(2) system calls. +// Taking into account the non-copied data, this should result +// in overall fewer CPU instructions. #[cfg(unix)] const MIN_DIRECT_WRITE: usize = 4096; @@ -228,13 +235,16 @@ impl OutputBuffer { out: BufWriter::new(w), #[cfg(unix)] mmap_ptr: None, + #[cfg(test)] + writes_issued: 0, } } +} +impl OutputBuffer { /// Schedule the specified output chunk for eventual output pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { match chunk { - #[cfg(unix)] OutputChunk::MmapInput { full_span, .. } => { let ptr = full_span.as_ptr(); let len = full_span.len(); @@ -253,10 +263,7 @@ impl OutputBuffer { } OutputChunk::Owned(buf) => { - #[cfg(unix)] - { - self.flush_mmap()?; - } + self.flush_mmap()?; self.out.write_all(buf)?; self.out.write_all(b"\n")?; Ok(()) @@ -266,6 +273,7 @@ impl OutputBuffer { /// Schedule the specified string for eventual output pub fn write_str(&mut self, s: &str) -> io::Result<()> { + // Use the write_chunk corresponding to cfg self.write_chunk(&OutputChunk::Owned(s.as_bytes().to_vec())) } @@ -279,8 +287,12 @@ impl OutputBuffer { return self.out.write_all(slice); } else { // Large enough: write directly using zero-copy - let fd = io::stdout().as_raw_fd(); + let fd = self.out.get_ref().as_raw_fd(); self.out.flush()?; // sync any buffered data + #[cfg(test)] + { + self.writes_issued += 1; + } return write_syscall(fd, ptr, len); } } @@ -289,10 +301,32 @@ impl OutputBuffer { /// Flush everything: pending mmap and buffered data. 
pub fn flush(&mut self) -> io::Result<()> { - #[cfg(unix)] - { - self.flush_mmap()?; // flush mmap if any + self.flush_mmap()?; // flush mmap if any + self.out.flush() // then flush buffered data + } +} + +#[cfg(not(unix))] +impl OutputBuffer { + /// Schedule the specified output chunk for eventual output + pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { + match chunk { + OutputChunk::Owned(buf) => { + self.out.write_all(buf)?; + self.out.write_all(b"\n")?; + Ok(()) + } } + } + + /// Schedule the specified string for eventual output + pub fn write_str(&mut self, s: &str) -> io::Result<()> { + // Use the write_chunk corresponding to cfg + self.write_chunk(&OutputChunk::Owned(s.as_bytes().to_vec())) + } + + /// Flush everything: pending mmap and buffered data. + pub fn flush(&mut self) -> io::Result<()> { self.out.flush() // then flush buffered data } } @@ -318,69 +352,194 @@ pub fn main() -> io::Result<()> { #[cfg(test)] mod tests { use super::*; - use std::io::Cursor; - - impl OutputBuffer>> { - /// Grab the raw bytes written so far. - pub fn test_contents(&self) -> &[u8] { - // 1) get_ref() on BufWriter> → &Cursor<...> - // 2) get_ref() on Cursor> → &Vec - // 3) as_slice() on Vec → &[u8] - self.out.get_ref().get_ref().as_slice() - } + use std::fs; + use std::fs::File; + use std::io::{self, Write}; + use tempfile::NamedTempFile; + + /// Helper: produce a 4 096-byte Vec of `'.'`s ending in `'\n'`. 
+ fn make_dot_line_4k() -> Vec { + let mut buf = Vec::with_capacity(4096); + buf.extend(std::iter::repeat(b'.').take(4095)); + buf.push(b'\n'); + buf } #[test] - fn test_owned_line_output() { - let sink = Cursor::new(Vec::new()); - let mut out = OutputBuffer::new(sink); - - out.write_str("foo").unwrap(); - out.write_str("bar").unwrap(); - out.flush().unwrap(); - - assert_eq!(out.test_contents(), b"foo\nbar\n"); + fn test_owned_line_output() -> io::Result<()> { + let tmp = NamedTempFile::new()?; + { + let file = tmp.reopen()?; + let mut out = OutputBuffer::new(file); + out.write_str("foo")?; + out.write_str("bar")?; + out.flush()?; + assert_eq!(out.writes_issued, 0); + } // File closes here as it leaves the scope + + let contents = fs::read(tmp.path())?; + assert_eq!(contents.as_slice(), b"foo\nbar\n"); + Ok(()) } + #[test] #[cfg(unix)] - fn make_mmap_line<'a>(buf: &'a [u8]) -> OutputChunk<'a> { - OutputChunk::MmapInput { - content: &buf[..buf.len() - 1], // exclude \n - full_span: buf, // include \n + fn test_mmap_line_output_single() -> io::Result<()> { + use std::fs; + use std::io::Write; + use tempfile::NamedTempFile; + + // Prepare the input buffer: two lines in one contiguous mmap region + let mmap_data = b"line one\nline two\n"; + + // Write that into a temp file + let mut input = NamedTempFile::new()?; + input.write_all(mmap_data)?; + input.flush()?; + let input_path = input.path().to_path_buf(); + + // Open the reader on that file + let mut reader = LineReader::open(&input_path)?; + + // Prepare an output temp file and wrap it in our OutputBuffer + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let out_file = std::fs::File::create(&output_path)?; + let mut out = OutputBuffer::new(out_file); + + // Drain reader → writer + while let Some(chunk) = reader.get_line()? 
{ + out.write_chunk(&chunk)?; } + out.flush()?; + + assert_eq!(out.writes_issued, 0); + + let written = fs::read(&output_path)?; + assert_eq!(written.as_slice(), mmap_data); + + Ok(()) } + #[test] #[cfg(unix)] + fn test_mixed_output_order_preserved() -> io::Result<()> { + use std::fs; + use std::fs::File; + use std::io::Write; + use tempfile::NamedTempFile; + + // Prepare an input file containing two lines: "zero\none\n" + let data = b"zero\none\n"; + let mut input = NamedTempFile::new()?; + input.write_all(data)?; + input.flush()?; + let input_path = input.path().to_path_buf(); + let mut reader = LineReader::open(&input_path)?; + + // Prepare an empty output file + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let out_file = File::create(&output_path)?; + let mut out = OutputBuffer::new(out_file); + + // Read the first mmap line ("zero\n") and write it + if let Some(chunk) = reader.get_line()? { + out.write_chunk(&chunk)?; + } + + // Write an owned line ("middle\n") + out.write_str("middle")?; + + // Read the second mmap line ("one\n") and write it + if let Some(chunk) = reader.get_line()? 
{ + out.write_chunk(&chunk)?; + } + + out.flush()?; + + // Since all writes are small (<4K), we expect zero zero copy syscalls + assert_eq!(out.writes_issued, 0); + + // Read both files back and compare + let expected = { + let mut v = Vec::new(); + v.extend_from_slice(b"zero\n"); + v.extend_from_slice(b"middle\n"); + v.extend_from_slice(b"one\n"); + v + }; + let actual = fs::read(&output_path)?; + assert_eq!(actual, expected); + + Ok(()) + } #[test] - fn test_mmap_line_output_single() { - let mmap_data = b"line one\nline two\n"; - let sink = Cursor::new(Vec::new()); - let mut out = OutputBuffer::new(sink); + #[cfg(unix)] + fn test_large_file_zero_copy() -> io::Result<()> { + // Create and fill the input temp file: + let mut input = NamedTempFile::new()?; + write!(input, "first line\nsecond line\n")?; + let dot_line = make_dot_line_4k(); + input.write_all(&dot_line)?; + input.flush()?; + let input_path = input.path().to_path_buf(); + + // Open reader on input file: + let mut reader = LineReader::open(&input_path)?; + + // Create the output temp file (empty): + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let output_file = File::create(&output_path)?; + + // Wrap it in your OutputBuffer and run the loop: + let mut out = OutputBuffer::new(output_file); + let mut nline = 0; + while let Some(chunk) = reader.get_line()? 
{ + out.write_chunk(&chunk)?; + nline += 1; + } + assert_eq!(nline, 3); - // first nine bytes are "line one\n" - out.write_chunk(&make_mmap_line(&mmap_data[..9])).unwrap(); - // the rest are "line two\n" - out.write_chunk(&make_mmap_line(&mmap_data[9..])).unwrap(); - out.flush().unwrap(); + out.flush()?; + assert_eq!(out.writes_issued, 1); - assert_eq!(out.test_contents(), b"line one\nline two\n"); + // Verify that files match: + let expected = fs::read(&input_path)?; + let actual = fs::read(&output_path)?; + assert_eq!(actual, expected); + Ok(()) } - #[cfg(unix)] #[test] - fn test_mixed_output_order_preserved() { - let mmap_data = b"zero\none\n"; - let sink = Cursor::new(Vec::new()); - let mut out = OutputBuffer::new(sink); - - // "zero\n" - out.write_chunk(&make_mmap_line(&mmap_data[..5])).unwrap(); - // now an owned line - out.write_str("middle").unwrap(); - // then "one\n" - out.write_chunk(&make_mmap_line(&mmap_data[5..])).unwrap(); - out.flush().unwrap(); - - assert_eq!(out.test_contents(), b"zero\nmiddle\none\n"); + #[cfg(unix)] + fn test_read_from_tempfile() -> std::io::Result<()> { + // Create temporary file with known contents + let mut tmp = NamedTempFile::new()?; + write!(tmp, "first line\nsecond line\n")?; + tmp.flush()?; + + let path = tmp.path().to_path_buf(); + let mut reader = LineReader::open(&path)?; + + // Verify the reader's operation + assert_eq!( + reader.get_line()?, + Some(OutputChunk::MmapInput { + content: b"first line".as_ref(), + full_span: b"first line\n".as_ref(), + }) + ); + assert_eq!( + reader.get_line()?, + Some(OutputChunk::MmapInput { + content: b"second line".as_ref(), + full_span: b"second line\n".as_ref(), + }) + ); + assert_eq!(reader.get_line()?, None); + + Ok(()) } } From f266e881da39dba0a777a4807aa1988badcbdb37 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 13:53:31 +0300 Subject: [PATCH 05/85] Ensure regular output flushing --- src/uu/sed/src/fast_io.rs | 52 
+++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index d648aeb8..4990bc31 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -229,6 +229,13 @@ fn write_syscall(fd: i32, ptr: *const u8, len: usize) -> io::Result<()> { #[cfg(unix)] const MIN_DIRECT_WRITE: usize = 4096; +/// The maximum size of a pending write buffer +// Once more than 64k accumulate, issue a write to allow the OS +// and downstream pipes to handle the output processing in parallel +// with our processing. +#[cfg(unix)] +const MAX_PENDING_WRITE: usize = 64 * 1024; + impl OutputBuffer { pub fn new(w: W) -> Self { Self { @@ -251,7 +258,7 @@ impl OutputBuffer { if let Some((p, l)) = self.mmap_ptr { // Coalesce if adjacent - if unsafe { p.add(l) } == ptr { + if unsafe { p.add(l) } == ptr && l < MAX_PENDING_WRITE { self.mmap_ptr = Some((p, l + len)); return Ok(()); } else { @@ -514,7 +521,48 @@ mod tests { #[test] #[cfg(unix)] - fn test_read_from_tempfile() -> std::io::Result<()> { + fn test_large_file_zero_copy_with_flush() -> io::Result<()> { + // Create and fill the input temp file: + let mut input = NamedTempFile::new()?; + write!(input, "first line\nsecond line\n")?; + let dot_line = make_dot_line_4k(); + // Write 64k + 16k to ensure one flush when writing + for _i in 0..20 { + input.write_all(&dot_line)?; + } + input.flush()?; + let input_path = input.path().to_path_buf(); + + // Open reader on input file: + let mut reader = LineReader::open(&input_path)?; + + // Create the output temp file (empty): + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let output_file = File::create(&output_path)?; + + // Wrap it in your OutputBuffer and run the loop: + let mut out = OutputBuffer::new(output_file); + let mut nline = 0; + while let Some(chunk) = reader.get_line()? 
{ + out.write_chunk(&chunk)?; + nline += 1; + } + assert_eq!(nline, 22); + + out.flush()?; + assert_eq!(out.writes_issued, 2); + + // Verify that files match: + let expected = fs::read(&input_path)?; + let actual = fs::read(&output_path)?; + assert_eq!(actual, expected); + Ok(()) + } + + #[test] + #[cfg(unix)] + fn test_mmap_read() -> std::io::Result<()> { // Create temporary file with known contents let mut tmp = NamedTempFile::new()?; write!(tmp, "first line\nsecond line\n")?; From 7b874a4f85889f494028e52504e989a490a1d0fb Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 19:41:01 +0300 Subject: [PATCH 06/85] Fix non-Unix compilation --- src/uu/sed/src/fast_io.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 4990bc31..3798279f 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -248,6 +248,7 @@ impl OutputBuffer { } } +#[cfg(unix)] impl OutputBuffer { /// Schedule the specified output chunk for eventual output pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { @@ -360,11 +361,14 @@ pub fn main() -> io::Result<()> { mod tests { use super::*; use std::fs; + #[cfg(unix)] use std::fs::File; + #[cfg(unix)] use std::io::{self, Write}; use tempfile::NamedTempFile; /// Helper: produce a 4 096-byte Vec of `'.'`s ending in `'\n'`. 
+ #[cfg(unix)] fn make_dot_line_4k() -> Vec { let mut buf = Vec::with_capacity(4096); buf.extend(std::iter::repeat(b'.').take(4095)); From e4f73367eab2287e2e172022b39fbec8c68f66c6 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 20:14:05 +0300 Subject: [PATCH 07/85] Add tests for non-newline terminated files --- src/uu/sed/src/fast_io.rs | 75 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 3798279f..698afe0a 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -485,6 +485,7 @@ mod tests { Ok(()) } + #[test] #[cfg(unix)] fn test_large_file_zero_copy() -> io::Result<()> { @@ -523,6 +524,80 @@ mod tests { Ok(()) } + #[test] + #[cfg(unix)] + fn test_large_file_zero_copy_unterminated() -> io::Result<()> { + // Create and fill the input temp file: + let mut input = NamedTempFile::new()?; + write!(input, "first line\nsecond line\n")?; + let dot_line = make_dot_line_4k(); + input.write_all(&dot_line)?; + write!(input, "last line (unterminated)")?; + input.flush()?; + let input_path = input.path().to_path_buf(); + + // Open reader on input file: + let mut reader = LineReader::open(&input_path)?; + + // Create the output temp file (empty): + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let output_file = File::create(&output_path)?; + + // Wrap it in your OutputBuffer and run the loop: + let mut out = OutputBuffer::new(output_file); + let mut nline = 0; + while let Some(chunk) = reader.get_line()? 
{ + out.write_chunk(&chunk)?; + nline += 1; + } + assert_eq!(nline, 4); + + out.flush()?; + assert_eq!(out.writes_issued, 1); + + // Verify that files match: + let expected = fs::read(&input_path)?; + let actual = fs::read(&output_path)?; + assert_eq!(actual, expected); + Ok(()) + } + + #[test] + fn test_small_file_unterminated() -> io::Result<()> { + // Create and fill the input temp file: + let mut input = NamedTempFile::new()?; + write!(input, "first line\nsecond line\nlast line (unterminated)")?; + input.flush()?; + let input_path = input.path().to_path_buf(); + + // Open reader on input file: + let mut reader = LineReader::open(&input_path)?; + + // Create the output temp file (empty): + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let output_file = File::create(&output_path)?; + + // Wrap it in your OutputBuffer and run the loop: + let mut out = OutputBuffer::new(output_file); + let mut nline = 0; + while let Some(chunk) = reader.get_line()? { + out.write_chunk(&chunk)?; + nline += 1; + } + assert_eq!(nline, 3); + + out.flush()?; + assert_eq!(out.writes_issued, 0); + + // Verify that files match: + let expected = fs::read(&input_path)?; + let actual = fs::read(&output_path)?; + assert_eq!(actual, expected); + Ok(()) + } + #[test] #[cfg(unix)] fn test_large_file_zero_copy_with_flush() -> io::Result<()> { From 814e370c7522581fa096b77bf2470723d3b944e6 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 21:32:25 +0300 Subject: [PATCH 08/85] Handle missing newlines --- src/uu/sed/src/fast_io.rs | 132 ++++++++++++++++++++++++++++++-------- 1 file changed, 105 insertions(+), 27 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 698afe0a..9903fae3 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -70,23 +70,32 @@ impl<'a> MmapLineCursor<'a> { /// Buffered line reader from any BufRead input. 
pub struct ReadLineCursor { - lines: std::io::Lines, + reader: R, + buffer: String, } impl ReadLineCursor { pub fn new(reader: R) -> Self { Self { - lines: reader.lines(), + reader, + buffer: String::new(), } } - - /// Return the next line, if available, or None. - pub fn get_line(&mut self) -> io::Result, usize)>> { - match self.lines.next() { - Some(Ok(line)) => Ok(Some((Cow::Owned(line.clone()), line.len()))), - Some(Err(e)) => Err(e), - None => Ok(None), + /// Return the next line and its \n termination, if available, or None. + pub fn get_line(&mut self) -> io::Result, bool)>> { + self.buffer.clear(); + // read_line *includes* the '\n' if present + let bytes_read = self.reader.read_line(&mut self.buffer)?; + if bytes_read == 0 { + return Ok(None); } + // O(1) check whether it ended in '\n' + let has_newline = self.buffer.ends_with('\n'); + // strip it if you don’t want to expose it to the caller + if has_newline { + self.buffer.pop(); + } + Ok(Some((Cow::Owned(self.buffer.clone()), has_newline))) } } @@ -99,9 +108,12 @@ impl ReadLineCursor { pub enum OutputChunk<'a> { MmapInput { content: &'a [u8], // Line without newline - full_span: &'a [u8], // Line including original newline + full_span: &'a [u8], // Line including original newline, if any + }, + Owned { + content: Vec, // Line content without newline + has_newline: bool, // True if \n-terminated }, - Owned(Vec), // Line content without newline } #[cfg(unix)] @@ -110,7 +122,10 @@ type OutputChunkRef<'a> = OutputChunk<'a>; // The same as above for non-Unix platforms, which lack mmap(2) #[cfg(not(unix))] pub enum OutputChunk { - Owned(Vec), // Line content without newline + Owned { + content: Vec, // Line content without newline + has_newline: bool, // True if \n-terminated + }, } #[cfg(not(unix))] @@ -134,6 +149,8 @@ fn line_reader_read_input(file: File) -> io::Result { } impl LineReader { + /// Open the specified file for line input. + // Use "-" to read from the standard input. 
pub fn open(path: &PathBuf) -> io::Result { if path.as_os_str() == "-" { let stdin = io::stdin(); @@ -169,6 +186,13 @@ impl LineReader { } } + /// Open the specified file to read as a stream. + #[cfg(test)] + pub fn open_stream(path: &PathBuf) -> io::Result { + let file = File::open(path)?; + line_reader_read_input(file) + } + /// Return the next line, if available, or None. pub fn get_line(&mut self) -> io::Result> { match self { @@ -181,8 +205,11 @@ impl LineReader { } } LineReader::ReadInput(cursor) => { - if let Some((line, _)) = cursor.get_line()? { - Ok(Some(OutputChunk::Owned(line.into_owned().into_bytes()))) + if let Some((line, has_newline)) = cursor.get_line()? { + Ok(Some(OutputChunk::Owned { + content: line.into_owned().into_bytes(), + has_newline, + })) } else { Ok(None) } @@ -227,7 +254,7 @@ fn write_syscall(fd: i32, ptr: *const u8, len: usize) -> io::Result<()> { // Taking into account the non-copied data, this should result // in overall fewer CPU instructions. #[cfg(unix)] -const MIN_DIRECT_WRITE: usize = 4096; +const MIN_DIRECT_WRITE: usize = 4 * 1024; /// The maximum size of a pending write buffer // Once more than 64k accumulate, issue a write to allow the OS @@ -270,10 +297,15 @@ impl OutputBuffer { Ok(()) } - OutputChunk::Owned(buf) => { + OutputChunk::Owned { + content, + has_newline, + } => { self.flush_mmap()?; - self.out.write_all(buf)?; - self.out.write_all(b"\n")?; + self.out.write_all(content)?; + if *has_newline { + self.out.write_all(b"\n")?; + } Ok(()) } } @@ -282,7 +314,10 @@ impl OutputBuffer { /// Schedule the specified string for eventual output pub fn write_str(&mut self, s: &str) -> io::Result<()> { // Use the write_chunk corresponding to cfg - self.write_chunk(&OutputChunk::Owned(s.as_bytes().to_vec())) + self.write_chunk(&OutputChunk::Owned { + content: s.as_bytes().to_vec(), + has_newline: false, + }) } // Flush any pending mmap data @@ -319,9 +354,14 @@ impl OutputBuffer { /// Schedule the specified output chunk for 
eventual output pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { match chunk { - OutputChunk::Owned(buf) => { - self.out.write_all(buf)?; - self.out.write_all(b"\n")?; + OutputChunk::Owned { + content, + has_newline, + } => { + self.out.write_all(content)?; + if *has_newline { + self.out.write_all(b"\n")?; + } Ok(()) } } @@ -330,7 +370,10 @@ impl OutputBuffer { /// Schedule the specified string for eventual output pub fn write_str(&mut self, s: &str) -> io::Result<()> { // Use the write_chunk corresponding to cfg - self.write_chunk(&OutputChunk::Owned(s.as_bytes().to_vec())) + self.write_chunk(&OutputChunk::Owned { + content: s.as_bytes().to_vec(), + has_newline: false, + }) } /// Flush everything: pending mmap and buffered data. @@ -367,7 +410,7 @@ mod tests { use std::io::{self, Write}; use tempfile::NamedTempFile; - /// Helper: produce a 4 096-byte Vec of `'.'`s ending in `'\n'`. + /// Helper: produce a 4k-byte Vec of `'.'`s ending in `'\n'`. #[cfg(unix)] fn make_dot_line_4k() -> Vec { let mut buf = Vec::with_capacity(4096); @@ -382,8 +425,8 @@ mod tests { { let file = tmp.reopen()?; let mut out = OutputBuffer::new(file); - out.write_str("foo")?; - out.write_str("bar")?; + out.write_str("foo\n")?; + out.write_str("bar\n")?; out.flush()?; assert_eq!(out.writes_issued, 0); } // File closes here as it leaves the scope @@ -460,7 +503,7 @@ mod tests { } // Write an owned line ("middle\n") - out.write_str("middle")?; + out.write_str("middle\n")?; // Read the second mmap line ("one\n") and write it if let Some(chunk) = reader.get_line()? 
{ @@ -598,6 +641,41 @@ mod tests { Ok(()) } + #[test] + fn test_small_file_unterminated_stream() -> io::Result<()> { + // Create and fill the input temp file: + let mut input = NamedTempFile::new()?; + write!(input, "first line\nsecond line\nlast line (unterminated)")?; + input.flush()?; + let input_path = input.path().to_path_buf(); + + // Open reader on input file: + let mut reader = LineReader::open_stream(&input_path)?; + + // Create the output temp file (empty): + let output = NamedTempFile::new()?; + let output_path = output.path().to_path_buf(); + let output_file = File::create(&output_path)?; + + // Wrap it in your OutputBuffer and run the loop: + let mut out = OutputBuffer::new(output_file); + let mut nline = 0; + while let Some(chunk) = reader.get_line()? { + out.write_chunk(&chunk)?; + nline += 1; + } + assert_eq!(nline, 3); + + out.flush()?; + assert_eq!(out.writes_issued, 0); + + // Verify that files match: + let expected = fs::read(&input_path)?; + let actual = fs::read(&output_path)?; + assert_eq!(actual, expected); + Ok(()) + } + #[test] #[cfg(unix)] fn test_large_file_zero_copy_with_flush() -> io::Result<()> { From 8d48beecb47e0feb35aa4aefa70c94c7146add1d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 4 May 2025 21:49:36 +0300 Subject: [PATCH 09/85] Remove unneeded generic parameter --- src/uu/sed/src/fast_io.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 9903fae3..5f2d0f7f 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -69,18 +69,21 @@ impl<'a> MmapLineCursor<'a> { } /// Buffered line reader from any BufRead input. -pub struct ReadLineCursor { - reader: R, +pub struct ReadLineCursor { + reader: Box, buffer: String, } -impl ReadLineCursor { - pub fn new(reader: R) -> Self { +impl ReadLineCursor { + /// Construct from anything that implements `Read`. 
+ pub fn new(r: R) -> Self { + let buf = BufReader::new(r); Self { - reader, + reader: Box::new(buf), buffer: String::new(), } } + /// Return the next line and its \n termination, if available, or None. pub fn get_line(&mut self) -> io::Result, bool)>> { self.buffer.clear(); @@ -138,7 +141,7 @@ pub enum LineReader { mapped_file: Mmap, // A handle that can derive the mapped file slice cursor: MmapLineCursor<'static>, }, - ReadInput(ReadLineCursor>>), + ReadInput(ReadLineCursor), } /// Return a LineReader that uses the ReadInput method fot the specified file. From 9f4fb9e8ef5a4397bfe70fe78b2eef195e801996 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 11:25:30 +0300 Subject: [PATCH 10/85] Remove unneeded generic argument --- src/uu/sed/src/fast_io.rs | 51 ++++++++++++++------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 5f2d0f7f..12996f96 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -225,20 +225,14 @@ impl LineReader { /// Outputs from mmapped data are coallesced and written via a write(2) /// system call without any copying if worthwhile. /// All other output is buffered and writen via BufWriter. -/// The generic argument W is used for obtaining the output when -/// testing. 
-pub struct OutputBuffer { - out: BufWriter, +pub struct OutputBuffer { + out: BufWriter>, #[cfg(unix)] mmap_ptr: Option<(*const u8, usize)>, #[cfg(test)] writes_issued: usize, // Number of issued write(2) calls } -/// Type to use for writing -// Example: DynOutputBuffer::new(Box::new(io::stdout().lock()) -pub type DynOutputBuffer = OutputBuffer>; - /// Wrapper that issues the write(2) system call #[cfg(unix)] fn write_syscall(fd: i32, ptr: *const u8, len: usize) -> io::Result<()> { @@ -266,8 +260,8 @@ const MIN_DIRECT_WRITE: usize = 4 * 1024; #[cfg(unix)] const MAX_PENDING_WRITE: usize = 64 * 1024; -impl OutputBuffer { - pub fn new(w: W) -> Self { +impl OutputBuffer { + pub fn new(w: Box) -> Self { Self { out: BufWriter::new(w), #[cfg(unix)] @@ -276,10 +270,19 @@ impl OutputBuffer { writes_issued: 0, } } + + /// Schedule the specified string for eventual output + pub fn write_str(&mut self, s: &str) -> io::Result<()> { + // Use the write_chunk corresponding to cfg + self.write_chunk(&OutputChunk::Owned { + content: s.as_bytes().to_vec(), + has_newline: false, + }) + } } #[cfg(unix)] -impl OutputBuffer { +impl OutputBuffer { /// Schedule the specified output chunk for eventual output pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { match chunk { @@ -314,15 +317,6 @@ impl OutputBuffer { } } - /// Schedule the specified string for eventual output - pub fn write_str(&mut self, s: &str) -> io::Result<()> { - // Use the write_chunk corresponding to cfg - self.write_chunk(&OutputChunk::Owned { - content: s.as_bytes().to_vec(), - has_newline: false, - }) - } - // Flush any pending mmap data #[cfg(unix)] fn flush_mmap(&mut self) -> io::Result<()> { @@ -353,7 +347,7 @@ impl OutputBuffer { } #[cfg(not(unix))] -impl OutputBuffer { +impl OutputBuffer { /// Schedule the specified output chunk for eventual output pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { match chunk { @@ -370,15 +364,6 @@ impl OutputBuffer { } } - /// Schedule 
the specified string for eventual output - pub fn write_str(&mut self, s: &str) -> io::Result<()> { - // Use the write_chunk corresponding to cfg - self.write_chunk(&OutputChunk::Owned { - content: s.as_bytes().to_vec(), - has_newline: false, - }) - } - /// Flush everything: pending mmap and buffered data. pub fn flush(&mut self) -> io::Result<()> { self.out.flush() // then flush buffered data @@ -427,7 +412,7 @@ mod tests { let tmp = NamedTempFile::new()?; { let file = tmp.reopen()?; - let mut out = OutputBuffer::new(file); + let mut out = OutputBuffer::new(Box::new(file)); out.write_str("foo\n")?; out.write_str("bar\n")?; out.flush()?; @@ -626,7 +611,7 @@ mod tests { let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(output_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -661,7 +646,7 @@ mod tests { let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(output_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? 
{ out.write_chunk(&chunk)?; From e480d51a22ec0c39a341352986541a209a4b52de Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 13:42:45 +0300 Subject: [PATCH 11/85] Implement skeleton processor --- src/uu/sed/src/command.rs | 10 ++++++ src/uu/sed/src/fast_io.rs | 16 +++++----- src/uu/sed/src/multi_io.rs | 49 ++++++++++++++++++++++++++++++ src/uu/sed/src/processor.rs | 18 +++++++---- src/uu/sed/src/sed.rs | 3 +- tests/by-util/test_sed.rs | 40 ++++++++++++++++++++++-- tests/fixtures/sed/dots-4k.txt | 1 + tests/fixtures/sed/dots-64k.txt | 16 ++++++++++ tests/fixtures/sed/dots-8k.txt | 2 ++ tests/fixtures/sed/no-new-line.txt | 1 + tests/fixtures/sed/two-lines.txt | 2 ++ 11 files changed, 141 insertions(+), 17 deletions(-) create mode 100644 src/uu/sed/src/multi_io.rs create mode 100644 tests/fixtures/sed/dots-4k.txt create mode 100644 tests/fixtures/sed/dots-64k.txt create mode 100644 tests/fixtures/sed/dots-8k.txt create mode 100644 tests/fixtures/sed/no-new-line.txt create mode 100644 tests/fixtures/sed/two-lines.txt diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 53761d85..c7ec08ff 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -11,6 +11,8 @@ // TODO: remove when compile is implemented #![allow(dead_code)] +use crate::fast_io::LineReader; +use crate::fast_io::OutputBuffer; use regex::Regex; use std::collections::HashMap; use std::fs::File; @@ -165,3 +167,11 @@ pub struct Space { pub append_newline: bool, // Whether originally terminated by \n pub backup: String, // Backing memory } + +/// Context for processing multiple files and in-place replacements +pub struct ProcessingContext { + pub reader: LineReader, + pub output: OutputBuffer, + pub input_files: Vec, + pub cli_options: CliOptions, +} diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 12996f96..e757004f 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -30,18 +30,20 @@ use 
uucore::libc::{c_void, write}; // - ReadLineCursorbased on BufReader. /// Cursor for zero-copy iteration over mmap’d file. -pub struct MmapLineCursor<'a> { +#[cfg(unix)] +struct MmapLineCursor<'a> { data: &'a [u8], pos: usize, } +#[cfg(unix)] impl<'a> MmapLineCursor<'a> { - pub fn new(data: &'a [u8]) -> Self { + fn new(data: &'a [u8]) -> Self { Self { data, pos: 0 } } /// Return the next line, if available, or None. - pub fn get_line(&mut self) -> io::Result> { + fn get_line(&mut self) -> io::Result> { if self.pos >= self.data.len() { return Ok(None); } @@ -76,7 +78,7 @@ pub struct ReadLineCursor { impl ReadLineCursor { /// Construct from anything that implements `Read`. - pub fn new(r: R) -> Self { + fn new(r: R) -> Self { let buf = BufReader::new(r); Self { reader: Box::new(buf), @@ -85,7 +87,7 @@ impl ReadLineCursor { } /// Return the next line and its \n termination, if available, or None. - pub fn get_line(&mut self) -> io::Result, bool)>> { + fn get_line(&mut self) -> io::Result, bool)>> { self.buffer.clear(); // read_line *includes* the '\n' if present let bytes_read = self.reader.read_line(&mut self.buffer)?; @@ -120,7 +122,7 @@ pub enum OutputChunk<'a> { } #[cfg(unix)] -type OutputChunkRef<'a> = OutputChunk<'a>; +pub type OutputChunkRef<'a> = OutputChunk<'a>; // The same as above for non-Unix platforms, which lack mmap(2) #[cfg(not(unix))] @@ -132,7 +134,7 @@ pub enum OutputChunk { } #[cfg(not(unix))] -type OutputChunkRef = OutputChunk; +pub type OutputChunkRef = OutputChunk; /// Unified reader that uses mmap when possible, falls back to buffered reading. pub enum LineReader { diff --git a/src/uu/sed/src/multi_io.rs b/src/uu/sed/src/multi_io.rs new file mode 100644 index 00000000..086fb32e --- /dev/null +++ b/src/uu/sed/src/multi_io.rs @@ -0,0 +1,49 @@ +// Line-based I/O from multiple input files to multiple output files +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. 
+// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use crate::command::{CliOptions, ProcessingContext}; +use crate::fast_io::{LineReader, OutputBuffer, OutputChunk, OutputChunkRef}; +use std::io::{self, stdout}; +use std::path::PathBuf; +use uucore::error::UResult; + +impl ProcessingContext { + /// Create a new `ProcessingContext` taking ownership of cli_options + pub fn new(input_files: Vec, cli_options: CliOptions) -> UResult { + let first = input_files.first().expect("input_files must be non-empty"); + // Open the reader on the first path. + let reader = LineReader::open(first)?; + + // TODO: Handle in-place editing of first file + let output = OutputBuffer::new(Box::new(stdout())); + + Ok(ProcessingContext { + reader, + output, + input_files, + cli_options, + }) + } + + /// Return the next line, if available, or None. + pub fn get_line(&mut self) -> io::Result> { + // TODO: Handle iterating over all files + self.reader.get_line() + } + + /// Schedule the specified output chunk for eventual output + pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { + self.output.write_chunk(chunk) + } + + pub fn flush(&mut self) -> io::Result<()> { + self.output.flush() + } +} diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 4d5b3c15..96b267d5 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -8,16 +8,22 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-use crate::command::CliOptions; -use crate::command::Command; +use crate::command::{CliOptions, Command, ProcessingContext}; use std::path::PathBuf; use uucore::error::UResult; pub fn process( - _code: Option>, - _files: Vec, - _cli_options: &mut CliOptions, + _commands: Option>, + files: Vec, + cli_options: CliOptions, ) -> UResult<()> { - // TODO + let mut context = ProcessingContext::new(files, cli_options)?; + + while let Some(chunk) = context.get_line()? { + // TODO: process commands + context.write_chunk(&chunk)?; + } + context.flush()?; + Ok(()) } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index b4b061fe..76ef3c61 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -12,6 +12,7 @@ pub mod command; pub mod compiler; pub mod delimited_parser; pub mod fast_io; +pub mod multi_io; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; @@ -34,7 +35,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let mut cli_options = build_context(&matches); let executable = compile(scripts, &mut cli_options)?; - process(executable, files, &mut cli_options)?; + process(executable, files, cli_options)?; Ok(()) } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index f98b07d7..3b3ecd43 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -22,12 +22,12 @@ fn test_invalid_arg() { #[test] fn test_debug() { - new_ucmd!().arg("--debug").arg("").succeeds(); + new_ucmd!().args(&["--debug", ""]).succeeds(); } #[test] fn test_silent_alias() { - new_ucmd!().arg("--silent").arg("").succeeds(); + new_ucmd!().args(&["--silent", ""]).succeeds(); } #[test] @@ -43,9 +43,14 @@ fn test_positional_script_ok() { new_ucmd!().arg("l").succeeds().code_is(0); } +#[test] +fn test_empty_positional_script_ok() { + new_ucmd!().arg("").succeeds().code_is(0); +} + #[test] fn test_e_script_ok() { - new_ucmd!().arg("-e").arg("l").succeeds(); + new_ucmd!().args(&["-e", "l"]).succeeds(); } #[test] @@ -56,3 
+61,32 @@ fn test_f_script_ok() { new_ucmd!().arg("-f").arg(path).succeeds(); } + +const INPUT_FILES: &[&str] = &[ + "two-lines.txt", + "no-new-line.txt", + "dots-4k.txt", + "dots-8k.txt", + "dots-64k.txt", +]; + +#[test] +fn test_no_script_stdin() { + for fixture in INPUT_FILES { + new_ucmd!() + .arg("") + .pipe_in_fixture(fixture) + .succeeds() + .stdout_is_fixture(fixture); + } +} + +#[test] +fn test_no_script_file() { + for fixture in INPUT_FILES { + new_ucmd!() + .args(&["-e", "", fixture]) + .succeeds() + .stdout_is_fixture(fixture); + } +} diff --git a/tests/fixtures/sed/dots-4k.txt b/tests/fixtures/sed/dots-4k.txt new file mode 100644 index 00000000..50a372b2 --- /dev/null +++ b/tests/fixtures/sed/dots-4k.txt @@ -0,0 +1 @@ +..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... diff --git a/tests/fixtures/sed/dots-64k.txt b/tests/fixtures/sed/dots-64k.txt new file mode 100644 index 00000000..9c9fd559 --- /dev/null +++ b/tests/fixtures/sed/dots-64k.txt @@ -0,0 +1,16 @@ 
+...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................. +.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................... +...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
..................................................................................................................................................................................................................................................................................................... +.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
....................................................................................................................................................................................................................................................................................................................................................................................................... +.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... +.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... +...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. +.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... +...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. +.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... +...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................. +.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................... +...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
..................................................................................................................................................................................................................................................................................................... +.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
....................................................................................................................................................................................................................................................................................................................................................................................................... +.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... diff --git a/tests/fixtures/sed/dots-8k.txt b/tests/fixtures/sed/dots-8k.txt new file mode 100644 index 00000000..cdfcce14 --- /dev/null +++ b/tests/fixtures/sed/dots-8k.txt @@ -0,0 +1,2 @@ +.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... +.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... diff --git a/tests/fixtures/sed/no-new-line.txt b/tests/fixtures/sed/no-new-line.txt new file mode 100644 index 00000000..5ab2f8a4 --- /dev/null +++ b/tests/fixtures/sed/no-new-line.txt @@ -0,0 +1 @@ +Hello \ No newline at end of file diff --git a/tests/fixtures/sed/two-lines.txt b/tests/fixtures/sed/two-lines.txt new file mode 100644 index 00000000..e5c5c558 --- /dev/null +++ b/tests/fixtures/sed/two-lines.txt @@ -0,0 +1,2 @@ +line one +line two From 6ad09857b196a7dffbd08e7fbacd6b56f1559767 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 13:44:44 +0300 Subject: [PATCH 12/85] Improve name for processing options --- src/uu/sed/src/command.rs | 4 ++-- src/uu/sed/src/compiler.rs | 28 ++++++++++++++-------------- src/uu/sed/src/multi_io.rs | 8 ++++---- src/uu/sed/src/processor.rs | 6 +++--- src/uu/sed/src/sed.rs | 14 +++++++------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index c7ec08ff..5934f8ff 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -21,7 +21,7 @@ use std::path::PathBuf; // 
For file descriptors and equivalent // Compilation and processing options provided mostly through the // command-line interface #[derive(Debug, Default)] -pub struct CliOptions { +pub struct ProcessingOptions { // Command-line flags with corresponding names pub all_output_files: bool, pub debug: bool, @@ -173,5 +173,5 @@ pub struct ProcessingContext { pub reader: LineReader, pub output: OutputBuffer, pub input_files: Vec, - pub cli_options: CliOptions, + pub processing_options: ProcessingOptions, } diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 341a8d17..0042ccc6 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -8,7 +8,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{Address, AddressType, AddressValue, CliOptions, Command, ScriptValue}; +use crate::command::{Address, AddressType, AddressValue, Command, ProcessingOptions, ScriptValue}; use crate::delimited_parser::{compilation_error, parse_regex}; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; @@ -208,11 +208,11 @@ enum ContinueAction { pub fn compile( scripts: Vec, - cli_options: &mut CliOptions, + processing_options: &mut ProcessingOptions, ) -> UResult>> { let mut make_providers = ScriptLineProvider::new(scripts); - let result = compile_thread(&mut make_providers, cli_options)?; + let result = compile_thread(&mut make_providers, processing_options)?; // TODO: fix-up labels, check used labels, setup append & match structures Ok(result) } @@ -220,7 +220,7 @@ pub fn compile( // Compile provided scripts into a thread of commands fn compile_thread( lines: &mut ScriptLineProvider, - _cli_options: &mut CliOptions, + _processing_options: &mut ProcessingOptions, ) -> UResult>> { let mut head: Option> = None; // A mutable reference to the place we’ll insert next @@ -235,7 +235,7 @@ fn compile_thread( 
Some(line_string) => { let mut line = ScriptCharProvider::new(&line_string); - // TODO: set cli_options.quiet for StringVal starting with #n + // TODO: set processing_options.quiet for StringVal starting with #n 'next_char: loop { line.eat_spaces(); if line.eol() || line.current() == '#' { @@ -1017,14 +1017,14 @@ mod tests { ScriptLineProvider::new(input) } - fn make_cli_options() -> CliOptions { - CliOptions::default() + fn make_processing_options() -> ProcessingOptions { + ProcessingOptions::default() } #[test] fn test_compile_thread_empty_input() { let mut provider = make_provider(&[]); - let mut opts = make_cli_options(); + let mut opts = make_processing_options(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1033,7 +1033,7 @@ mod tests { #[test] fn test_compile_thread_comment_only() { let mut provider = make_provider(&["# comment", " ", ";;"]); - let mut opts = make_cli_options(); + let mut opts = make_processing_options(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1042,7 +1042,7 @@ mod tests { #[test] fn test_compile_thread_single_command() { let mut provider = make_provider(&["42q"]); - let mut opts = make_cli_options(); + let mut opts = make_processing_options(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let cmd = result.unwrap(); @@ -1065,7 +1065,7 @@ mod tests { #[test] fn test_compile_thread_non_selected_single_command() { let mut provider = make_provider(&["42!p"]); - let mut opts = make_cli_options(); + let mut opts = make_processing_options(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let cmd = result.unwrap(); @@ -1088,7 +1088,7 @@ mod tests { #[test] fn test_compile_thread_multiple_lines() { let mut provider = make_provider(&["1q", "2d"]); - let mut opts = make_cli_options(); + let mut opts = make_processing_options(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let first = 
result.unwrap(); @@ -1102,7 +1102,7 @@ mod tests { #[test] fn test_compile_thread_single_line_multiple_commands() { let mut provider = make_provider(&["1q;2d"]); - let mut opts = make_cli_options(); + let mut opts = make_processing_options(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let first = result.unwrap(); @@ -1117,7 +1117,7 @@ mod tests { #[test] fn test_compile_single_command() { let scripts = vec![ScriptValue::StringVal("1q".to_string())]; - let mut opts = CliOptions::default(); + let mut opts = ProcessingOptions::default(); let result = compile(scripts, &mut opts).unwrap(); let cmd = result.unwrap(); diff --git a/src/uu/sed/src/multi_io.rs b/src/uu/sed/src/multi_io.rs index 086fb32e..7e1a6ddb 100644 --- a/src/uu/sed/src/multi_io.rs +++ b/src/uu/sed/src/multi_io.rs @@ -8,15 +8,15 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{CliOptions, ProcessingContext}; +use crate::command::{ProcessingContext, ProcessingOptions}; use crate::fast_io::{LineReader, OutputBuffer, OutputChunk, OutputChunkRef}; use std::io::{self, stdout}; use std::path::PathBuf; use uucore::error::UResult; impl ProcessingContext { - /// Create a new `ProcessingContext` taking ownership of cli_options - pub fn new(input_files: Vec, cli_options: CliOptions) -> UResult { + /// Create a new `ProcessingContext` taking ownership of processing_options + pub fn new(input_files: Vec, processing_options: ProcessingOptions) -> UResult { let first = input_files.first().expect("input_files must be non-empty"); // Open the reader on the first path. 
let reader = LineReader::open(first)?; @@ -28,7 +28,7 @@ impl ProcessingContext { reader, output, input_files, - cli_options, + processing_options, }) } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 96b267d5..75953404 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -8,16 +8,16 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{CliOptions, Command, ProcessingContext}; +use crate::command::{Command, ProcessingContext, ProcessingOptions}; use std::path::PathBuf; use uucore::error::UResult; pub fn process( _commands: Option>, files: Vec, - cli_options: CliOptions, + processing_options: ProcessingOptions, ) -> UResult<()> { - let mut context = ProcessingContext::new(files, cli_options)?; + let mut context = ProcessingContext::new(files, processing_options)?; while let Some(chunk) = context.get_line()? { // TODO: process commands diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 76ef3c61..c91136f3 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -17,7 +17,7 @@ pub mod processor; pub mod script_char_provider; pub mod script_line_provider; -use crate::command::{CliOptions, ScriptValue}; +use crate::command::{ProcessingOptions, ScriptValue}; use crate::compiler::compile; use crate::processor::process; use clap::{arg, Arg, ArgMatches, Command}; @@ -32,10 +32,10 @@ const USAGE: &str = "sed [OPTION]... 
[script] [file]..."; pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; let (scripts, files) = get_scripts_files(&matches)?; - let mut cli_options = build_context(&matches); + let mut processing_options = build_context(&matches); - let executable = compile(scripts, &mut cli_options)?; - process(executable, files, cli_options)?; + let executable = compile(scripts, &mut processing_options)?; + process(executable, files, processing_options)?; Ok(()) } @@ -171,9 +171,9 @@ fn get_scripts_files(matches: &ArgMatches) -> UResult<(Vec, Vec CliOptions { - CliOptions { +// Parse CLI flag arguments and return a ProcessingOptions struct based on them +fn build_context(matches: &ArgMatches) -> ProcessingOptions { + ProcessingOptions { all_output_files: matches.get_flag("all-output-files"), debug: matches.get_flag("debug"), regexp_extended: matches.get_flag("regexp-extended"), From 5545e82db4e68f1b0c6ff3e68f42f04f5e67325a Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 16:32:10 +0300 Subject: [PATCH 13/85] Use portable as_raw_fd This was required to fix the Unix implementation. 
--- src/uu/sed/src/fast_io.rs | 41 ++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index e757004f..026b27a1 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -19,8 +19,7 @@ use memmap2::Mmap; use std::borrow::Cow; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; -#[cfg(unix)] -use std::os::unix::io::AsRawFd; +use std::os::fd::AsRawFd; use std::path::PathBuf; #[cfg(unix)] use uucore::libc::{c_void, write}; @@ -223,12 +222,15 @@ impl LineReader { } } +trait WriteAndFd: Write + AsRawFd {} +impl WriteAndFd for T {} + /// Abstraction for outputting data, potentially from the mmapped file /// Outputs from mmapped data are coallesced and written via a write(2) /// system call without any copying if worthwhile. /// All other output is buffered and writen via BufWriter. pub struct OutputBuffer { - out: BufWriter>, + out: BufWriter>, #[cfg(unix)] mmap_ptr: Option<(*const u8, usize)>, #[cfg(test)] @@ -263,9 +265,13 @@ const MIN_DIRECT_WRITE: usize = 4 * 1024; const MAX_PENDING_WRITE: usize = 64 * 1024; impl OutputBuffer { - pub fn new(w: Box) -> Self { + /// Construct a new OutputBuffer given a file path. 
+ pub fn new(w: W) -> Self + where + W: Write + AsRawFd + Send + 'static, + { Self { - out: BufWriter::new(w), + out: BufWriter::new(Box::new(w)), #[cfg(unix)] mmap_ptr: None, #[cfg(test)] @@ -320,7 +326,6 @@ impl OutputBuffer { } // Flush any pending mmap data - #[cfg(unix)] fn flush_mmap(&mut self) -> io::Result<()> { if let Some((ptr, len)) = self.mmap_ptr.take() { if len < MIN_DIRECT_WRITE { @@ -449,7 +454,7 @@ mod tests { let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); let out_file = std::fs::File::create(&output_path)?; - let mut out = OutputBuffer::new(out_file); + let mut out = OutputBuffer::new(Box::new(Box::new(out_file))); // Drain reader → writer while let Some(chunk) = reader.get_line()? { @@ -485,7 +490,7 @@ mod tests { let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); let out_file = File::create(&output_path)?; - let mut out = OutputBuffer::new(out_file); + let mut out = OutputBuffer::new(Box::new(out_file)); // Read the first mmap line ("zero\n") and write it if let Some(chunk) = reader.get_line()? { @@ -536,10 +541,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? 
{ out.write_chunk(&chunk)?; @@ -575,10 +580,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -610,10 +615,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(output_file)); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -645,10 +650,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(output_file)); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -686,10 +691,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? 
{ out.write_chunk(&chunk)?; From 7d177277ea856563d7f63bfb25736d12310c18f6 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 16:49:26 +0300 Subject: [PATCH 14/85] Revert "Use portable as_raw_fd" This reverts commit 5545e82db4e68f1b0c6ff3e68f42f04f5e67325a. Apparently there is no such thing. It doesn't work on Windows even with on the nightly build. --- src/uu/sed/src/fast_io.rs | 41 +++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 026b27a1..e757004f 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -19,7 +19,8 @@ use memmap2::Mmap; use std::borrow::Cow; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; -use std::os::fd::AsRawFd; +#[cfg(unix)] +use std::os::unix::io::AsRawFd; use std::path::PathBuf; #[cfg(unix)] use uucore::libc::{c_void, write}; @@ -222,15 +223,12 @@ impl LineReader { } } -trait WriteAndFd: Write + AsRawFd {} -impl WriteAndFd for T {} - /// Abstraction for outputting data, potentially from the mmapped file /// Outputs from mmapped data are coallesced and written via a write(2) /// system call without any copying if worthwhile. /// All other output is buffered and writen via BufWriter. pub struct OutputBuffer { - out: BufWriter>, + out: BufWriter>, #[cfg(unix)] mmap_ptr: Option<(*const u8, usize)>, #[cfg(test)] @@ -265,13 +263,9 @@ const MIN_DIRECT_WRITE: usize = 4 * 1024; const MAX_PENDING_WRITE: usize = 64 * 1024; impl OutputBuffer { - /// Construct a new OutputBuffer given a file path. 
- pub fn new(w: W) -> Self - where - W: Write + AsRawFd + Send + 'static, - { + pub fn new(w: Box) -> Self { Self { - out: BufWriter::new(Box::new(w)), + out: BufWriter::new(w), #[cfg(unix)] mmap_ptr: None, #[cfg(test)] @@ -326,6 +320,7 @@ impl OutputBuffer { } // Flush any pending mmap data + #[cfg(unix)] fn flush_mmap(&mut self) -> io::Result<()> { if let Some((ptr, len)) = self.mmap_ptr.take() { if len < MIN_DIRECT_WRITE { @@ -454,7 +449,7 @@ mod tests { let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); let out_file = std::fs::File::create(&output_path)?; - let mut out = OutputBuffer::new(Box::new(Box::new(out_file))); + let mut out = OutputBuffer::new(out_file); // Drain reader → writer while let Some(chunk) = reader.get_line()? { @@ -490,7 +485,7 @@ mod tests { let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); let out_file = File::create(&output_path)?; - let mut out = OutputBuffer::new(Box::new(out_file)); + let mut out = OutputBuffer::new(out_file); // Read the first mmap line ("zero\n") and write it if let Some(chunk) = reader.get_line()? { @@ -541,10 +536,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let out_file = File::create(&output_path)?; + let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(out_file)); + let mut out = OutputBuffer::new(output_file); let mut nline = 0; while let Some(chunk) = reader.get_line()? 
{ out.write_chunk(&chunk)?; @@ -580,10 +575,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let out_file = File::create(&output_path)?; + let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(out_file)); + let mut out = OutputBuffer::new(output_file); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -615,10 +610,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let out_file = File::create(&output_path)?; + let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(out_file)); + let mut out = OutputBuffer::new(Box::new(output_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -650,10 +645,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let out_file = File::create(&output_path)?; + let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(out_file)); + let mut out = OutputBuffer::new(Box::new(output_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -691,10 +686,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let out_file = File::create(&output_path)?; + let output_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(out_file)); + let mut out = OutputBuffer::new(output_file); let mut nline = 0; while let Some(chunk) = reader.get_line()? 
{ out.write_chunk(&chunk)?; From 48cd61e3365bbb5c4b1fc1938ec0d391ba7dc359 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 17:00:35 +0300 Subject: [PATCH 15/85] Fix Unix AsRawFd handling --- src/uu/sed/src/fast_io.rs | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index e757004f..2219671c 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -223,12 +223,22 @@ impl LineReader { } } +#[cfg(unix)] +pub trait OutputWrite: Write + AsRawFd {} +#[cfg(unix)] +impl OutputWrite for T {} + +#[cfg(not(unix))] +pub trait OutputWrite: Write {} +#[cfg(not(unix))] +impl OutputWrite for T {} + /// Abstraction for outputting data, potentially from the mmapped file /// Outputs from mmapped data are coallesced and written via a write(2) /// system call without any copying if worthwhile. /// All other output is buffered and writen via BufWriter. pub struct OutputBuffer { - out: BufWriter>, + out: BufWriter>, #[cfg(unix)] mmap_ptr: Option<(*const u8, usize)>, #[cfg(test)] @@ -263,7 +273,7 @@ const MIN_DIRECT_WRITE: usize = 4 * 1024; const MAX_PENDING_WRITE: usize = 64 * 1024; impl OutputBuffer { - pub fn new(w: Box) -> Self { + pub fn new(w: Box) -> Self { Self { out: BufWriter::new(w), #[cfg(unix)] @@ -449,7 +459,7 @@ mod tests { let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); let out_file = std::fs::File::create(&output_path)?; - let mut out = OutputBuffer::new(out_file); + let mut out = OutputBuffer::new(Box::new(Box::new(out_file))); // Drain reader → writer while let Some(chunk) = reader.get_line()? 
{ @@ -485,7 +495,7 @@ mod tests { let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); let out_file = File::create(&output_path)?; - let mut out = OutputBuffer::new(out_file); + let mut out = OutputBuffer::new(Box::new(out_file)); // Read the first mmap line ("zero\n") and write it if let Some(chunk) = reader.get_line()? { @@ -536,10 +546,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -575,10 +585,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -610,10 +620,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(output_file)); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? 
{ out.write_chunk(&chunk)?; @@ -645,10 +655,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(Box::new(output_file)); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; @@ -686,10 +696,10 @@ mod tests { // Create the output temp file (empty): let output = NamedTempFile::new()?; let output_path = output.path().to_path_buf(); - let output_file = File::create(&output_path)?; + let out_file = File::create(&output_path)?; // Wrap it in your OutputBuffer and run the loop: - let mut out = OutputBuffer::new(output_file); + let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; while let Some(chunk) = reader.get_line()? { out.write_chunk(&chunk)?; From 8da47550078fdb7ac42f1d89495b211a6abd0b31 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 19:26:48 +0300 Subject: [PATCH 16/85] Implement new in-place editing design The preceding one, by bringing together input and output in the same structure, resulted in overlapping mutable borrowing. The current design separates the two and thus avoids this problem. 
--- src/uu/sed/src/command.rs | 10 -------- src/uu/sed/src/fast_io.rs | 9 ++++--- src/uu/sed/src/in_place.rs | 46 ++++++++++++++++++++++++++++++++++ src/uu/sed/src/multi_io.rs | 49 ------------------------------------- src/uu/sed/src/processor.rs | 19 +++++++++----- src/uu/sed/src/sed.rs | 2 +- 6 files changed, 65 insertions(+), 70 deletions(-) create mode 100644 src/uu/sed/src/in_place.rs delete mode 100644 src/uu/sed/src/multi_io.rs diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 5934f8ff..27b5923a 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -11,8 +11,6 @@ // TODO: remove when compile is implemented #![allow(dead_code)] -use crate::fast_io::LineReader; -use crate::fast_io::OutputBuffer; use regex::Regex; use std::collections::HashMap; use std::fs::File; @@ -167,11 +165,3 @@ pub struct Space { pub append_newline: bool, // Whether originally terminated by \n pub backup: String, // Backing memory } - -/// Context for processing multiple files and in-place replacements -pub struct ProcessingContext { - pub reader: LineReader, - pub output: OutputBuffer, - pub input_files: Vec, - pub processing_options: ProcessingOptions, -} diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 2219671c..d022d5ca 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -31,7 +31,7 @@ use uucore::libc::{c_void, write}; /// Cursor for zero-copy iteration over mmap’d file. #[cfg(unix)] -struct MmapLineCursor<'a> { +pub struct MmapLineCursor<'a> { data: &'a [u8], pos: usize, } @@ -223,6 +223,7 @@ impl LineReader { } } +// Define a trait combining two: workaround for Rust's corresponding inability. #[cfg(unix)] pub trait OutputWrite: Write + AsRawFd {} #[cfg(unix)] @@ -238,11 +239,11 @@ impl OutputWrite for T {} /// system call without any copying if worthwhile. /// All other output is buffered and writen via BufWriter. 
pub struct OutputBuffer { - out: BufWriter>, + out: BufWriter>, // Where to write #[cfg(unix)] - mmap_ptr: Option<(*const u8, usize)>, + mmap_ptr: Option<(*const u8, usize)>, // Start and len of chunk to write #[cfg(test)] - writes_issued: usize, // Number of issued write(2) calls + writes_issued: usize, // Number of issued write(2) calls } /// Wrapper that issues the write(2) system call diff --git a/src/uu/sed/src/in_place.rs b/src/uu/sed/src/in_place.rs new file mode 100644 index 00000000..27a690e1 --- /dev/null +++ b/src/uu/sed/src/in_place.rs @@ -0,0 +1,46 @@ +// Support for in-place editing +// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use crate::command::ProcessingOptions; +use crate::fast_io::OutputBuffer; +use std::io::stdout; +use std::path::Path; +use uucore::error::UResult; + +/// Context for in-place editing +pub struct InPlace<'opts> { + pub output: OutputBuffer, + pub processing_options: &'opts ProcessingOptions, +} + +impl<'opts> InPlace<'opts> { + /// Create a new `ProcessingContext` taking ownership of processing_options + pub fn new(processing_options: &'opts ProcessingOptions) -> UResult { + let output = OutputBuffer::new(Box::new(stdout())); + + Ok(InPlace { + output, + processing_options, + }) + } + + /// Return an OutputBuffer for outputting the edits to the specified file. + pub fn begin(&mut self, _file_name: &Path) -> UResult<&mut OutputBuffer> { + // TODO: Adjust output for in-place editing, if needed. + Ok(&mut self.output) + } + + /// Finish in-place editing. + pub fn end(&mut self) -> UResult<()> { + self.output.flush()?; + // TODO: Rename and delete output file, if needed. 
+ Ok(()) + } +} diff --git a/src/uu/sed/src/multi_io.rs b/src/uu/sed/src/multi_io.rs deleted file mode 100644 index 7e1a6ddb..00000000 --- a/src/uu/sed/src/multi_io.rs +++ /dev/null @@ -1,49 +0,0 @@ -// Line-based I/O from multiple input files to multiple output files -// -// SPDX-License-Identifier: MIT -// Copyright (c) 2025 Diomidis Spinellis -// -// This file is part of the uutils sed package. -// It is licensed under the MIT License. -// For the full copyright and license information, please view the LICENSE -// file that was distributed with this source code. - -use crate::command::{ProcessingContext, ProcessingOptions}; -use crate::fast_io::{LineReader, OutputBuffer, OutputChunk, OutputChunkRef}; -use std::io::{self, stdout}; -use std::path::PathBuf; -use uucore::error::UResult; - -impl ProcessingContext { - /// Create a new `ProcessingContext` taking ownership of processing_options - pub fn new(input_files: Vec, processing_options: ProcessingOptions) -> UResult { - let first = input_files.first().expect("input_files must be non-empty"); - // Open the reader on the first path. - let reader = LineReader::open(first)?; - - // TODO: Handle in-place editing of first file - let output = OutputBuffer::new(Box::new(stdout())); - - Ok(ProcessingContext { - reader, - output, - input_files, - processing_options, - }) - } - - /// Return the next line, if available, or None. 
- pub fn get_line(&mut self) -> io::Result> { - // TODO: Handle iterating over all files - self.reader.get_line() - } - - /// Schedule the specified output chunk for eventual output - pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { - self.output.write_chunk(chunk) - } - - pub fn flush(&mut self) -> io::Result<()> { - self.output.flush() - } -} diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 75953404..c1cdd5de 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -8,7 +8,9 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{Command, ProcessingContext, ProcessingOptions}; +use crate::command::{Command, ProcessingOptions}; +use crate::fast_io::LineReader; +use crate::in_place::InPlace; use std::path::PathBuf; use uucore::error::UResult; @@ -17,13 +19,18 @@ pub fn process( files: Vec, processing_options: ProcessingOptions, ) -> UResult<()> { - let mut context = ProcessingContext::new(files, processing_options)?; + let mut in_place = InPlace::new(&processing_options)?; - while let Some(chunk) = context.get_line()? { - // TODO: process commands - context.write_chunk(&chunk)?; + for path in files { + let mut reader = LineReader::open(&path)?; + let output = in_place.begin(&path)?; + + while let Some(chunk) = reader.get_line()? 
{ + output.write_chunk(&chunk)?; + } + + in_place.end()?; } - context.flush()?; Ok(()) } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index c91136f3..32b100fb 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -12,7 +12,7 @@ pub mod command; pub mod compiler; pub mod delimited_parser; pub mod fast_io; -pub mod multi_io; +pub mod in_place; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; From 59727136fe6cdea1fdc6c370e0f1c341d0b8cf7f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 20:11:59 +0300 Subject: [PATCH 17/85] Implement unbuffered output --- Cargo.lock | 22 ++++++++++++++++++++++ Cargo.toml | 2 ++ src/uu/sed/Cargo.toml | 3 ++- src/uu/sed/src/processor.rs | 10 ++++++++-- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 208cf4ea..0e2a9062 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -76,6 +76,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -299,6 +310,15 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -669,6 +689,7 @@ checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" name = "sed" version = "0.0.1" dependencies = [ + "atty", "chrono", "clap", "clap_complete", @@ -872,6 +893,7 @@ checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" name = "uu_sed" version = "0.0.1" dependencies = [ + "atty", "clap", "memmap2", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 7989651a..49b20665 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ feat_common_core = [ ] [workspace.dependencies] +atty = "0.2" uucore = "0.0.30" clap = { version = "4.4", features = ["wrap_help", "cargo"] } clap_complete = "4.5" @@ -52,6 +53,7 @@ chrono = { version = "0.4.37", default-features = false, features = [ [dependencies] +atty = { workspace = true } clap = { workspace = true } clap_complete = { workspace = true } clap_mangen = { workspace = true } diff --git a/src/uu/sed/Cargo.toml b/src/uu/sed/Cargo.toml index 53869c11..b41999ac 100644 --- a/src/uu/sed/Cargo.toml +++ b/src/uu/sed/Cargo.toml @@ -13,12 +13,13 @@ categories = ["command-line-utilities"] [dependencies] +atty = { workspace = true } uucore = { workspace = true } clap = { workspace = true } once_cell = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } -memmap2.workspace = true +memmap2 = { workspace = true } [lib] path = "src/sed.rs" diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index c1cdd5de..677f2ce9 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -11,6 +11,7 @@ use crate::command::{Command, ProcessingOptions}; use crate::fast_io::LineReader; use crate::in_place::InPlace; +use atty::Stream; use std::path::PathBuf; use uucore::error::UResult; @@ -20,13 +21,18 @@ pub fn process( processing_options: ProcessingOptions, ) -> UResult<()> { let mut in_place = InPlace::new(&processing_options)?; + let line_flush = processing_options.unbuffered || atty::is(Stream::Stdout); for path in files { let mut reader = LineReader::open(&path)?; let output = in_place.begin(&path)?; - while let Some(chunk) = reader.get_line()? { - output.write_chunk(&chunk)?; + while let Some(pattern_space) = reader.get_line()? 
{ + // TODO: process commands + output.write_chunk(&pattern_space)?; + if line_flush { + output.flush()?; + } } in_place.end()?; From bf35c99a930be907e34f013f62cf8342875d8831 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 6 May 2025 23:37:29 +0300 Subject: [PATCH 18/85] Move file processing into separate function --- src/uu/sed/src/command.rs | 2 +- src/uu/sed/src/in_place.rs | 8 ++++---- src/uu/sed/src/processor.rs | 38 ++++++++++++++++++++++++------------- src/uu/sed/src/sed.rs | 4 ++-- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 27b5923a..5fcc0b0d 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -18,7 +18,7 @@ use std::path::PathBuf; // For file descriptors and equivalent // Compilation and processing options provided mostly through the // command-line interface -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct ProcessingOptions { // Command-line flags with corresponding names pub all_output_files: bool, diff --git a/src/uu/sed/src/in_place.rs b/src/uu/sed/src/in_place.rs index 27a690e1..4590b175 100644 --- a/src/uu/sed/src/in_place.rs +++ b/src/uu/sed/src/in_place.rs @@ -15,14 +15,14 @@ use std::path::Path; use uucore::error::UResult; /// Context for in-place editing -pub struct InPlace<'opts> { +pub struct InPlace { pub output: OutputBuffer, - pub processing_options: &'opts ProcessingOptions, + pub processing_options: ProcessingOptions, } -impl<'opts> InPlace<'opts> { +impl InPlace { /// Create a new `ProcessingContext` taking ownership of processing_options - pub fn new(processing_options: &'opts ProcessingOptions) -> UResult { + pub fn new(processing_options: ProcessingOptions) -> UResult { let output = OutputBuffer::new(Box::new(stdout())); Ok(InPlace { diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 677f2ce9..12e0bbb4 100644 --- a/src/uu/sed/src/processor.rs +++ 
b/src/uu/sed/src/processor.rs @@ -9,31 +9,43 @@ // file that was distributed with this source code. use crate::command::{Command, ProcessingOptions}; -use crate::fast_io::LineReader; +use crate::fast_io::{LineReader, OutputBuffer}; use crate::in_place::InPlace; use atty::Stream; use std::path::PathBuf; use uucore::error::UResult; -pub fn process( - _commands: Option>, +/// Process a single input file +fn process_file( + _commands: &Option>, + reader: &mut LineReader, + output: &mut OutputBuffer, + processing_options: &mut ProcessingOptions, +) -> UResult<()> { + while let Some(pattern_space) = reader.get_line()? { + // TODO: process commands + output.write_chunk(&pattern_space)?; + if processing_options.unbuffered { + output.flush()?; + } + } + Ok(()) +} + +/// Process all input files +pub fn process_all_files( + commands: Option>, files: Vec, - processing_options: ProcessingOptions, + mut processing_options: ProcessingOptions, ) -> UResult<()> { - let mut in_place = InPlace::new(&processing_options)?; - let line_flush = processing_options.unbuffered || atty::is(Stream::Stdout); + processing_options.unbuffered = processing_options.unbuffered || atty::is(Stream::Stdout); + let mut in_place = InPlace::new(processing_options.clone())?; for path in files { let mut reader = LineReader::open(&path)?; let output = in_place.begin(&path)?; - while let Some(pattern_space) = reader.get_line()? 
{ - // TODO: process commands - output.write_chunk(&pattern_space)?; - if line_flush { - output.flush()?; - } - } + process_file(&commands, &mut reader, output, &mut processing_options)?; in_place.end()?; } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 32b100fb..a3bb7b95 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -19,7 +19,7 @@ pub mod script_line_provider; use crate::command::{ProcessingOptions, ScriptValue}; use crate::compiler::compile; -use crate::processor::process; +use crate::processor::process_all_files; use clap::{arg, Arg, ArgMatches, Command}; use std::path::PathBuf; use uucore::error::{UResult, UUsageError}; @@ -35,7 +35,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let mut processing_options = build_context(&matches); let executable = compile(scripts, &mut processing_options)?; - process(executable, files, processing_options)?; + process_all_files(executable, files, processing_options)?; Ok(()) } From 6d2c4dd07c5d0f7a614e33099f72c5970ac48f45 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 00:01:34 +0300 Subject: [PATCH 19/85] Iterate over all commands --- src/uu/sed/src/processor.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 12e0bbb4..d6e8d0cd 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -17,13 +17,20 @@ use uucore::error::UResult; /// Process a single input file fn process_file( - _commands: &Option>, + commands: &Option>, reader: &mut LineReader, output: &mut OutputBuffer, processing_options: &mut ProcessingOptions, ) -> UResult<()> { while let Some(pattern_space) = reader.get_line()? { - // TODO: process commands + let mut current = commands.as_deref(); + while let Some(command) = current { + // TODO: process command.code. + + // Advance to next command. 
+ current = command.next.as_deref(); + } + output.write_chunk(&pattern_space)?; if processing_options.unbuffered { output.flush()?; From 98107653fa5f31d24fba3fe633f6d2c9ef160876 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 09:49:03 +0300 Subject: [PATCH 20/85] Add is_last_line() method --- src/uu/sed/src/fast_io.rs | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index d022d5ca..40035d35 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -68,6 +68,11 @@ impl<'a> MmapLineCursor<'a> { Ok(Some((content, full_span))) } + + /// Return true if the line read is the last one + fn is_last_line(&mut self) -> io::Result { + Ok(self.pos >= self.data.len()) + } } /// Buffered line reader from any BufRead input. @@ -102,6 +107,11 @@ impl ReadLineCursor { } Ok(Some((Cow::Owned(self.buffer.clone()), has_newline))) } + + /// Return true if the line read is the last one + fn is_last_line(&mut self) -> io::Result { + Ok(self.reader.fill_buf()?.is_empty()) + } } /// Data to be written to a file. It can come from the mmapped @@ -221,6 +231,15 @@ impl LineReader { } } } + + /// Return true if the line read is the last one + pub fn is_last_line(&mut self) -> io::Result { + match self { + #[cfg(unix)] + LineReader::MmapInput { cursor, .. } => cursor.is_last_line(), + LineReader::ReadInput(cursor) => cursor.is_last_line(), + } + } } // Define a trait combining two: workaround for Rust's corresponding inability. 
@@ -718,6 +737,47 @@ mod tests { Ok(()) } + #[test] + fn test_stream_read() -> std::io::Result<()> { + // Create temporary file with known contents + let mut tmp = NamedTempFile::new()?; + write!(tmp, "first line\nsecond line\n")?; + tmp.flush()?; + + let path = tmp.path().to_path_buf(); + let mut reader = LineReader::open_stream(&path)?; + + // Verify the reader's operation + assert!(!reader.is_last_line()?); + if let Some(OutputChunk::Owned { + content, + has_newline, + }) = reader.get_line()? + { + assert_eq!(content, b"first line"); + assert!(has_newline); + } else { + panic!("Expected OutputChunk::Owned"); + } + + assert!(!reader.is_last_line()?); + if let Some(OutputChunk::Owned { + content, + has_newline, + }) = reader.get_line()? + { + assert_eq!(content, b"second line"); + assert!(has_newline); + } else { + panic!("Expected OutputChunk::Owned"); + } + + assert!(reader.is_last_line()?); + assert_eq!(reader.get_line()?, None); + + Ok(()) + } + #[test] #[cfg(unix)] fn test_mmap_read() -> std::io::Result<()> { @@ -730,6 +790,7 @@ mod tests { let mut reader = LineReader::open(&path)?; // Verify the reader's operation + assert!(!reader.is_last_line()?); assert_eq!( reader.get_line()?, Some(OutputChunk::MmapInput { @@ -737,6 +798,7 @@ mod tests { full_span: b"first line\n".as_ref(), }) ); + assert!(!reader.is_last_line()?); assert_eq!( reader.get_line()?, Some(OutputChunk::MmapInput { @@ -744,6 +806,7 @@ mod tests { full_span: b"second line\n".as_ref(), }) ); + assert!(reader.is_last_line()?); assert_eq!(reader.get_line()?, None); Ok(()) From 2082d0ab06172e0edf67f49c1bb4cffd91b0f8f1 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 10:27:01 +0300 Subject: [PATCH 21/85] Support partial equality missing on Windows --- src/uu/sed/src/fast_io.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 40035d35..f4b4cea0 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ 
-135,6 +135,7 @@ pub enum OutputChunk<'a> { pub type OutputChunkRef<'a> = OutputChunk<'a>; // The same as above for non-Unix platforms, which lack mmap(2) +#[derive(Debug, PartialEq, Eq)] #[cfg(not(unix))] pub enum OutputChunk { Owned { From 6cca6f298aa7e2701b331c64a7f7870308ac1192 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 10:35:46 +0300 Subject: [PATCH 22/85] Track input line number --- src/uu/sed/src/command.rs | 5 ++++- src/uu/sed/src/compiler.rs | 28 ++++++++++++++-------------- src/uu/sed/src/in_place.rs | 10 +++++----- src/uu/sed/src/processor.rs | 18 +++++++++++------- src/uu/sed/src/sed.rs | 16 +++++++++------- 5 files changed, 43 insertions(+), 34 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 5fcc0b0d..ab22f42f 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -19,7 +19,7 @@ use std::path::PathBuf; // For file descriptors and equivalent // Compilation and processing options provided mostly through the // command-line interface #[derive(Debug, Default, Clone)] -pub struct ProcessingOptions { +pub struct ProcessingContext { // Command-line flags with corresponding names pub all_output_files: bool, pub debug: bool, @@ -34,6 +34,9 @@ pub struct ProcessingOptions { pub sandbox: bool, pub unbuffered: bool, pub null_data: bool, + // Other context + /// Current input line number + pub line_number: usize, } // The specification of a script: through a string or a file diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 0042ccc6..ece500fd 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -8,7 +8,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-use crate::command::{Address, AddressType, AddressValue, Command, ProcessingOptions, ScriptValue}; +use crate::command::{Address, AddressType, AddressValue, Command, ProcessingContext, ScriptValue}; use crate::delimited_parser::{compilation_error, parse_regex}; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; @@ -208,11 +208,11 @@ enum ContinueAction { pub fn compile( scripts: Vec, - processing_options: &mut ProcessingOptions, + processing_context: &mut ProcessingContext, ) -> UResult>> { let mut make_providers = ScriptLineProvider::new(scripts); - let result = compile_thread(&mut make_providers, processing_options)?; + let result = compile_thread(&mut make_providers, processing_context)?; // TODO: fix-up labels, check used labels, setup append & match structures Ok(result) } @@ -220,7 +220,7 @@ pub fn compile( // Compile provided scripts into a thread of commands fn compile_thread( lines: &mut ScriptLineProvider, - _processing_options: &mut ProcessingOptions, + _processing_context: &mut ProcessingContext, ) -> UResult>> { let mut head: Option> = None; // A mutable reference to the place we’ll insert next @@ -235,7 +235,7 @@ fn compile_thread( Some(line_string) => { let mut line = ScriptCharProvider::new(&line_string); - // TODO: set processing_options.quiet for StringVal starting with #n + // TODO: set processing_context.quiet for StringVal starting with #n 'next_char: loop { line.eat_spaces(); if line.eol() || line.current() == '#' { @@ -1017,14 +1017,14 @@ mod tests { ScriptLineProvider::new(input) } - fn make_processing_options() -> ProcessingOptions { - ProcessingOptions::default() + fn make_processing_context() -> ProcessingContext { + ProcessingContext::default() } #[test] fn test_compile_thread_empty_input() { let mut provider = make_provider(&[]); - let mut opts = make_processing_options(); + let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); 
assert!(result.is_none()); @@ -1033,7 +1033,7 @@ mod tests { #[test] fn test_compile_thread_comment_only() { let mut provider = make_provider(&["# comment", " ", ";;"]); - let mut opts = make_processing_options(); + let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1042,7 +1042,7 @@ mod tests { #[test] fn test_compile_thread_single_command() { let mut provider = make_provider(&["42q"]); - let mut opts = make_processing_options(); + let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let cmd = result.unwrap(); @@ -1065,7 +1065,7 @@ mod tests { #[test] fn test_compile_thread_non_selected_single_command() { let mut provider = make_provider(&["42!p"]); - let mut opts = make_processing_options(); + let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let cmd = result.unwrap(); @@ -1088,7 +1088,7 @@ mod tests { #[test] fn test_compile_thread_multiple_lines() { let mut provider = make_provider(&["1q", "2d"]); - let mut opts = make_processing_options(); + let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let first = result.unwrap(); @@ -1102,7 +1102,7 @@ mod tests { #[test] fn test_compile_thread_single_line_multiple_commands() { let mut provider = make_provider(&["1q;2d"]); - let mut opts = make_processing_options(); + let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let first = result.unwrap(); @@ -1117,7 +1117,7 @@ mod tests { #[test] fn test_compile_single_command() { let scripts = vec![ScriptValue::StringVal("1q".to_string())]; - let mut opts = ProcessingOptions::default(); + let mut opts = ProcessingContext::default(); let result = compile(scripts, &mut opts).unwrap(); let cmd = result.unwrap(); diff --git a/src/uu/sed/src/in_place.rs b/src/uu/sed/src/in_place.rs 
index 4590b175..eb4d9403 100644 --- a/src/uu/sed/src/in_place.rs +++ b/src/uu/sed/src/in_place.rs @@ -8,7 +8,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::ProcessingOptions; +use crate::command::ProcessingContext; use crate::fast_io::OutputBuffer; use std::io::stdout; use std::path::Path; @@ -17,17 +17,17 @@ use uucore::error::UResult; /// Context for in-place editing pub struct InPlace { pub output: OutputBuffer, - pub processing_options: ProcessingOptions, + pub processing_context: ProcessingContext, } impl InPlace { - /// Create a new `ProcessingContext` taking ownership of processing_options - pub fn new(processing_options: ProcessingOptions) -> UResult { + /// Create a new `ProcessingContext` taking ownership of processing_context + pub fn new(processing_context: ProcessingContext) -> UResult { let output = OutputBuffer::new(Box::new(stdout())); Ok(InPlace { output, - processing_options, + processing_context, }) } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index d6e8d0cd..79fddfbb 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -8,7 +8,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use crate::command::{Command, ProcessingOptions}; +use crate::command::{Command, ProcessingContext}; use crate::fast_io::{LineReader, OutputBuffer}; use crate::in_place::InPlace; use atty::Stream; @@ -20,9 +20,10 @@ fn process_file( commands: &Option>, reader: &mut LineReader, output: &mut OutputBuffer, - processing_options: &mut ProcessingOptions, + processing_context: &mut ProcessingContext, ) -> UResult<()> { while let Some(pattern_space) = reader.get_line()? { + processing_context.line_number += 1; let mut current = commands.as_deref(); while let Some(command) = current { // TODO: process command.code. 
@@ -32,7 +33,7 @@ fn process_file( } output.write_chunk(&pattern_space)?; - if processing_options.unbuffered { + if processing_context.unbuffered { output.flush()?; } } @@ -43,16 +44,19 @@ fn process_file( pub fn process_all_files( commands: Option>, files: Vec, - mut processing_options: ProcessingOptions, + mut processing_context: ProcessingContext, ) -> UResult<()> { - processing_options.unbuffered = processing_options.unbuffered || atty::is(Stream::Stdout); - let mut in_place = InPlace::new(processing_options.clone())?; + processing_context.unbuffered = processing_context.unbuffered || atty::is(Stream::Stdout); + let mut in_place = InPlace::new(processing_context.clone())?; for path in files { let mut reader = LineReader::open(&path)?; let output = in_place.begin(&path)?; - process_file(&commands, &mut reader, output, &mut processing_options)?; + if processing_context.separate { + processing_context.line_number = 0; + } + process_file(&commands, &mut reader, output, &mut processing_context)?; in_place.end()?; } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index a3bb7b95..611c57d8 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -17,7 +17,7 @@ pub mod processor; pub mod script_char_provider; pub mod script_line_provider; -use crate::command::{ProcessingOptions, ScriptValue}; +use crate::command::{ProcessingContext, ScriptValue}; use crate::compiler::compile; use crate::processor::process_all_files; use clap::{arg, Arg, ArgMatches, Command}; @@ -32,10 +32,10 @@ const USAGE: &str = "sed [OPTION]... 
[script] [file]..."; pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; let (scripts, files) = get_scripts_files(&matches)?; - let mut processing_options = build_context(&matches); + let mut processing_context = build_context(&matches); - let executable = compile(scripts, &mut processing_options)?; - process_all_files(executable, files, processing_options)?; + let executable = compile(scripts, &mut processing_context)?; + process_all_files(executable, files, processing_context)?; Ok(()) } @@ -171,9 +171,9 @@ fn get_scripts_files(matches: &ArgMatches) -> UResult<(Vec, Vec ProcessingOptions { - ProcessingOptions { +// Parse CLI flag arguments and return a ProcessingContext struct based on them +fn build_context(matches: &ArgMatches) -> ProcessingContext { + ProcessingContext { all_output_files: matches.get_flag("all-output-files"), debug: matches.get_flag("debug"), regexp_extended: matches.get_flag("regexp-extended"), @@ -196,6 +196,8 @@ fn build_context(matches: &ArgMatches) -> ProcessingOptions { sandbox: matches.get_flag("sandbox"), unbuffered: matches.get_flag("unbuffered"), null_data: matches.get_flag("null-data"), + // Other context + line_number: 0, } } From 2e4909629f01f338b40f72efb1f79cdd58c6e71e Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 10:59:08 +0300 Subject: [PATCH 23/85] Add command codes --- src/uu/sed/src/processor.rs | 87 +++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 79fddfbb..0efefbbd 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -26,9 +26,90 @@ fn process_file( processing_context.line_number += 1; let mut current = commands.as_deref(); while let Some(command) = current { - // TODO: process command.code. - - // Advance to next command. 
+ // TODO: continue if command doesn't apply + match command.code { + '{' => { + // TODO + } + 'a' => { + // TODO + } + 'b' => { + // TODO + } + 'c' => { + // TODO + } + 'd' => { + // TODO + } + 'D' => { + // TODO + } + 'g' => { + // TODO + } + 'G' => { + // TODO + } + 'h' => { + // TODO + } + 'H' => { + // TODO + } + 'i' => { + // TODO + } + 'l' => { + // TODO + } + 'n' => { + // TODO + } + 'N' => { + // TODO + } + 'p' => { + // TODO + } + 'P' => { + // TODO + } + 'q' => { + // TODO + } + 'r' => { + // TODO + } + 's' => { + // TODO + } + 't' => { + // TODO + } + 'w' => { + // TODO + } + 'x' => { + // TODO + } + 'y' => { + // TODO + } + ':' => { + // TODO + } + '}' => { + // TODO + } + '=' => { + // TODO + } + // The compilation should supply only valid codes. + _ => panic!("invalid command code"), + } // match + // Advance to next command. current = command.next.as_deref(); } From a8db13a0f33278598945e9f69baaf2703e652bab Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 12:33:49 +0300 Subject: [PATCH 24/85] Allow command pointers to be shared This is required for the b t and { commands, which point to others. --- src/uu/sed/src/command.rs | 33 ++++++---- src/uu/sed/src/compiler.rs | 120 ++++++++++++++++++++++-------------- src/uu/sed/src/processor.rs | 13 ++-- 3 files changed, 104 insertions(+), 62 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index ab22f42f..67cf2c88 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -12,9 +12,11 @@ #![allow(dead_code)] use regex::Regex; +use std::cell::RefCell; use std::collections::HashMap; use std::fs::File; use std::path::PathBuf; // For file descriptors and equivalent +use std::rc::Rc; // Compilation and processing options provided mostly through the // command-line interface @@ -100,14 +102,14 @@ pub struct Transliteration { */ #[derive(Debug)] pub struct Command { - pub code: char, // Command code - pub addr1: Option
, // Start address - pub addr2: Option
, // End address - pub non_select: bool, // True if '!' - pub start_line: Option, // Start line number (or None) - pub text: Option, // Text for ':', 'a', 'c', 'i', 'r', 'w' - pub data: CommandData, // Command-specific data - pub next: Option>, // Pointer to next command + pub code: char, // Command code + pub addr1: Option
, // Start address + pub addr2: Option
, // End address + pub non_select: bool, // True if '!' + pub start_line: Option, // Start line number (or None) + pub text: Option, // Text for ':', 'a', 'c', 'i', 'r', 'w' + pub data: CommandData, // Command-specific data + pub next: Option>>, // Pointer to next command } impl Default for Command { @@ -128,10 +130,19 @@ impl Default for Command { #[derive(Debug)] pub enum CommandData { None, - SubCommands(Vec), // Commands for 'b', 't', '{' - Substitution(Box), // Substitute command 's' + Subcommand(Rc>), // Commands for 'b', 't', '{' + Substitution(Box), // Substitute command 's' Transliteration(Box), // Transliteration command 'y' - WriteFileDescriptor(File), // File descriptor for 'w' + WriteFileDescriptor(File), // File descriptor for 'w' +} + +impl CommandData { + pub fn get_subcommand(self) -> Rc> { + match self { + CommandData::Subcommand(c) => c, + _ => panic!("Called get on non-Subcommand variant"), + } + } } /* diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index ece500fd..6e4a85d1 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -16,6 +16,7 @@ use once_cell::sync::Lazy; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; +use std::rc::Rc; use uucore::error::UResult; thread_local! 
{ @@ -209,7 +210,7 @@ enum ContinueAction { pub fn compile( scripts: Vec, processing_context: &mut ProcessingContext, -) -> UResult>> { +) -> UResult>>> { let mut make_providers = ScriptLineProvider::new(scripts); let result = compile_thread(&mut make_providers, processing_context)?; @@ -221,8 +222,8 @@ pub fn compile( fn compile_thread( lines: &mut ScriptLineProvider, _processing_context: &mut ProcessingContext, -) -> UResult>> { - let mut head: Option> = None; +) -> UResult>>> { + let mut head: Option>> = None; // A mutable reference to the place we’ll insert next let mut next_p = &mut head; @@ -245,7 +246,7 @@ fn compile_thread( continue 'next_char; } - let mut cmd = Box::new(Command::default()); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(lines, &mut line, &mut cmd)?; let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; @@ -253,7 +254,7 @@ fn compile_thread( if cmd_spec.args == CommandArgs::NonSelect { line.advance(); line.eat_spaces(); - cmd.non_select = true; + cmd.borrow_mut().non_select = true; cmd_spec = get_cmd_spec(lines, &line, n_addr)?; } @@ -261,7 +262,13 @@ fn compile_thread( let action = compile_command(lines, &mut line, &mut cmd, cmd_spec)?; *next_p = Some(cmd); - next_p = &mut next_p.as_mut().unwrap().next; + // Intermediate let binding to avoid the temporary drop + let cmd_rc = next_p.as_mut().unwrap(); + let cmd_ptr = + &mut cmd_rc.borrow_mut().next as *mut Option>>; + unsafe { + next_p = &mut *cmd_ptr; + } match action { ContinueAction::NextLine => continue 'next_line, @@ -283,9 +290,10 @@ fn is_address_char(c: char) -> bool { fn compile_address_range( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, - cmd: &mut Command, + cmd: &mut Rc>, ) -> UResult { let mut n_addr = 0; + let mut cmd = cmd.borrow_mut(); line.eat_spaces(); if !line.eol() && is_address_char(line.current()) { @@ -423,9 +431,10 @@ fn compile_regex( fn compile_command( lines: &mut ScriptLineProvider, line: &mut 
ScriptCharProvider, - cmd: &mut Command, + cmd: &mut Rc>, cmd_spec: &'static CommandSpec, ) -> UResult { + let mut cmd = cmd.borrow_mut(); cmd.code = line.current(); match cmd_spec.args { @@ -860,12 +869,12 @@ mod tests { #[test] fn test_compile_single_line_address() { let (lines, mut chars) = make_providers("42"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( - cmd.addr1.as_ref().unwrap().atype, + cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Line )); } @@ -873,26 +882,26 @@ mod tests { #[test] fn test_compile_relative_address_range() { let (lines, mut chars) = make_providers("2,+3"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 2); assert!(matches!( - cmd.addr1.as_ref().unwrap().atype, + cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Line )); - let v1 = match &cmd.addr1.as_ref().unwrap().value { + let v1 = match &cmd.borrow().addr1.as_ref().unwrap().value { AddressValue::LineNumber(n) => *n, _ => panic!(), }; assert_eq!(v1, 2); assert!(matches!( - cmd.addr2.as_ref().unwrap().atype, + cmd.borrow().addr2.as_ref().unwrap().atype, AddressType::RelLine )); - let v2 = match &cmd.addr2.as_ref().unwrap().value { + let v2 = match &cmd.borrow().addr2.as_ref().unwrap().value { AddressValue::LineNumber(n) => *n, _ => panic!(), }; @@ -902,12 +911,12 @@ mod tests { #[test] fn test_compile_last_address() { let (lines, mut chars) = make_providers("$"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( - cmd.addr1.as_ref().unwrap().atype, + cmd.borrow().addr1.as_ref().unwrap().atype, 
AddressType::Last )); } @@ -915,16 +924,16 @@ mod tests { #[test] fn test_compile_absolute_address_range() { let (lines, mut chars) = make_providers("5,10"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 2); assert!(matches!( - cmd.addr1.as_ref().unwrap().atype, + cmd.borrow().addr1.as_ref().unwrap().atype, AddressType::Line )); assert!(matches!( - cmd.addr2.as_ref().unwrap().atype, + cmd.borrow().addr2.as_ref().unwrap().atype, AddressType::Line )); } @@ -932,80 +941,92 @@ mod tests { #[test] fn test_compile_regex_address() { let (lines, mut chars) = make_providers("/foo/"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); - assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); - if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(matches!( + cmd.borrow().addr1.as_ref().unwrap().atype, + AddressType::Re + )); + if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("foo")); assert!(!re.is_match("bar")); } else { panic!("expected a regex address"); - } + }; } #[test] fn test_compile_regex_address_range_other_delimiter() { let (lines, mut chars) = make_providers("\\#foo# , \\|bar|"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 2); - assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); - if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(matches!( + cmd.borrow().addr1.as_ref().unwrap().atype, + AddressType::Re + )); + if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { 
assert!(re.is_match("foo")); assert!(!re.is_match("bar")); } else { panic!("expected a regex address"); } - assert!(matches!(cmd.addr2.as_ref().unwrap().atype, AddressType::Re)); - if let AddressValue::Regex(re) = &cmd.addr2.as_ref().unwrap().value { + assert!(matches!( + cmd.borrow().addr2.as_ref().unwrap().atype, + AddressType::Re + )); + if let AddressValue::Regex(re) = &cmd.borrow().addr2.as_ref().unwrap().value { assert!(re.is_match("bar")); assert!(!re.is_match("foo")); } else { panic!("expected a regex address"); - } + }; } #[test] fn test_compile_regex_with_modifier() { let (lines, mut chars) = make_providers("/foo/I"); - let mut cmd = Command::default(); + let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); assert_eq!(n_addr, 1); - assert!(matches!(cmd.addr1.as_ref().unwrap().atype, AddressType::Re)); - if let AddressValue::Regex(re) = &cmd.addr1.as_ref().unwrap().value { + assert!(matches!( + cmd.borrow().addr1.as_ref().unwrap().atype, + AddressType::Re + )); + if let AddressValue::Regex(re) = &cmd.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("FOO")); assert!(re.is_match("foo")); } else { panic!("expected a regex address with case-insensitive match"); - } + }; } #[test] fn test_compile_re_reuse_saved() { // First save a regex let (lines1, mut chars1) = make_providers("/abc/"); - let mut cmd1 = Command::default(); + let mut cmd1 = Rc::new(RefCell::new(Command::default())); compile_address_range(&lines1, &mut chars1, &mut cmd1).unwrap(); // Now reuse it let (lines2, mut chars2) = make_providers("//"); - let mut cmd2 = Command::default(); + let mut cmd2 = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( - cmd2.addr1.as_ref().unwrap().atype, + cmd2.borrow().addr1.as_ref().unwrap().atype, AddressType::Re )); - if let AddressValue::Regex(re) = 
&cmd2.addr1.as_ref().unwrap().value { + if let AddressValue::Regex(re) = &cmd2.borrow().addr1.as_ref().unwrap().value { assert!(re.is_match("abc")); - } + }; } // compile_thread @@ -1045,7 +1066,8 @@ mod tests { let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); - let cmd = result.unwrap(); + let binding = result.unwrap(); + let cmd = binding.borrow(); assert_eq!(cmd.code, 'q'); assert!(!cmd.non_select); @@ -1068,7 +1090,8 @@ mod tests { let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); - let cmd = result.unwrap(); + let binding = result.unwrap(); + let cmd = binding.borrow(); assert_eq!(cmd.code, 'p'); assert!(cmd.non_select); @@ -1091,10 +1114,12 @@ mod tests { let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); - let first = result.unwrap(); + let binding = result.unwrap(); + let first = binding.borrow(); assert_eq!(first.code, 'q'); - let second = first.next.unwrap(); + let binding = first.next.clone().unwrap(); + let second = binding.borrow(); assert_eq!(second.code, 'd'); assert!(second.next.is_none()); } @@ -1105,10 +1130,12 @@ mod tests { let mut opts = make_processing_context(); let result = compile_thread(&mut provider, &mut opts).unwrap(); - let first = result.unwrap(); + let binding = result.unwrap(); + let first = binding.borrow(); assert_eq!(first.code, 'q'); - let second = first.next.unwrap(); + let binding = first.next.clone().unwrap(); + let second = binding.borrow(); assert_eq!(second.code, 'd'); assert!(second.next.is_none()); } @@ -1120,7 +1147,8 @@ mod tests { let mut opts = ProcessingContext::default(); let result = compile(scripts, &mut opts).unwrap(); - let cmd = result.unwrap(); + let binding = result.unwrap(); + let cmd = binding.borrow(); assert_eq!(cmd.code, 'q'); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 0efefbbd..01c83524 100644 --- 
a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -12,22 +12,24 @@ use crate::command::{Command, ProcessingContext}; use crate::fast_io::{LineReader, OutputBuffer}; use crate::in_place::InPlace; use atty::Stream; +use std::cell::RefCell; use std::path::PathBuf; +use std::rc::Rc; use uucore::error::UResult; /// Process a single input file fn process_file( - commands: &Option>, + commands: &Option>>, reader: &mut LineReader, output: &mut OutputBuffer, processing_context: &mut ProcessingContext, ) -> UResult<()> { while let Some(pattern_space) = reader.get_line()? { processing_context.line_number += 1; - let mut current = commands.as_deref(); + let mut current: Option>> = commands.clone(); while let Some(command) = current { // TODO: continue if command doesn't apply - match command.code { + match command.borrow().code { '{' => { // TODO } @@ -110,7 +112,8 @@ fn process_file( _ => panic!("invalid command code"), } // match // Advance to next command. - current = command.next.as_deref(); + let command_ref = command.borrow(); + current = command_ref.next.clone(); } output.write_chunk(&pattern_space)?; @@ -123,7 +126,7 @@ fn process_file( /// Process all input files pub fn process_all_files( - commands: Option>, + commands: Option>>, files: Vec, mut processing_context: ProcessingContext, ) -> UResult<()> { From 55f09fdbbbd67f31c38acc94cead69cf6962e2af Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 13:11:16 +0300 Subject: [PATCH 25/85] Add command switching examples --- src/uu/sed/src/command.rs | 4 ++-- src/uu/sed/src/processor.rs | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 67cf2c88..294da581 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -137,9 +137,9 @@ pub enum CommandData { } impl CommandData { - pub fn get_subcommand(self) -> Rc> { + pub fn get_subcommand(&self) -> Rc> { match self { - 
CommandData::Subcommand(c) => c, + CommandData::Subcommand(rc) => Rc::clone(rc), _ => panic!("Called get on non-Subcommand variant"), } } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 01c83524..0444bdb8 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -27,17 +27,20 @@ fn process_file( while let Some(pattern_space) = reader.get_line()? { processing_context.line_number += 1; let mut current: Option>> = commands.clone(); - while let Some(command) = current { + while let Some(command_rc) = current { + let command = command_rc.borrow(); // TODO: continue if command doesn't apply - match command.borrow().code { + match command.code { '{' => { - // TODO + current = Some(command.data.get_subcommand()); + continue; } 'a' => { // TODO } 'b' => { - // TODO + current = Some(command.data.get_subcommand()); + continue; } 'c' => { // TODO @@ -112,8 +115,7 @@ fn process_file( _ => panic!("invalid command code"), } // match // Advance to next command. - let command_ref = command.borrow(); - current = command_ref.next.clone(); + current = command.next.clone(); } output.write_chunk(&pattern_space)?; From c5ef0fc6d7d426fc1b644f6a12418a4d80f00659 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 13:28:51 +0300 Subject: [PATCH 26/85] Implement the 'd' command This demonstrates mutable patern space and starting a new cycle. 
--- src/uu/sed/src/fast_io.rs | 20 ++++++++++++++++++++ src/uu/sed/src/processor.rs | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index f4b4cea0..6b2ee267 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -135,6 +135,16 @@ pub enum OutputChunk<'a> { pub type OutputChunkRef<'a> = OutputChunk<'a>; // The same as above for non-Unix platforms, which lack mmap(2) +#[cfg(unix)] +impl OutputChunk<'_> { + pub fn clear(&mut self) { + *self = OutputChunk::Owned { + content: Vec::new(), + has_newline: false, + }; + } +} + #[derive(Debug, PartialEq, Eq)] #[cfg(not(unix))] pub enum OutputChunk { @@ -147,6 +157,16 @@ pub enum OutputChunk { #[cfg(not(unix))] pub type OutputChunkRef = OutputChunk; +#[cfg(not(unix))] +impl OutputChunk { + pub fn clear(&mut self) { + *self = OutputChunk::Owned { + content: Vec::new(), + has_newline: false, + }; + } +} + /// Unified reader that uses mmap when possible, falls back to buffered reading. pub enum LineReader { #[cfg(unix)] diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 0444bdb8..aae807e8 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -24,7 +24,7 @@ fn process_file( output: &mut OutputBuffer, processing_context: &mut ProcessingContext, ) -> UResult<()> { - while let Some(pattern_space) = reader.get_line()? { + while let Some(mut pattern_space) = reader.get_line()? 
{ processing_context.line_number += 1; let mut current: Option>> = commands.clone(); while let Some(command_rc) = current { @@ -46,7 +46,8 @@ fn process_file( // TODO } 'd' => { - // TODO + pattern_space.clear(); + break; } 'D' => { // TODO From 9d23255666ba073455a0fd2aa68abf673950be87 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 7 May 2025 15:32:28 +0300 Subject: [PATCH 27/85] Remove unneeded type alias --- src/uu/sed/src/fast_io.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 6b2ee267..d44bec1b 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -131,9 +131,6 @@ pub enum OutputChunk<'a> { }, } -#[cfg(unix)] -pub type OutputChunkRef<'a> = OutputChunk<'a>; - // The same as above for non-Unix platforms, which lack mmap(2) #[cfg(unix)] impl OutputChunk<'_> { @@ -154,9 +151,6 @@ pub enum OutputChunk { }, } -#[cfg(not(unix))] -pub type OutputChunkRef = OutputChunk; - #[cfg(not(unix))] impl OutputChunk { pub fn clear(&mut self) { @@ -230,7 +224,7 @@ impl LineReader { } /// Return the next line, if available, or None. - pub fn get_line(&mut self) -> io::Result> { + pub fn get_line(&mut self) -> io::Result> { match self { #[cfg(unix)] LineReader::MmapInput { cursor, .. } => { From 3366d7e33824052e38e67aabf20b90012962d26d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 8 May 2025 12:19:47 +0300 Subject: [PATCH 28/85] Create a single portable OutputChunk This avoids the duplication of implementation methods, which will be required for processing data. 
--- src/uu/sed/src/fast_io.rs | 64 +++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index d44bec1b..393198af 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -16,12 +16,19 @@ #[cfg(unix)] use memmap2::Mmap; + use std::borrow::Cow; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; + +#[cfg(not(unix))] +use std::marker::PhantomData; + #[cfg(unix)] use std::os::unix::io::AsRawFd; + use std::path::PathBuf; + #[cfg(unix)] use uucore::libc::{c_void, write}; @@ -119,8 +126,8 @@ impl ReadLineCursor { /// and bypassing BufWriter, or it can be other data from the process's /// memory space. #[derive(Debug, PartialEq, Eq)] -#[cfg(unix)] pub enum OutputChunk<'a> { + #[cfg(unix)] MmapInput { content: &'a [u8], // Line without newline full_span: &'a [u8], // Line including original newline, if any @@ -128,36 +135,32 @@ pub enum OutputChunk<'a> { Owned { content: Vec, // Line content without newline has_newline: bool, // True if \n-terminated + #[cfg(not(unix))] + _phantom: PhantomData<&'a ()>, // Silence E0392 warning }, } -// The same as above for non-Unix platforms, which lack mmap(2) -#[cfg(unix)] impl OutputChunk<'_> { - pub fn clear(&mut self) { - *self = OutputChunk::Owned { - content: Vec::new(), - has_newline: false, + /// Construct a new Owned chunk. 
+ pub fn new_owned(content: Vec, has_newline: bool) -> Self { + #[cfg(unix)] + return OutputChunk::Owned { + content, + has_newline, }; - } -} -#[derive(Debug, PartialEq, Eq)] -#[cfg(not(unix))] -pub enum OutputChunk { - Owned { - content: Vec, // Line content without newline - has_newline: bool, // True if \n-terminated - }, -} + #[cfg(not(unix))] + return OutputChunk::Owned { + content, + has_newline, + // Avoid E0063 missing _phantom initialization errors + _phantom: std::marker::PhantomData, + }; + } -#[cfg(not(unix))] -impl OutputChunk { + /// Clear the object's contents, converting it it Owned if needed. pub fn clear(&mut self) { - *self = OutputChunk::Owned { - content: Vec::new(), - has_newline: false, - }; + *self = OutputChunk::new_owned(Vec::new(), false); } } @@ -236,10 +239,10 @@ impl LineReader { } LineReader::ReadInput(cursor) => { if let Some((line, has_newline)) = cursor.get_line()? { - Ok(Some(OutputChunk::Owned { - content: line.into_owned().into_bytes(), + Ok(Some(OutputChunk::new_owned( + line.into_owned().into_bytes(), has_newline, - })) + ))) } else { Ok(None) } @@ -321,10 +324,7 @@ impl OutputBuffer { /// Schedule the specified string for eventual output pub fn write_str(&mut self, s: &str) -> io::Result<()> { // Use the write_chunk corresponding to cfg - self.write_chunk(&OutputChunk::Owned { - content: s.as_bytes().to_vec(), - has_newline: false, - }) + self.write_chunk(&OutputChunk::new_owned(s.as_bytes().to_vec(), false)) } } @@ -353,6 +353,7 @@ impl OutputBuffer { OutputChunk::Owned { content, has_newline, + .. } => { self.flush_mmap()?; self.out.write_all(content)?; @@ -401,6 +402,7 @@ impl OutputBuffer { OutputChunk::Owned { content, has_newline, + .. } => { self.out.write_all(content)?; if *has_newline { @@ -767,6 +769,7 @@ mod tests { if let Some(OutputChunk::Owned { content, has_newline, + .. }) = reader.get_line()? 
{ assert_eq!(content, b"first line"); @@ -779,6 +782,7 @@ mod tests { if let Some(OutputChunk::Owned { content, has_newline, + .. }) = reader.get_line()? { assert_eq!(content, b"second line"); From b90bf0373cf964da1a3239bbb67995469532aefd Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 8 May 2025 13:09:54 +0300 Subject: [PATCH 29/85] Test the delete command --- tests/by-util/test_sed.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 3b3ecd43..c302ab88 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -90,3 +90,24 @@ fn test_no_script_file() { .stdout_is_fixture(fixture); } } + +#[test] +fn test_delete_stdin() { + for fixture in INPUT_FILES { + new_ucmd!() + .arg("d") + .pipe_in_fixture(fixture) + .succeeds() + .no_stdout(); + } +} + +#[test] +fn test_delete_file() { + for fixture in INPUT_FILES { + new_ucmd!() + .args(&["-e", "d", fixture]) + .succeeds() + .no_stdout(); + } +} From f628fe0d21c859cccdb09a6d6ecf694eb4e47b97 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 8 May 2025 19:46:39 +0300 Subject: [PATCH 30/85] Add applies function to filter commands (WIP) - Create new IOChunk struct to cache-verify UTF-8 conversion, and use it for all I/O. This has a as_str_unchecked() method to obtain a UTF-8 string from the stored bytes. - Rename OutputChunk into IOChunkContent. - Add match_address and applies functions to the processor. TODO: - [ ] Refactor LineReader to avoid multiple borrowing. - [ ] Add unit tests. - [ ] Add integration tests. 
--- README.md | 8 +- src/uu/sed/src/command.rs | 3 + src/uu/sed/src/fast_io.rs | 183 +++++++++++++++++++++++++++--------- src/uu/sed/src/processor.rs | 157 ++++++++++++++++++++++++++++--- src/uu/sed/src/sed.rs | 2 + 5 files changed, 291 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 5f42a768..023249c4 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,18 @@ cd sed cargo build --release cargo run --release ``` -## Extensions +## Extensions and incompatibilities ### GNU * Command-line arguments can be specified in long (`--`) form. * Spaces can precede a regular expression modifier. ### BSD and GNU +* The input is assumed to be valid UTF-8 (this includes 7-bit ASCII). + If the input is in another code page, consider converting it through UTF-8 + in order to avoid errors on invalid UTF-8 sequences and for the correct + handling of regular expressions. + This _sed_ program can also handle arbitrary byte sequences if no part of the + input is treated as string. * The second address in a range can be specified as a relative address with +N. 
### Other diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 294da581..e3ac0db0 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -36,9 +36,12 @@ pub struct ProcessingContext { pub sandbox: bool, pub unbuffered: bool, pub null_data: bool, + // Other context /// Current input line number pub line_number: usize, + /// Last address of a range + pub last_address: bool, } // The specification of a script: through a string or a file diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 393198af..4d10c1a8 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -28,10 +28,13 @@ use std::marker::PhantomData; use std::os::unix::io::AsRawFd; use std::path::PathBuf; +use std::str; #[cfg(unix)] use uucore::libc::{c_void, write}; +use uucore::error::{UError, USimpleError}; + // Define two cursors for iterating over lines: // - MmapLineCursor based on mmap(2), // - ReadLineCursorbased on BufReader. @@ -121,12 +124,51 @@ impl ReadLineCursor { } } +/// As chunk of data that is input and can be output, often very efficiently +#[derive(Debug, PartialEq, Eq)] +pub struct IOChunk<'a> { + utf8_verified: bool, // True if the contents are valid UTF-8 + content: IOChunkContent<'a>, +} + +impl<'a> IOChunk<'a> { + /// Construct an IOChunk from the given content + fn from_content(content: IOChunkContent<'a>) -> Self { + Self { + utf8_verified: false, + content, + } + } + + /// Clear the object's contents, converting it it Owned if needed. + pub fn clear(&mut self) { + self.content = IOChunkContent::new_owned(Vec::new(), false); + self.utf8_verified = true; + } + + /// Return the content as a string. + pub fn try_as_str(&mut self) -> Result<&str, Box> { + if self.utf8_verified { + // Use cached result + return Ok(unsafe { self.content.as_str_unchecked() }); + } + + let result = match &self.content { + #[cfg(unix)] + IOChunkContent::MmapInput { content, .. 
} => str::from_utf8(content), + IOChunkContent::Owned { content, .. } => str::from_utf8(content), + }; + self.utf8_verified = true; + result.map_err(|e| USimpleError::new(1, e.to_string())) + } +} + /// Data to be written to a file. It can come from the mmapped /// memory space, in which case it is tracked to allow coallescing /// and bypassing BufWriter, or it can be other data from the process's /// memory space. #[derive(Debug, PartialEq, Eq)] -pub enum OutputChunk<'a> { +enum IOChunkContent<'a> { #[cfg(unix)] MmapInput { content: &'a [u8], // Line without newline @@ -140,17 +182,17 @@ pub enum OutputChunk<'a> { }, } -impl OutputChunk<'_> { +impl IOChunkContent<'_> { /// Construct a new Owned chunk. pub fn new_owned(content: Vec, has_newline: bool) -> Self { #[cfg(unix)] - return OutputChunk::Owned { + return IOChunkContent::Owned { content, has_newline, }; #[cfg(not(unix))] - return OutputChunk::Owned { + return IOChunkContent::Owned { content, has_newline, // Avoid E0063 missing _phantom initialization errors @@ -158,9 +200,12 @@ impl OutputChunk<'_> { }; } - /// Clear the object's contents, converting it it Owned if needed. - pub fn clear(&mut self) { - *self = OutputChunk::new_owned(Vec::new(), false); + unsafe fn as_str_unchecked(&self) -> &str { + match self { + #[cfg(unix)] + IOChunkContent::MmapInput { content, .. } => std::str::from_utf8_unchecked(content), + IOChunkContent::Owned { content, .. } => std::str::from_utf8_unchecked(content), + } } } @@ -227,22 +272,25 @@ impl LineReader { } /// Return the next line, if available, or None. - pub fn get_line(&mut self) -> io::Result> { + pub fn get_line(&mut self) -> io::Result> { match self { #[cfg(unix)] LineReader::MmapInput { cursor, .. } => { if let Some((content, full_span)) = cursor.get_line()? 
{ - Ok(Some(OutputChunk::MmapInput { content, full_span })) + Ok(Some(IOChunk::from_content(IOChunkContent::MmapInput { + content, + full_span, + }))) } else { Ok(None) } } LineReader::ReadInput(cursor) => { if let Some((line, has_newline)) = cursor.get_line()? { - Ok(Some(OutputChunk::new_owned( + Ok(Some(IOChunk::from_content(IOChunkContent::new_owned( line.into_owned().into_bytes(), has_newline, - ))) + )))) } else { Ok(None) } @@ -276,11 +324,11 @@ impl OutputWrite for T {} /// system call without any copying if worthwhile. /// All other output is buffered and writen via BufWriter. pub struct OutputBuffer { - out: BufWriter>, // Where to write + out: BufWriter>, // Where to write #[cfg(unix)] - mmap_ptr: Option<(*const u8, usize)>, // Start and len of chunk to write + mmap_ptr: Option<(*const u8, usize)>, // Start and len of chunk to write #[cfg(test)] - writes_issued: usize, // Number of issued write(2) calls + writes_issued: usize, // Number of issued write(2) calls } /// Wrapper that issues the write(2) system call @@ -311,7 +359,7 @@ const MIN_DIRECT_WRITE: usize = 4 * 1024; const MAX_PENDING_WRITE: usize = 64 * 1024; impl OutputBuffer { - pub fn new(w: Box) -> Self { + pub fn new(w: Box) -> Self { Self { out: BufWriter::new(w), #[cfg(unix)] @@ -324,16 +372,19 @@ impl OutputBuffer { /// Schedule the specified string for eventual output pub fn write_str(&mut self, s: &str) -> io::Result<()> { // Use the write_chunk corresponding to cfg - self.write_chunk(&OutputChunk::new_owned(s.as_bytes().to_vec(), false)) + self.write_chunk(&IOChunk::from_content(IOChunkContent::new_owned( + s.as_bytes().to_vec(), + false, + ))) } } #[cfg(unix)] impl OutputBuffer { /// Schedule the specified output chunk for eventual output - pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { - match chunk { - OutputChunk::MmapInput { full_span, .. 
} => { + pub fn write_chunk(&mut self, chunk: &IOChunk) -> io::Result<()> { + match &chunk.content { + IOChunkContent::MmapInput { full_span, .. } => { let ptr = full_span.as_ptr(); let len = full_span.len(); @@ -350,7 +401,7 @@ impl OutputBuffer { Ok(()) } - OutputChunk::Owned { + IOChunkContent::Owned { content, has_newline, .. @@ -397,9 +448,9 @@ impl OutputBuffer { #[cfg(not(unix))] impl OutputBuffer { /// Schedule the specified output chunk for eventual output - pub fn write_chunk(&mut self, chunk: &OutputChunk) -> io::Result<()> { - match chunk { - OutputChunk::Owned { + pub fn write_chunk(&mut self, chunk: &IOChunk) -> io::Result<()> { + match &chunk.content { + IOChunkContent::Owned { content, has_newline, .. @@ -758,7 +809,7 @@ mod tests { fn test_stream_read() -> std::io::Result<()> { // Create temporary file with known contents let mut tmp = NamedTempFile::new()?; - write!(tmp, "first line\nsecond line\n")?; + write!(tmp, "first line\nsecond line\nlast line\n")?; tmp.flush()?; let path = tmp.path().to_path_buf(); @@ -766,29 +817,49 @@ mod tests { // Verify the reader's operation assert!(!reader.is_last_line()?); - if let Some(OutputChunk::Owned { - content, - has_newline, + if let Some(IOChunk { + content: + IOChunkContent::Owned { + content, + has_newline, + .. + }, + utf8_verified, .. }) = reader.get_line()? { assert_eq!(content, b"first line"); assert!(has_newline); + assert!(!utf8_verified); } else { - panic!("Expected OutputChunk::Owned"); + panic!("Expected IOChunkContent::Owned"); } - assert!(!reader.is_last_line()?); - if let Some(OutputChunk::Owned { - content, - has_newline, + if let Some(IOChunk { + content: + IOChunkContent::Owned { + content, + has_newline, + .. + }, .. }) = reader.get_line()? { assert_eq!(content, b"second line"); assert!(has_newline); } else { - panic!("Expected OutputChunk::Owned"); + panic!("Expected IOChunkContent::Owned"); + } + + assert!(!reader.is_last_line()?); + if let Some(mut content) = reader.get_line()? 
{ + assert!(!content.utf8_verified); + assert_eq!(content.try_as_str().unwrap(), "last line"); + assert!(content.utf8_verified); + // Cached version + assert_eq!(content.try_as_str().unwrap(), "last line"); + } else { + panic!("Expected IOChunk"); } assert!(reader.is_last_line()?); @@ -810,21 +881,39 @@ mod tests { // Verify the reader's operation assert!(!reader.is_last_line()?); - assert_eq!( - reader.get_line()?, - Some(OutputChunk::MmapInput { - content: b"first line".as_ref(), - full_span: b"first line\n".as_ref(), - }) - ); + if let Some(IOChunk { + content: + IOChunkContent::MmapInput { + content, full_span, .. + }, + utf8_verified, + .. + }) = reader.get_line()? + { + assert_eq!(content, b"first line"); + assert_eq!(full_span, b"first line\n"); + assert!(!utf8_verified); + } else { + panic!("Expected IOChunkContent::MapInput"); + } + assert!(!reader.is_last_line()?); - assert_eq!( - reader.get_line()?, - Some(OutputChunk::MmapInput { - content: b"second line".as_ref(), - full_span: b"second line\n".as_ref(), - }) - ); + if let Some(IOChunk { + content: + IOChunkContent::MmapInput { + content, full_span, .. + }, + utf8_verified, + .. + }) = reader.get_line()? + { + assert_eq!(content, b"second line"); + assert_eq!(full_span, b"second line\n"); + assert!(!utf8_verified); + } else { + panic!("Expected IOChunkContent::MapInput"); + } + assert!(reader.is_last_line()?); assert_eq!(reader.get_line()?, None); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index aae807e8..e5445121 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -8,8 +8,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-use crate::command::{Command, ProcessingContext}; -use crate::fast_io::{LineReader, OutputBuffer}; +use crate::command::{Address, AddressType, AddressValue, Command, ProcessingContext}; +use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; use atty::Stream; use std::cell::RefCell; @@ -17,18 +17,147 @@ use std::path::PathBuf; use std::rc::Rc; use uucore::error::UResult; +#[allow(dead_code)] +/// Return true if the passed address matches the current I/O context. +fn match_address( + addr: &Address, + reader: &mut LineReader, + pattern: &mut IOChunk, + context: &ProcessingContext, +) -> UResult { + match addr.atype { + AddressType::Re => { + if let AddressValue::Regex(ref re) = addr.value { + Ok(re.is_match(pattern.try_as_str()?)) + } else { + Ok(false) + } + } + AddressType::Line => { + if let AddressValue::LineNumber(lineno) = addr.value { + Ok(context.line_number == lineno) + } else { + Ok(false) + } + } + AddressType::Last => Ok(reader.is_last_line()?), + _ => panic!("invalid address type in match_address"), + } +} + +#[allow(dead_code)] +/// Return true if the command applies to the given pattern. +fn applies( + command: &mut Command, + reader: &mut LineReader, + pattern: &mut IOChunk, + context: &mut ProcessingContext, +) -> UResult { + let linenum = context.line_number; + + let result = if command.addr1.is_none() && command.addr2.is_none() { + Ok(true) + } else if let Some(addr2) = &command.addr2 { + if let Some(start) = command.start_line { + match addr2.atype { + AddressType::RelLine => { + if let AddressValue::LineNumber(n) = addr2.value { + if linenum - start <= n { + Ok(true) + } else { + command.start_line = None; + Ok(false) + } + } else { + Ok(false) + } + } + _ => { + if match_address(addr2, reader, pattern, context)? 
{ + command.start_line = None; + context.last_address = true; + Ok(true) + } else if addr2.atype == AddressType::Line { + if let AddressValue::LineNumber(n) = addr2.value { + if linenum > n { + command.start_line = None; + Ok(false) + } else { + Ok(true) + } + } else { + Ok(true) + } + } else { + Ok(true) + } + } + } + } else if let Some(addr1) = &command.addr1 { + if match_address(addr1, reader, pattern, context)? { + match addr2.atype { + AddressType::Line => { + if let AddressValue::LineNumber(n) = addr2.value { + if linenum >= n { + context.last_address = true; + } else { + command.start_line = Some(linenum); + } + } + } + AddressType::RelLine => { + if let AddressValue::LineNumber(0) = addr2.value { + context.last_address = true; + } else { + command.start_line = Some(linenum); + } + } + _ => { + command.start_line = Some(linenum); + } + } + Ok(true) + } else { + Ok(false) + } + } else { + Ok(false) + } + } else if let Some(addr1) = &command.addr1 { + Ok(match_address(addr1, reader, pattern, context)?) + } else { + Ok(false) + }; + + if command.non_select { + result.map(|v| !v) + } else { + result + } +} + /// Process a single input file fn process_file( commands: &Option>>, reader: &mut LineReader, output: &mut OutputBuffer, - processing_context: &mut ProcessingContext, + context: &mut ProcessingContext, ) -> UResult<()> { - while let Some(mut pattern_space) = reader.get_line()? { - processing_context.line_number += 1; + while let Some(p) = reader.get_line()? { + let mut pattern = p; + context.line_number += 1; let mut current: Option>> = commands.clone(); while let Some(command_rc) = current { let command = command_rc.borrow(); + + // Not compiled until the double-borrow of reader is resolved. + #[cfg(any())] + if !applies(&mut command, reader, &mut pattern, context)? 
{ + // Advance to next command + current = command.next.clone(); + continue; + } + // TODO: continue if command doesn't apply match command.code { '{' => { @@ -46,7 +175,7 @@ fn process_file( // TODO } 'd' => { - pattern_space.clear(); + pattern.clear(); break; } 'D' => { @@ -119,8 +248,8 @@ fn process_file( current = command.next.clone(); } - output.write_chunk(&pattern_space)?; - if processing_context.unbuffered { + output.write_chunk(&pattern)?; + if context.unbuffered { output.flush()?; } } @@ -131,19 +260,19 @@ fn process_file( pub fn process_all_files( commands: Option>>, files: Vec, - mut processing_context: ProcessingContext, + mut context: ProcessingContext, ) -> UResult<()> { - processing_context.unbuffered = processing_context.unbuffered || atty::is(Stream::Stdout); + context.unbuffered = context.unbuffered || atty::is(Stream::Stdout); - let mut in_place = InPlace::new(processing_context.clone())?; + let mut in_place = InPlace::new(context.clone())?; for path in files { let mut reader = LineReader::open(&path)?; let output = in_place.begin(&path)?; - if processing_context.separate { - processing_context.line_number = 0; + if context.separate { + context.line_number = 0; } - process_file(&commands, &mut reader, output, &mut processing_context)?; + process_file(&commands, &mut reader, output, &mut context)?; in_place.end()?; } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 611c57d8..9983dc90 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -196,8 +196,10 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { sandbox: matches.get_flag("sandbox"), unbuffered: matches.get_flag("unbuffered"), null_data: matches.get_flag("null-data"), + // Other context line_number: 0, + last_address: false, } } From f6d07021473ca4ebd1ad752e588da2be8f0fe605 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 8 May 2025 20:52:55 +0300 Subject: [PATCH 31/85] Remove needless String-bytes-string conversion For the Owned objects. 
--- src/uu/sed/src/fast_io.rs | 92 ++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 4d10c1a8..9a92310c 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -17,7 +17,6 @@ #[cfg(unix)] use memmap2::Mmap; -use std::borrow::Cow; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Read, Write}; @@ -33,7 +32,10 @@ use std::str; #[cfg(unix)] use uucore::libc::{c_void, write}; -use uucore::error::{UError, USimpleError}; +use uucore::error::UError; + +#[cfg(unix)] +use uucore::error::USimpleError; // Define two cursors for iterating over lines: // - MmapLineCursor based on mmap(2), @@ -102,7 +104,7 @@ impl ReadLineCursor { } /// Return the next line and its \n termination, if available, or None. - fn get_line(&mut self) -> io::Result, bool)>> { + fn get_line(&mut self) -> io::Result> { self.buffer.clear(); // read_line *includes* the '\n' if present let bytes_read = self.reader.read_line(&mut self.buffer)?; @@ -115,7 +117,8 @@ impl ReadLineCursor { if has_newline { self.buffer.pop(); } - Ok(Some((Cow::Owned(self.buffer.clone()), has_newline))) + let line = std::mem::take(&mut self.buffer); + Ok(Some((line, has_newline))) } /// Return true if the line read is the last one @@ -142,24 +145,39 @@ impl<'a> IOChunk<'a> { /// Clear the object's contents, converting it it Owned if needed. pub fn clear(&mut self) { - self.content = IOChunkContent::new_owned(Vec::new(), false); self.utf8_verified = true; + match &mut self.content { + IOChunkContent::Owned { + content, + has_newline, + .. + } => { + content.clear(); + *has_newline = false; + } + #[cfg(unix)] + _ => { + self.content = IOChunkContent::new_owned(String::new(), false); + } + } } /// Return the content as a string. 
pub fn try_as_str(&mut self) -> Result<&str, Box> { - if self.utf8_verified { - // Use cached result - return Ok(unsafe { self.content.as_str_unchecked() }); - } - - let result = match &self.content { + match &self.content { #[cfg(unix)] - IOChunkContent::MmapInput { content, .. } => str::from_utf8(content), - IOChunkContent::Owned { content, .. } => str::from_utf8(content), - }; - self.utf8_verified = true; - result.map_err(|e| USimpleError::new(1, e.to_string())) + IOChunkContent::MmapInput { content, .. } => { + if self.utf8_verified { + // Use cached result + Ok(unsafe { self.content.as_str_unchecked() }) + } else { + let result = str::from_utf8(content); + self.utf8_verified = true; + result.map_err(|e| USimpleError::new(1, e.to_string())) + } + } + IOChunkContent::Owned { content, .. } => Ok(content), + } } } @@ -175,7 +193,7 @@ enum IOChunkContent<'a> { full_span: &'a [u8], // Line including original newline, if any }, Owned { - content: Vec, // Line content without newline + content: String, // Line content without newline has_newline: bool, // True if \n-terminated #[cfg(not(unix))] _phantom: PhantomData<&'a ()>, // Silence E0392 warning @@ -184,7 +202,7 @@ enum IOChunkContent<'a> { impl IOChunkContent<'_> { /// Construct a new Owned chunk. - pub fn new_owned(content: Vec, has_newline: bool) -> Self { + pub fn new_owned(content: String, has_newline: bool) -> Self { #[cfg(unix)] return IOChunkContent::Owned { content, @@ -200,11 +218,11 @@ impl IOChunkContent<'_> { }; } + #[cfg(unix)] unsafe fn as_str_unchecked(&self) -> &str { match self { - #[cfg(unix)] IOChunkContent::MmapInput { content, .. } => std::str::from_utf8_unchecked(content), - IOChunkContent::Owned { content, .. } => std::str::from_utf8_unchecked(content), + IOChunkContent::Owned { content, .. } => content, } } } @@ -288,7 +306,7 @@ impl LineReader { LineReader::ReadInput(cursor) => { if let Some((line, has_newline)) = cursor.get_line()? 
{ Ok(Some(IOChunk::from_content(IOChunkContent::new_owned( - line.into_owned().into_bytes(), + line, has_newline, )))) } else { @@ -369,11 +387,10 @@ impl OutputBuffer { } } - /// Schedule the specified string for eventual output - pub fn write_str(&mut self, s: &str) -> io::Result<()> { - // Use the write_chunk corresponding to cfg + /// Schedule the specified String or &strfor eventual output + pub fn write_str>(&mut self, s: S) -> io::Result<()> { self.write_chunk(&IOChunk::from_content(IOChunkContent::new_owned( - s.as_bytes().to_vec(), + s.into(), false, ))) } @@ -407,7 +424,7 @@ impl OutputBuffer { .. } => { self.flush_mmap()?; - self.out.write_all(content)?; + self.out.write_all(content.as_bytes())?; if *has_newline { self.out.write_all(b"\n")?; } @@ -455,7 +472,7 @@ impl OutputBuffer { has_newline, .. } => { - self.out.write_all(content)?; + self.out.write_all(content.as_bytes())?; if *has_newline { self.out.write_all(b"\n")?; } @@ -828,7 +845,7 @@ mod tests { .. }) = reader.get_line()? { - assert_eq!(content, b"first line"); + assert_eq!(content, "first line"); assert!(has_newline); assert!(!utf8_verified); } else { @@ -845,7 +862,7 @@ mod tests { .. }) = reader.get_line()? { - assert_eq!(content, b"second line"); + assert_eq!(content, "second line"); assert!(has_newline); } else { panic!("Expected IOChunkContent::Owned"); @@ -853,10 +870,6 @@ mod tests { assert!(!reader.is_last_line()?); if let Some(mut content) = reader.get_line()? 
{ - assert!(!content.utf8_verified); - assert_eq!(content.try_as_str().unwrap(), "last line"); - assert!(content.utf8_verified); - // Cached version assert_eq!(content.try_as_str().unwrap(), "last line"); } else { panic!("Expected IOChunk"); @@ -873,7 +886,7 @@ mod tests { fn test_mmap_read() -> std::io::Result<()> { // Create temporary file with known contents let mut tmp = NamedTempFile::new()?; - write!(tmp, "first line\nsecond line\n")?; + write!(tmp, "first line\nsecond line\nlast line\n")?; tmp.flush()?; let path = tmp.path().to_path_buf(); @@ -914,6 +927,17 @@ mod tests { panic!("Expected IOChunkContent::MapInput"); } + assert!(!reader.is_last_line()?); + if let Some(mut content) = reader.get_line()? { + assert!(!content.utf8_verified); + assert_eq!(content.try_as_str().unwrap(), "last line"); + assert!(content.utf8_verified); + // Cached version + assert_eq!(content.try_as_str().unwrap(), "last line"); + } else { + panic!("Expected IOChunk"); + } + assert!(reader.is_last_line()?); assert_eq!(reader.get_line()?, None); From 0acd866d639d1862e2b2645ded1863b52cdd618e Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 8 May 2025 21:06:14 +0300 Subject: [PATCH 32/85] Have Reader::get_line also return is_last_line This avoids borrowing conflicts arising from calling Reader twice: once to get the line and once to see if it was the last. 
--- src/uu/sed/src/command.rs | 4 +- src/uu/sed/src/fast_io.rs | 190 +++++++++++++++++++----------------- src/uu/sed/src/processor.rs | 14 ++- src/uu/sed/src/sed.rs | 1 + 4 files changed, 111 insertions(+), 98 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index e3ac0db0..f05557c9 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -40,8 +40,10 @@ pub struct ProcessingContext { // Other context /// Current input line number pub line_number: usize, - /// Last address of a range + /// True if this is the last address of a range pub last_address: bool, + /// True if the line read is the last line + pub last_line: bool, } // The specification of a script: through a string or a file diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 9a92310c..45cf24fd 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -48,6 +48,14 @@ pub struct MmapLineCursor<'a> { pos: usize, } +#[cfg(unix)] +/// Represents the get_line return: one line plus whether it was the last. +pub struct NextMmapLine<'a> { + pub content: &'a [u8], + pub full_span: &'a [u8], + pub is_last_line: bool, +} + #[cfg(unix)] impl<'a> MmapLineCursor<'a> { fn new(data: &'a [u8]) -> Self { @@ -55,7 +63,7 @@ impl<'a> MmapLineCursor<'a> { } /// Return the next line, if available, or None. - fn get_line(&mut self) -> io::Result> { + fn get_line(&mut self) -> io::Result>> { if self.pos >= self.data.len() { return Ok(None); } @@ -78,12 +86,12 @@ impl<'a> MmapLineCursor<'a> { full_span }; - Ok(Some((content, full_span))) - } - - /// Return true if the line read is the last one - fn is_last_line(&mut self) -> io::Result { - Ok(self.pos >= self.data.len()) + let is_last_line = self.pos >= self.data.len(); + Ok(Some(NextMmapLine { + content, + full_span, + is_last_line, + })) } } @@ -103,8 +111,9 @@ impl ReadLineCursor { } } - /// Return the next line and its \n termination, if available, or None. 
- fn get_line(&mut self) -> io::Result> { + /// If a line is available, return it, its \n termination, + /// and next line availability, itherwise return None. + fn get_line(&mut self) -> io::Result> { self.buffer.clear(); // read_line *includes* the '\n' if present let bytes_read = self.reader.read_line(&mut self.buffer)?; @@ -118,12 +127,8 @@ impl ReadLineCursor { self.buffer.pop(); } let line = std::mem::take(&mut self.buffer); - Ok(Some((line, has_newline))) - } - - /// Return true if the line read is the last one - fn is_last_line(&mut self) -> io::Result { - Ok(self.reader.fill_buf()?.is_empty()) + let is_last_line = self.reader.fill_buf()?.is_empty(); + Ok(Some((line, has_newline, is_last_line))) } } @@ -289,41 +294,38 @@ impl LineReader { line_reader_read_input(file) } - /// Return the next line, if available, or None. - pub fn get_line(&mut self) -> io::Result> { + /// Return the next line, if available and also the availability + /// of another one, or None at end of file. + pub fn get_line(&mut self) -> io::Result> { match self { #[cfg(unix)] LineReader::MmapInput { cursor, .. } => { - if let Some((content, full_span)) = cursor.get_line()? { - Ok(Some(IOChunk::from_content(IOChunkContent::MmapInput { - content, - full_span, - }))) + if let Some(NextMmapLine { + content, + full_span, + is_last_line, + }) = cursor.get_line()? + { + let chunk = + IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + Ok(Some((chunk, is_last_line))) } else { Ok(None) } } + LineReader::ReadInput(cursor) => { - if let Some((line, has_newline)) = cursor.get_line()? { - Ok(Some(IOChunk::from_content(IOChunkContent::new_owned( - line, - has_newline, - )))) + if let Some((line, _has_newline, is_last_line)) = cursor.get_line()? 
{ + let chunk = + IOChunk::from_content(IOChunkContent::new_owned(line, _has_newline)); + Ok(Some((chunk, is_last_line))) } else { Ok(None) } } } } - - /// Return true if the line read is the last one - pub fn is_last_line(&mut self) -> io::Result { - match self { - #[cfg(unix)] - LineReader::MmapInput { cursor, .. } => cursor.is_last_line(), - LineReader::ReadInput(cursor) => cursor.is_last_line(), - } - } } // Define a trait combining two: workaround for Rust's corresponding inability. @@ -567,7 +569,7 @@ mod tests { let mut out = OutputBuffer::new(Box::new(Box::new(out_file))); // Drain reader → writer - while let Some(chunk) = reader.get_line()? { + while let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; } out.flush()?; @@ -603,7 +605,7 @@ mod tests { let mut out = OutputBuffer::new(Box::new(out_file)); // Read the first mmap line ("zero\n") and write it - if let Some(chunk) = reader.get_line()? { + if let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; } @@ -611,7 +613,7 @@ mod tests { out.write_str("middle\n")?; // Read the second mmap line ("one\n") and write it - if let Some(chunk) = reader.get_line()? { + if let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; } @@ -656,7 +658,7 @@ mod tests { // Wrap it in your OutputBuffer and run the loop: let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; - while let Some(chunk) = reader.get_line()? { + while let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; nline += 1; } @@ -695,7 +697,7 @@ mod tests { // Wrap it in your OutputBuffer and run the loop: let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; - while let Some(chunk) = reader.get_line()? { + while let Some((chunk, _last_line)) = reader.get_line()? 
{ out.write_chunk(&chunk)?; nline += 1; } @@ -730,7 +732,7 @@ mod tests { // Wrap it in your OutputBuffer and run the loop: let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; - while let Some(chunk) = reader.get_line()? { + while let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; nline += 1; } @@ -765,7 +767,7 @@ mod tests { // Wrap it in your OutputBuffer and run the loop: let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; - while let Some(chunk) = reader.get_line()? { + while let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; nline += 1; } @@ -806,7 +808,7 @@ mod tests { // Wrap it in your OutputBuffer and run the loop: let mut out = OutputBuffer::new(Box::new(out_file)); let mut nline = 0; - while let Some(chunk) = reader.get_line()? { + while let Some((chunk, _last_line)) = reader.get_line()? { out.write_chunk(&chunk)?; nline += 1; } @@ -833,49 +835,55 @@ mod tests { let mut reader = LineReader::open_stream(&path)?; // Verify the reader's operation - assert!(!reader.is_last_line()?); - if let Some(IOChunk { - content: - IOChunkContent::Owned { - content, - has_newline, - .. - }, - utf8_verified, - .. - }) = reader.get_line()? + if let Some(( + IOChunk { + content: + IOChunkContent::Owned { + content, + has_newline, + .. + }, + utf8_verified, + .. + }, + last_line, + )) = reader.get_line()? { assert_eq!(content, "first line"); assert!(has_newline); assert!(!utf8_verified); + assert!(!last_line); } else { panic!("Expected IOChunkContent::Owned"); } - if let Some(IOChunk { - content: - IOChunkContent::Owned { - content, - has_newline, - .. - }, - .. - }) = reader.get_line()? + if let Some(( + IOChunk { + content: + IOChunkContent::Owned { + content, + has_newline, + .. + }, + .. + }, + last_line, + )) = reader.get_line()? 
{ assert_eq!(content, "second line"); assert!(has_newline); + assert!(!last_line); } else { panic!("Expected IOChunkContent::Owned"); } - assert!(!reader.is_last_line()?); - if let Some(mut content) = reader.get_line()? { + if let Some((mut content, last_line)) = reader.get_line()? { assert_eq!(content.try_as_str().unwrap(), "last line"); + assert!(last_line); } else { panic!("Expected IOChunk"); } - assert!(reader.is_last_line()?); assert_eq!(reader.get_line()?, None); Ok(()) @@ -893,52 +901,56 @@ mod tests { let mut reader = LineReader::open(&path)?; // Verify the reader's operation - assert!(!reader.is_last_line()?); - if let Some(IOChunk { - content: - IOChunkContent::MmapInput { - content, full_span, .. - }, - utf8_verified, - .. - }) = reader.get_line()? + if let Some(( + IOChunk { + content: + IOChunkContent::MmapInput { + content, full_span, .. + }, + utf8_verified, + .. + }, + last_line, + )) = reader.get_line()? { assert_eq!(content, b"first line"); assert_eq!(full_span, b"first line\n"); assert!(!utf8_verified); + assert!(!last_line); } else { panic!("Expected IOChunkContent::MapInput"); } - assert!(!reader.is_last_line()?); - if let Some(IOChunk { - content: - IOChunkContent::MmapInput { - content, full_span, .. - }, - utf8_verified, - .. - }) = reader.get_line()? + if let Some(( + IOChunk { + content: + IOChunkContent::MmapInput { + content, full_span, .. + }, + utf8_verified, + .. + }, + last_line, + )) = reader.get_line()? { assert_eq!(content, b"second line"); assert_eq!(full_span, b"second line\n"); assert!(!utf8_verified); + assert!(!last_line); } else { panic!("Expected IOChunkContent::MapInput"); } - assert!(!reader.is_last_line()?); - if let Some(mut content) = reader.get_line()? { - assert!(!content.utf8_verified); + if let Some((mut content, last_line)) = reader.get_line()? 
{ assert_eq!(content.try_as_str().unwrap(), "last line"); assert!(content.utf8_verified); + assert!(last_line); // Cached version assert_eq!(content.try_as_str().unwrap(), "last line"); } else { panic!("Expected IOChunk"); } - assert!(reader.is_last_line()?); assert_eq!(reader.get_line()?, None); Ok(()) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index e5445121..7cbd8af5 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -21,7 +21,6 @@ use uucore::error::UResult; /// Return true if the passed address matches the current I/O context. fn match_address( addr: &Address, - reader: &mut LineReader, pattern: &mut IOChunk, context: &ProcessingContext, ) -> UResult { @@ -40,7 +39,7 @@ fn match_address( Ok(false) } } - AddressType::Last => Ok(reader.is_last_line()?), + AddressType::Last => Ok(context.last_line), _ => panic!("invalid address type in match_address"), } } @@ -49,7 +48,6 @@ fn match_address( /// Return true if the command applies to the given pattern. fn applies( command: &mut Command, - reader: &mut LineReader, pattern: &mut IOChunk, context: &mut ProcessingContext, ) -> UResult { @@ -73,7 +71,7 @@ fn applies( } } _ => { - if match_address(addr2, reader, pattern, context)? { + if match_address(addr2, pattern, context)? { command.start_line = None; context.last_address = true; Ok(true) @@ -94,7 +92,7 @@ fn applies( } } } else if let Some(addr1) = &command.addr1 { - if match_address(addr1, reader, pattern, context)? { + if match_address(addr1, pattern, context)? { match addr2.atype { AddressType::Line => { if let AddressValue::LineNumber(n) = addr2.value { @@ -124,7 +122,7 @@ fn applies( Ok(false) } } else if let Some(addr1) = &command.addr1 { - Ok(match_address(addr1, reader, pattern, context)?) + Ok(match_address(addr1, pattern, context)?) 
} else { Ok(false) }; @@ -143,8 +141,8 @@ fn process_file( output: &mut OutputBuffer, context: &mut ProcessingContext, ) -> UResult<()> { - while let Some(p) = reader.get_line()? { - let mut pattern = p; + while let Some((mut pattern, last_line)) = reader.get_line()? { + context.last_line = last_line; context.line_number += 1; let mut current: Option>> = commands.clone(); while let Some(command_rc) = current { diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 9983dc90..1a1cc007 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -200,6 +200,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { // Other context line_number: 0, last_address: false, + last_line: false, } } From 774026370b492ce90a6a14b086bb300b95c11c23 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 9 May 2025 16:30:29 +0300 Subject: [PATCH 33/85] Implement and pass address range tests - Add file/script based tests and macro to shorten them. - Implement required "p" command. - Add last_file in the processing context to recognize last line. - Fix initialization of start_line. - Implement required -n (--silent --quiet) flag. 
--- src/uu/sed/src/command.rs | 4 +- src/uu/sed/src/processor.rs | 50 ++++++++++--- src/uu/sed/src/sed.rs | 1 + tests/by-util/test_sed.rs | 74 +++++++++++++++++++ tests/fixtures/sed/lines1 | 14 ++++ tests/fixtures/sed/lines2 | 9 +++ tests/fixtures/sed/output/addr_append_empty | 0 .../sed/output/addr_escaped_delimiter | 1 + tests/fixtures/sed/output/addr_found | 1 + tests/fixtures/sed/output/addr_found_space | 1 + tests/fixtures/sed/output/addr_last_one_file | 1 + tests/fixtures/sed/output/addr_last_two_files | 1 + .../fixtures/sed/output/addr_last_with_empty | 1 + tests/fixtures/sed/output/addr_not_found | 0 .../sed/output/addr_numeric_relative_straddle | 2 + .../sed/output/addr_numeric_to_relative | 5 ++ tests/fixtures/sed/output/addr_one_line | 1 + tests/fixtures/sed/output/addr_past_last | 0 .../sed/output/addr_pattern_range_reverse | 1 + .../fixtures/sed/output/addr_pattern_straddle | 7 ++ .../fixtures/sed/output/addr_pattern_to_last | 20 +++++ .../sed/output/addr_pattern_to_pattern | 17 +++++ .../sed/output/addr_pattern_to_relative | 3 + .../sed/output/addr_pattern_to_straddle | 17 +++++ tests/fixtures/sed/output/addr_range_numeric | 4 + tests/fixtures/sed/output/addr_range_reverse | 1 + tests/fixtures/sed/output/addr_range_to_last | 23 ++++++ .../fixtures/sed/output/addr_range_to_pattern | 23 ++++++ tests/fixtures/sed/output/addr_straddle | 1 + 29 files changed, 270 insertions(+), 13 deletions(-) create mode 100644 tests/fixtures/sed/lines1 create mode 100644 tests/fixtures/sed/lines2 create mode 100644 tests/fixtures/sed/output/addr_append_empty create mode 100644 tests/fixtures/sed/output/addr_escaped_delimiter create mode 100644 tests/fixtures/sed/output/addr_found create mode 100644 tests/fixtures/sed/output/addr_found_space create mode 100644 tests/fixtures/sed/output/addr_last_one_file create mode 100644 tests/fixtures/sed/output/addr_last_two_files create mode 100644 tests/fixtures/sed/output/addr_last_with_empty create mode 100644 
tests/fixtures/sed/output/addr_not_found create mode 100644 tests/fixtures/sed/output/addr_numeric_relative_straddle create mode 100644 tests/fixtures/sed/output/addr_numeric_to_relative create mode 100644 tests/fixtures/sed/output/addr_one_line create mode 100644 tests/fixtures/sed/output/addr_past_last create mode 100644 tests/fixtures/sed/output/addr_pattern_range_reverse create mode 100644 tests/fixtures/sed/output/addr_pattern_straddle create mode 100644 tests/fixtures/sed/output/addr_pattern_to_last create mode 100644 tests/fixtures/sed/output/addr_pattern_to_pattern create mode 100644 tests/fixtures/sed/output/addr_pattern_to_relative create mode 100644 tests/fixtures/sed/output/addr_pattern_to_straddle create mode 100644 tests/fixtures/sed/output/addr_range_numeric create mode 100644 tests/fixtures/sed/output/addr_range_reverse create mode 100644 tests/fixtures/sed/output/addr_range_to_last create mode 100644 tests/fixtures/sed/output/addr_range_to_pattern create mode 100644 tests/fixtures/sed/output/addr_straddle diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index f05557c9..f8d6e278 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -44,6 +44,8 @@ pub struct ProcessingContext { pub last_address: bool, /// True if the line read is the last line pub last_line: bool, + /// True if the file is the last file of the ones specified + pub last_file: bool, } // The specification of a script: through a string or a file @@ -124,7 +126,7 @@ impl Default for Command { addr1: None, addr2: None, non_select: false, - start_line: Some(0), + start_line: None, text: None, data: CommandData::None, next: None, diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 7cbd8af5..fe9e8a02 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -32,6 +32,7 @@ fn match_address( Ok(false) } } + AddressType::Line => { if let AddressValue::LineNumber(lineno) = addr.value { Ok(context.line_number == 
lineno) @@ -39,7 +40,15 @@ fn match_address( Ok(false) } } - AddressType::Last => Ok(context.last_line), + + // Recognize "$" as the last line of last file. This is consistent + // with the original 7th Research Edition implementation: + // https://github.com/dspinellis/unix-history-repo/blob/Research-V7/usr/src/cmd/sed/sed1.c#L665 + // The FreeBSD version checked for subsequent empty files, but this + // can lead to destructive reads (e.g. from named pipes), + // and is probably an overkill. + AddressType::Last => Ok(context.last_line && (context.last_file || context.separate)), + _ => panic!("invalid address type in match_address"), } } @@ -134,6 +143,21 @@ fn applies( } } +/// Write the specified chunk to the output for a given processing context. +fn write_chunk( + output: &mut OutputBuffer, + context: &ProcessingContext, + chunk: &IOChunk, +) -> std::io::Result<()> { + output.write_chunk(chunk)?; + + if context.unbuffered { + output.flush()?; + } + + Ok(()) +} + /// Process a single input file fn process_file( commands: &Option>>, @@ -146,11 +170,9 @@ fn process_file( context.line_number += 1; let mut current: Option>> = commands.clone(); while let Some(command_rc) = current { - let command = command_rc.borrow(); + let mut command = command_rc.borrow_mut(); - // Not compiled until the double-borrow of reader is resolved. - #[cfg(any())] - if !applies(&mut command, reader, &mut pattern, context)? { + if !applies(&mut command, &mut pattern, context)? { // Advance to next command current = command.next.clone(); continue; @@ -173,6 +195,7 @@ fn process_file( // TODO } 'd' => { + // Delete the pattern space and start the next cycle. pattern.clear(); break; } @@ -204,7 +227,8 @@ fn process_file( // TODO } 'p' => { - // TODO + // Write the pattern space to standard output. 
+ write_chunk(output, context, &pattern)?; } 'P' => { // TODO @@ -246,9 +270,8 @@ fn process_file( current = command.next.clone(); } - output.write_chunk(&pattern)?; - if context.unbuffered { - output.flush()?; + if !context.quiet { + write_chunk(output, context, &pattern)?; } } Ok(()) @@ -263,9 +286,12 @@ pub fn process_all_files( context.unbuffered = context.unbuffered || atty::is(Stream::Stdout); let mut in_place = InPlace::new(context.clone())?; - for path in files { - let mut reader = LineReader::open(&path)?; - let output = in_place.begin(&path)?; + let last_file_index = files.len() - 1; + + for (index, path) in files.iter().enumerate() { + context.last_file = index == last_file_index; + let mut reader = LineReader::open(path)?; + let output = in_place.begin(path)?; if context.separate { context.line_number = 0; diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 1a1cc007..c5ab4b9e 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -201,6 +201,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { line_number: 0, last_address: false, last_line: false, + last_file: false, } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index c302ab88..60bb7e72 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -111,3 +111,77 @@ fn test_delete_file() { .no_stdout(); } } + +/// Create a new test function to verify an execution for specified output. +macro_rules! 
check_output { + ($name:ident, $args:expr) => { + #[test] + fn $name() { + new_ucmd!() + .args(&$args) + .succeeds() + .stdout_is_fixture(&format!("output/{}", stringify!($name))); + } + }; +} + +// Test address ranges +check_output!(addr_one_line, ["-n", "-e", "4p", "lines1"]); +check_output!(addr_straddle, ["-n", "-e", "20p", "lines1", "lines2"]); +check_output!(addr_last_one_file, ["-n", "-e", "$p", "lines1"]); +check_output!(addr_last_two_files, ["-n", "-e", "$p", "lines1", "lines2"]); + +// TODO: Enable and configure for Unix/Windows, when "a" is implemented. +#[cfg(any())] +check_output!(addr_append_with_empty, ["-e", "$a\nhello", "/dev/null"]); + +#[cfg(unix)] +check_output!( + addr_last_with_empty, + ["-n", "-e", "$p", "lines1", "/dev/null", "lines2"] +); + +#[cfg(windows)] +check_output!( + addr_last_with_empty, + ["-n", "-e", "$p", "lines1", "NUL", "lines2"] +); + +check_output!(addr_past_last, ["-n", "-e", "20p", "lines1"]); +check_output!(addr_not_found, ["-n", "-e", "/NOTFOUND/p", "lines1"]); +check_output!(addr_found, ["-n", "/l1_7/p", "lines1"]); +check_output!(addr_found_space, ["-n", " /l1_7/ p", "lines1"]); +check_output!(addr_escaped_delimiter, ["-n", "\\_l1\\_7_p", "lines1"]); +check_output!(addr_range_numeric, ["-n", "1,4p", "lines1"]); +check_output!(addr_range_to_last, ["-n", "1,$p", "lines1", "lines2"]); +check_output!( + addr_range_to_pattern, + ["-n", "1,/l2_9/p", "lines1", "lines2"] +); +check_output!(addr_pattern_to_last, ["-n", "/4/,$p", "lines1", "lines2"]); +check_output!( + addr_pattern_to_straddle, + ["-n", "/4/,20p", "lines1", "lines2"] +); +check_output!( + addr_pattern_to_pattern, + ["-n", "/4/,/10/p", "lines1", "lines2"] +); +check_output!( + addr_pattern_straddle, + ["-n", "/l2_3/,/l1_8/p", "lines1", "lines2"] +); +check_output!(addr_range_reverse, ["-n", "12,3p", "lines1", "lines2"]); +check_output!( + addr_pattern_range_reverse, + ["-n", "/l1_7/,3p", "lines1", "lines2"] +); +check_output!( + addr_numeric_to_relative, + ["-n", 
"13,+4p", "lines1", "lines2"] +); +check_output!( + addr_pattern_to_relative, + ["-n", "/l1_6/,+2p", "lines1", "lines2"] +); +check_output!(addr_numeric_relative_straddle, ["-n", "12,+1p", "lines1"]); diff --git a/tests/fixtures/sed/lines1 b/tests/fixtures/sed/lines1 new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/lines1 @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/lines2 b/tests/fixtures/sed/lines2 new file mode 100644 index 00000000..d2ff3827 --- /dev/null +++ b/tests/fixtures/sed/lines2 @@ -0,0 +1,9 @@ +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/addr_append_empty b/tests/fixtures/sed/output/addr_append_empty new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/addr_escaped_delimiter b/tests/fixtures/sed/output/addr_escaped_delimiter new file mode 100644 index 00000000..8e6f085d --- /dev/null +++ b/tests/fixtures/sed/output/addr_escaped_delimiter @@ -0,0 +1 @@ +l1_7 diff --git a/tests/fixtures/sed/output/addr_found b/tests/fixtures/sed/output/addr_found new file mode 100644 index 00000000..8e6f085d --- /dev/null +++ b/tests/fixtures/sed/output/addr_found @@ -0,0 +1 @@ +l1_7 diff --git a/tests/fixtures/sed/output/addr_found_space b/tests/fixtures/sed/output/addr_found_space new file mode 100644 index 00000000..8e6f085d --- /dev/null +++ b/tests/fixtures/sed/output/addr_found_space @@ -0,0 +1 @@ +l1_7 diff --git a/tests/fixtures/sed/output/addr_last_one_file b/tests/fixtures/sed/output/addr_last_one_file new file mode 100644 index 00000000..6165ce87 --- /dev/null +++ b/tests/fixtures/sed/output/addr_last_one_file @@ -0,0 +1 @@ +l1_14 diff --git a/tests/fixtures/sed/output/addr_last_two_files b/tests/fixtures/sed/output/addr_last_two_files new file mode 100644 index 00000000..1502f007 --- /dev/null +++ b/tests/fixtures/sed/output/addr_last_two_files @@ -0,0 
+1 @@ +l2_9 diff --git a/tests/fixtures/sed/output/addr_last_with_empty b/tests/fixtures/sed/output/addr_last_with_empty new file mode 100644 index 00000000..1502f007 --- /dev/null +++ b/tests/fixtures/sed/output/addr_last_with_empty @@ -0,0 +1 @@ +l2_9 diff --git a/tests/fixtures/sed/output/addr_not_found b/tests/fixtures/sed/output/addr_not_found new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/addr_numeric_relative_straddle b/tests/fixtures/sed/output/addr_numeric_relative_straddle new file mode 100644 index 00000000..543c0574 --- /dev/null +++ b/tests/fixtures/sed/output/addr_numeric_relative_straddle @@ -0,0 +1,2 @@ +l1_12 +l1_13 diff --git a/tests/fixtures/sed/output/addr_numeric_to_relative b/tests/fixtures/sed/output/addr_numeric_to_relative new file mode 100644 index 00000000..1a9e0660 --- /dev/null +++ b/tests/fixtures/sed/output/addr_numeric_to_relative @@ -0,0 +1,5 @@ +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 diff --git a/tests/fixtures/sed/output/addr_one_line b/tests/fixtures/sed/output/addr_one_line new file mode 100644 index 00000000..a7c92f0e --- /dev/null +++ b/tests/fixtures/sed/output/addr_one_line @@ -0,0 +1 @@ +l1_4 diff --git a/tests/fixtures/sed/output/addr_past_last b/tests/fixtures/sed/output/addr_past_last new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/addr_pattern_range_reverse b/tests/fixtures/sed/output/addr_pattern_range_reverse new file mode 100644 index 00000000..8e6f085d --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_range_reverse @@ -0,0 +1 @@ +l1_7 diff --git a/tests/fixtures/sed/output/addr_pattern_straddle b/tests/fixtures/sed/output/addr_pattern_straddle new file mode 100644 index 00000000..c4d558b1 --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_straddle @@ -0,0 +1,7 @@ +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/addr_pattern_to_last b/tests/fixtures/sed/output/addr_pattern_to_last new file mode 100644 
index 00000000..1f20a523 --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_to_last @@ -0,0 +1,20 @@ +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/addr_pattern_to_pattern b/tests/fixtures/sed/output/addr_pattern_to_pattern new file mode 100644 index 00000000..df7978fd --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_to_pattern @@ -0,0 +1,17 @@ +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/addr_pattern_to_relative b/tests/fixtures/sed/output/addr_pattern_to_relative new file mode 100644 index 00000000..b1827918 --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_to_relative @@ -0,0 +1,3 @@ +l1_6 +l1_7 +l1_8 diff --git a/tests/fixtures/sed/output/addr_pattern_to_straddle b/tests/fixtures/sed/output/addr_pattern_to_straddle new file mode 100644 index 00000000..4d8dc1e3 --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_to_straddle @@ -0,0 +1,17 @@ +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 diff --git a/tests/fixtures/sed/output/addr_range_numeric b/tests/fixtures/sed/output/addr_range_numeric new file mode 100644 index 00000000..ddd49762 --- /dev/null +++ b/tests/fixtures/sed/output/addr_range_numeric @@ -0,0 +1,4 @@ +l1_1 +l1_2 +l1_3 +l1_4 diff --git a/tests/fixtures/sed/output/addr_range_reverse b/tests/fixtures/sed/output/addr_range_reverse new file mode 100644 index 00000000..7fb81db4 --- /dev/null +++ b/tests/fixtures/sed/output/addr_range_reverse @@ -0,0 +1 @@ +l1_12 diff --git a/tests/fixtures/sed/output/addr_range_to_last b/tests/fixtures/sed/output/addr_range_to_last new file mode 100644 index 00000000..d08d35c6 --- /dev/null +++ b/tests/fixtures/sed/output/addr_range_to_last @@ -0,0 +1,23 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 
+l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/addr_range_to_pattern b/tests/fixtures/sed/output/addr_range_to_pattern new file mode 100644 index 00000000..d08d35c6 --- /dev/null +++ b/tests/fixtures/sed/output/addr_range_to_pattern @@ -0,0 +1,23 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/addr_straddle b/tests/fixtures/sed/output/addr_straddle new file mode 100644 index 00000000..fe925b0a --- /dev/null +++ b/tests/fixtures/sed/output/addr_straddle @@ -0,0 +1 @@ +l2_6 From d3b0cc41de031be5cb00022ada5a683ab576f6cd Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 9 May 2025 17:52:03 +0300 Subject: [PATCH 34/85] Clarify extensions and incompatibilities --- README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 023249c4..64e6525e 100644 --- a/README.md +++ b/README.md @@ -25,22 +25,27 @@ cargo build --release cargo run --release ``` ## Extensions and incompatibilities -### GNU +### Supported GNU extensions * Command-line arguments can be specified in long (`--`) form. * Spaces can precede a regular expression modifier. -### BSD and GNU +### Supported BSD and GNU extensions +* The second address in a range can be specified as a relative address with +N. + +### New extensions +* Unicode characters can be specified in regular expression pattern, replacement + and transliteration sequences using `\uXXXX` or `\UXXXXXXXX` sequences. + +### Incompatibilities * The input is assumed to be valid UTF-8 (this includes 7-bit ASCII). If the input is in another code page, consider converting it through UTF-8 in order to avoid errors on invalid UTF-8 sequences and for the correct handling of regular expressions.
This _sed_ program can also handle arbitrary byte sequences if no part of the input is treated as string. -* The second address in a range can be specified as a relative address with +N. - -### Other -* Unicode characters can be specified in regular expression pattern, replacement - and transliteration sequences using `\uXXXX` or `\UXXXXXXXX` sequences. +- The last line (`$`) address is interpreted as the last non-empty line of + the last file. If files specified in subsequent arguments until the last + one are empty, then the last line condition will never be triggered. ## License From d2f59faf1544cca600afba47432f02c3ab4140ac Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 09:06:17 +0300 Subject: [PATCH 35/85] Add tests for --separate --- tests/by-util/test_sed.rs | 5 +++++ tests/fixtures/sed/output/addr_first_separate | 2 ++ tests/fixtures/sed/output/addr_last_separate | 2 ++ 3 files changed, 9 insertions(+) create mode 100644 tests/fixtures/sed/output/addr_first_separate create mode 100644 tests/fixtures/sed/output/addr_last_separate diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 60bb7e72..0e587b5f 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -185,3 +185,8 @@ check_output!( ["-n", "/l1_6/,+2p", "lines1", "lines2"] ); check_output!(addr_numeric_relative_straddle, ["-n", "12,+1p", "lines1"]); +check_output!( + addr_first_separate, + ["-n", "--separate", "1p", "lines1", "lines2"] +); +check_output!(addr_last_separate, ["-ns", "$p", "lines1", "lines2"]); diff --git a/tests/fixtures/sed/output/addr_first_separate b/tests/fixtures/sed/output/addr_first_separate new file mode 100644 index 00000000..07b5fe88 --- /dev/null +++ b/tests/fixtures/sed/output/addr_first_separate @@ -0,0 +1,2 @@ +l1_1 +l2_1 diff --git a/tests/fixtures/sed/output/addr_last_separate b/tests/fixtures/sed/output/addr_last_separate new file mode 100644 index 00000000..3c6d6e9f --- /dev/null +++ 
b/tests/fixtures/sed/output/addr_last_separate @@ -0,0 +1,2 @@ +l1_14 +l2_9 From c21a9f26660c781467a219b145607a44543df4ab Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 09:12:02 +0300 Subject: [PATCH 36/85] Move input files to separate directory --- tests/by-util/test_sed.rs | 64 ++++++++++--------- tests/fixtures/sed/{ => input}/dots-4k.txt | 0 tests/fixtures/sed/{ => input}/dots-64k.txt | 0 tests/fixtures/sed/{ => input}/dots-8k.txt | 0 tests/fixtures/sed/{ => input}/lines1 | 0 tests/fixtures/sed/{ => input}/lines2 | 0 .../fixtures/sed/{ => input}/no-new-line.txt | 0 tests/fixtures/sed/{ => input}/two-lines.txt | 0 8 files changed, 34 insertions(+), 30 deletions(-) rename tests/fixtures/sed/{ => input}/dots-4k.txt (100%) rename tests/fixtures/sed/{ => input}/dots-64k.txt (100%) rename tests/fixtures/sed/{ => input}/dots-8k.txt (100%) rename tests/fixtures/sed/{ => input}/lines1 (100%) rename tests/fixtures/sed/{ => input}/lines2 (100%) rename tests/fixtures/sed/{ => input}/no-new-line.txt (100%) rename tests/fixtures/sed/{ => input}/two-lines.txt (100%) diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 0e587b5f..cb67b2c9 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -63,11 +63,11 @@ fn test_f_script_ok() { } const INPUT_FILES: &[&str] = &[ - "two-lines.txt", - "no-new-line.txt", - "dots-4k.txt", - "dots-8k.txt", - "dots-64k.txt", + "input/two-lines.txt", + "input/no-new-line.txt", + "input/dots-4k.txt", + "input/dots-8k.txt", + "input/dots-64k.txt", ]; #[test] @@ -125,11 +125,15 @@ macro_rules! 
check_output { }; } +// Input files +const LINES1: &str = "input/lines1"; +const LINES2: &str = "input/lines2"; + // Test address ranges -check_output!(addr_one_line, ["-n", "-e", "4p", "lines1"]); -check_output!(addr_straddle, ["-n", "-e", "20p", "lines1", "lines2"]); -check_output!(addr_last_one_file, ["-n", "-e", "$p", "lines1"]); -check_output!(addr_last_two_files, ["-n", "-e", "$p", "lines1", "lines2"]); +check_output!(addr_one_line, ["-n", "-e", "4p", LINES1]); +check_output!(addr_straddle, ["-n", "-e", "20p", LINES1, LINES2]); +check_output!(addr_last_one_file, ["-n", "-e", "$p", LINES1]); +check_output!(addr_last_two_files, ["-n", "-e", "$p", LINES1, LINES2]); // TODO: Enable and configure for Unix/Windows, when "a" is implemented. #[cfg(any())] @@ -138,55 +142,55 @@ check_output!(addr_append_with_empty, ["-e", "$a\nhello", "/dev/null"]); #[cfg(unix)] check_output!( addr_last_with_empty, - ["-n", "-e", "$p", "lines1", "/dev/null", "lines2"] + ["-n", "-e", "$p", LINES1, "/dev/null", LINES2] ); #[cfg(windows)] check_output!( addr_last_with_empty, - ["-n", "-e", "$p", "lines1", "NUL", "lines2"] + ["-n", "-e", "$p", LINES1, "NUL", LINES2] ); -check_output!(addr_past_last, ["-n", "-e", "20p", "lines1"]); -check_output!(addr_not_found, ["-n", "-e", "/NOTFOUND/p", "lines1"]); -check_output!(addr_found, ["-n", "/l1_7/p", "lines1"]); -check_output!(addr_found_space, ["-n", " /l1_7/ p", "lines1"]); -check_output!(addr_escaped_delimiter, ["-n", "\\_l1\\_7_p", "lines1"]); -check_output!(addr_range_numeric, ["-n", "1,4p", "lines1"]); -check_output!(addr_range_to_last, ["-n", "1,$p", "lines1", "lines2"]); +check_output!(addr_past_last, ["-n", "-e", "20p", LINES1]); +check_output!(addr_not_found, ["-n", "-e", "/NOTFOUND/p", LINES1]); +check_output!(addr_found, ["-n", "/l1_7/p", LINES1]); +check_output!(addr_found_space, ["-n", " /l1_7/ p", LINES1]); +check_output!(addr_escaped_delimiter, ["-n", "\\_l1\\_7_p", LINES1]); +check_output!(addr_range_numeric, ["-n", "1,4p", 
LINES1]); +check_output!(addr_range_to_last, ["-n", "1,$p", LINES1, LINES2]); check_output!( addr_range_to_pattern, - ["-n", "1,/l2_9/p", "lines1", "lines2"] + ["-n", "1,/l2_9/p", LINES1, LINES2] ); -check_output!(addr_pattern_to_last, ["-n", "/4/,$p", "lines1", "lines2"]); +check_output!(addr_pattern_to_last, ["-n", "/4/,$p", LINES1, LINES2]); check_output!( addr_pattern_to_straddle, - ["-n", "/4/,20p", "lines1", "lines2"] + ["-n", "/4/,20p", LINES1, LINES2] ); check_output!( addr_pattern_to_pattern, - ["-n", "/4/,/10/p", "lines1", "lines2"] + ["-n", "/4/,/10/p", LINES1, LINES2] ); check_output!( addr_pattern_straddle, - ["-n", "/l2_3/,/l1_8/p", "lines1", "lines2"] + ["-n", "/l2_3/,/l1_8/p", LINES1, LINES2] ); -check_output!(addr_range_reverse, ["-n", "12,3p", "lines1", "lines2"]); +check_output!(addr_range_reverse, ["-n", "12,3p", LINES1, LINES2]); check_output!( addr_pattern_range_reverse, - ["-n", "/l1_7/,3p", "lines1", "lines2"] + ["-n", "/l1_7/,3p", LINES1, LINES2] ); check_output!( addr_numeric_to_relative, - ["-n", "13,+4p", "lines1", "lines2"] + ["-n", "13,+4p", LINES1, LINES2] ); check_output!( addr_pattern_to_relative, - ["-n", "/l1_6/,+2p", "lines1", "lines2"] + ["-n", "/l1_6/,+2p", LINES1, LINES2] ); -check_output!(addr_numeric_relative_straddle, ["-n", "12,+1p", "lines1"]); +check_output!(addr_numeric_relative_straddle, ["-n", "12,+1p", LINES1]); check_output!( addr_first_separate, - ["-n", "--separate", "1p", "lines1", "lines2"] + ["-n", "--separate", "1p", LINES1, LINES2] ); -check_output!(addr_last_separate, ["-ns", "$p", "lines1", "lines2"]); +check_output!(addr_last_separate, ["-ns", "$p", LINES1, LINES2]); diff --git a/tests/fixtures/sed/dots-4k.txt b/tests/fixtures/sed/input/dots-4k.txt similarity index 100% rename from tests/fixtures/sed/dots-4k.txt rename to tests/fixtures/sed/input/dots-4k.txt diff --git a/tests/fixtures/sed/dots-64k.txt b/tests/fixtures/sed/input/dots-64k.txt similarity index 100% rename from tests/fixtures/sed/dots-64k.txt 
rename to tests/fixtures/sed/input/dots-64k.txt diff --git a/tests/fixtures/sed/dots-8k.txt b/tests/fixtures/sed/input/dots-8k.txt similarity index 100% rename from tests/fixtures/sed/dots-8k.txt rename to tests/fixtures/sed/input/dots-8k.txt diff --git a/tests/fixtures/sed/lines1 b/tests/fixtures/sed/input/lines1 similarity index 100% rename from tests/fixtures/sed/lines1 rename to tests/fixtures/sed/input/lines1 diff --git a/tests/fixtures/sed/lines2 b/tests/fixtures/sed/input/lines2 similarity index 100% rename from tests/fixtures/sed/lines2 rename to tests/fixtures/sed/input/lines2 diff --git a/tests/fixtures/sed/no-new-line.txt b/tests/fixtures/sed/input/no-new-line.txt similarity index 100% rename from tests/fixtures/sed/no-new-line.txt rename to tests/fixtures/sed/input/no-new-line.txt diff --git a/tests/fixtures/sed/two-lines.txt b/tests/fixtures/sed/input/two-lines.txt similarity index 100% rename from tests/fixtures/sed/two-lines.txt rename to tests/fixtures/sed/input/two-lines.txt From 42268278655635696a30e3a9f824d16a639222d3 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 09:15:17 +0300 Subject: [PATCH 37/85] Add torture scripts For now, as an aspiration and to populate the placeholder directory. --- tests/fixtures/sed/script/hanoi.sed | 98 +++++++ tests/fixtures/sed/script/math.sed | 438 ++++++++++++++++++++++++++++ 2 files changed, 536 insertions(+) create mode 100644 tests/fixtures/sed/script/hanoi.sed create mode 100644 tests/fixtures/sed/script/math.sed diff --git a/tests/fixtures/sed/script/hanoi.sed b/tests/fixtures/sed/script/hanoi.sed new file mode 100644 index 00000000..e93d358b --- /dev/null +++ b/tests/fixtures/sed/script/hanoi.sed @@ -0,0 +1,98 @@ +# Towers of Hanoi in sed. 
+# Ex: +# Run "sed -f hanoi.sed", and enter: +# +# :abcd: : : +# +# note -- TWO carriage returns were once required, this will output the +# sequence of states involved in moving 4 rings, the largest called "a" and +# the smallest called "d", from the first to the second of three towers, so +# that the rings on any tower at any time are in descending order of size. +# You can start with a different arrangement and a different number of rings, +# say :ce:b:ax: and it will give the shortest procedure for moving them all +# to the middle tower. The rules are: the names of the rings must all be +# lower-case letters, they must be input within 3 fields (representing the +# towers) and delimited by 4 colons, such that the letters within each field +# are in alphabetical order (i.e. rings are in descending order of size). +# +# For the benefit of anyone who wants to figure out the script, an "internal" +# line of the form +# b:0abx:1a2b3 :2 :3x2 +# has the following meaning: the material after the three markers :1, :2, +# and :3 represents the three towers; in this case the current set-up is +# ":ab : :x :". The numbers after a, b and x in these fields indicate +# that the next time it gets a chance, it will move a to tower 2, move b +# to tower 3, and move x to tower 2. The string after :0 just keeps track +# of the alphabetical order of the names of the rings. The b at the +# beginning means that it is now dealing with ring b (either about to move +# it, or re-evaluating where it should next be moved to). +# +# Although this version is "limited" to 26 rings because of the size of the +# alphabet, one could write a script using the same idea in which the rings +# were represented by arbitrary [strings][within][brackets], and in place of +# the built-in line of the script giving the order of the letters of the +# alphabet, it would accept from the user a line giving the ordering to be +# assumed, e.g. [ucbvax][decvax][hplabs][foo][bar]. 
+# +# George Bergman +# Math, UC Berkeley 94720 USA + +# cleaning, diagnostics +s/ *//g +/^$/d +/[^a-z:]/{a\ +Illegal characters: use only a-z and ":". Try again. +d +} +/^:[a-z]*:[a-z]*:[a-z]*:$/!{a\ +Incorrect format: use\ +\ : string1 : string2 : string3 :\ +Try again. +d +} +/\([a-z]\).*\1/{a\ +Repeated letters not allowed. Try again. +d +} +# initial formatting +h +s/[a-z]/ /g +G +s/^:\( *\):\( *\):\( *\):\n:\([a-z]*\):\([a-z]*\):\([a-z]*\):$/:1\4\2\3:2\5\1\3:3\6\1\2:0/ +s/[a-z]/&2/g +s/^/abcdefghijklmnopqrstuvwxyz/ +:a +s/^\(.\).*\1.*/&\1/ +s/.// +/^[^:]/ba +s/\([^0]*\)\(:0.*\)/\2\1:/ +s/^[^0]*0\(.\)/\1&/ +:b +# outputting current state without markers +h +s/.*:1/:/ +s/[123]//gp +g +:c +# establishing destinations +/^\(.\).*\1:1/td +/^\(.\).*:1[^:]*\11/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\31/ +/^\(.\).*:1[^:]*\12/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\33/ +/^\(.\).*:1[^:]*\13/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\32/ +/^\(.\).*:2[^:]*\11/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\33/ +/^\(.\).*:2[^:]*\12/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\32/ +/^\(.\).*:2[^:]*\13/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\31/ +/^\(.\).*:3[^:]*\11/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\32/ +/^\(.\).*:3[^:]*\12/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\31/ +/^\(.\).*:3[^:]*\13/s/^\(.\)\(.*\1\([a-z]\).*\)\3./\3\2\33/ +bc +# iterate back to find smallest out-of-place ring +:d +s/^\(.\)\(:0[^:]*\([^:]\)\1.*:\([123]\)[^:]*\1\)\4/\3\2\4/ +td +# move said ring (right, resp. left) +s/^\(.\)\(.*\)\1\([23]\)\(.*:\3[^ ]*\) /\1\2 \4\1\3/ +s/^\(.\)\(.*:\([12]\)[^ ]*\) \(.*\)\1\3/\1\2\1\3\4 / +tb +s/.*/Done! Try another, or end with ^D./p +d diff --git a/tests/fixtures/sed/script/math.sed b/tests/fixtures/sed/script/math.sed new file mode 100644 index 00000000..509e6c3e --- /dev/null +++ b/tests/fixtures/sed/script/math.sed @@ -0,0 +1,438 @@ +# This is ksb's infamous sed calculator. (ksb@sa.fedex.com) +# +# +# $Id: math.sed,v 2.5 1998/08/02 13:23:34 ksb Exp ksb $ +# expr ::= (expr) | expr! 
| +# expr ^ expr | +# -expr | expr * expr | expr / expr | expr % expr | +# expr + expr | expr - expr | +# [0-9][0-9]* ; +# Bugs: some sign combinations don't work, and I got sick of added cases +# for unary +. Don't depend on signed math working all the time. -- ksb +# +# $Compile: echo "4+7*3+2^7/3" | sed -f %f + +# make sure the expression is well formed +s/[ ]//g +/[*\/^%+-]$/{ + a\ + poorly formed expression, dyadic operator on the end + q +} +/^[*\/^%]/{ + a\ + poorly formed expression, leading dyadic operator + q +} + +# fill hold space with done token +x +s/^.*/done/ +x + +# main loop, process operators ((), !, *, /, %, +, and -) +: loop +# uncomment the print below to follow the "logic" -- ksb +#p +/^[+]/{ + s/// + b loop +} +/^--/{ + s/// + b loop +} +# eval parenthesised sub expressions first +/^\(.*\)(\([^)]*\))\(.*\)$/{ + H + s//\2/ + x + s/^\(.*\)\n\(.*\)(\([^()]*\))\(.*\)$/()\2@\4@\1/ + x + b loop +} +# reduce a^b^c -> a^(b^c) +/\([0-9][0-9]*^\)\([0-9][0-9]*^[0-9][0-9^]*\)/{ + s//\1(\2)/ + b loop +} +# pull any buried exponents +/^\(.*[^0-9]\)\([0-9][0-9]*^[0-9][0-9]*\)$/{ + s//\1(\2)/ + b loop +} +/^\(.*[^0-9]\)\([0-9][0-9]*^[0-9][0-9]*\)\([^0-9].*\)$/{ + s//\1(\2)\3/ + b loop +} +/^\([0-9][0-9]*^[0-9][0-9]*\)\([^0-9].*\)$/{ + s//(\1)\2/ + b loop +} +/^\([-]*[0-9]*\)^0*$/{ + s//1/ + b loop +} +/^\([-]*[0-9]*\)^0*1$/{ + s//\1/ + b loop +} +/^\([-]*[0-9]*\)^-[0-9]*$/{ + s//0/ + b loop +} +/^\([-]*\)\([0-9]*\)^\([0-9][0-9]*[13579]\)$/{ + s//\1\2*((\2*\2)^(\3\/2))/ + b loop +} +/^[-]*\([0-9]*\)^\([0-9][0-9]*[02468]\)$/{ + s//(\1*\1)^(\2\/2)/ + b loop +} +# single digit powers (2 3,9 4,6,8 5,7 +/^[-]*\([0-9]*\)^0*2$/{ + s//(\1*\1)/ + b loop +} +/^\([-]*\)\([0-9]*\)^0*\([39]\)$/{ + s//\1(\2*(\2*\2))^(\3\/3)/ + b loop +} +/^[-]*\([0-9]*\)^0*\([468]\)$/{ + s//(\1*\1)^(\2\/2)/ + b loop +} +# 5 7 +/^\([-]*[0-9]*\)^\([0-9]*\)$/{ + s//\1*(\1^(\2-1))/ + b loop +} +# reduce all number factorials +/^0*[01]!/{ + s//1/ + b loop +} +/\([*+-/%^]\)0*[01]!/{ + s//\11/ + 
b loop +} +/\([0-9]*\)!/{ + s//(\1-1)!*\1/ + b loop +} +# sign simplifications +/^-\([0-9]*\)\([*/%]\)-\([0-9]*\)$/{ + s//\1\2\3/ + b loop +} +/^\([0-9]*\)\([*/%]\)-\([0-9]*\)$/{ + s//-\1\2\3/ + b loop +} +/^-\([0-9][0-9]*\)[+]*-\([0-9][0-9]*\)$/{ + s//\1+\2/ + x + s/\(.*\)/()-@@\1/ + x + b loop +} +/^-\([0-9]*\)[+]\([0-9]\)*$/{ + s//\2-\1/ + b loop +} +/^-.*[-+*/%].*/{ + H + s/^-// + x + s/^\(.*\)\n-.*$/()-@@\1/ + x + b loop +} +# can we simplify multiplications +/^\([0-9]*\)\([*][0-9]*[1-9]\)00*$/{ + H + s//\1\2/ + x + s/^\(.*\)\n[0-9]*[*][0-9]*[1-9]\(00*\)$/()@\2@\1/ + x + b loop +} +/^\([0-9][1-9]*\)00*\([*][0-9]*\)$/{ + H + s//\1\2/ + x + s/^\(.*\)\n[0-9][1-9]*\(00*\)[*][0-9]*$/()@\2@\1/ + x + b loop +} +# can we simplify division (20/30 -> 2/3) +/^\([0-9][0-9]*\)0\([/%]\)\([0-9][0-9]*\)0$/{ + s//\1\2\3/ + b loop +} +# n/1 -> n +/^0*\([0-9][0-9]*\)0[/]0*1$/{ + s//\1/ + b loop +} +# n%2 -> last_digit(n)%2 (same for 1, BTW) N.B. NO LOOP +/^[0-9]*\([0-9]\)%0*\([12]\)$/{ + s//\1%\2/ +} +# move any mul/divs to the front via parans +/^\([0-9+]*\)\([-+]\)\([0-9]*[*/][0-9*/]*\)/{ + s//\1\2(\3)/ + b loop +} +# can we div or mul +/^[0-9]*[*][0-9]*$/{ + b mul +} +/^[0-9]*[/%]0*$/{ + i\ +divide by zero + d +} +/^[0-9]*[/%][0-9]*$/{ + H + s/\([0-9]\).*[/%]/\1-/ + x + s/^\(.*\)\n\([0-9]\)\([0-9]*\)\([/%]\)\([0-9]*\).*$/.\4\3q0r\2-\5@\1/ + x + b loop +} +/^\([0-9]*[*/%][0-9]*\)\(.*\)/{ + H + s//\1/ + x + s/^\(.*\)\n\([0-9]*[*/][0-9]*\)\(.*\)$/()@\3@\1/ + x + b loop +} +# can we add or subtract -- note subtract hold expression for underflow +/^[0-9]*[+][0-9]*$/{ + s/$/=/ + b add +} +/^[0-9][0-9]*-[0-9]*$/{ + H + s/$/=/ + b sub +} +/^\([0-9][0-9]*[-+][0-9]*\)\(.*\)/{ + H + s//\1/ + x + s/^\(.*\)\n\([0-9]*[-+][0-9]*\)\(.*\)$/()@\3@\1/ + x + b loop +} +# look in hold space for stack to reduce +x +/^done$/{ + x + s/^0*\([0-9][0-9]*\)/\1/ + p + d +} +# .[/%] numerator q quotient r remainder-divisor @stack +/^\./{ + x + /^[^-]/{ + H + x + 
s/.\(.\)\([0-9]*\)q\([^r]*\)r\([0-9]*\)-\([0-9]*\)@\(.*\)\n\(.*\)/.\1\2q\3+1r\7-\5@\6/ + h + s/..[0-9]*q[^r]*r\([0-9]*-[0-9]*\)@.*/\1/ + b loop + } + /^-/{ + g + /.\(.\)\([0-9]\)\([0-9]*\)q\([^r]*\)r0*\([0-9]*\)-\([^@]*\)@.*/{ + s//\5\2-\6/ + x + s/.\(.\)\([0-9]\)\([0-9]*\)q\([^r]*\)r0*\([0-9]*\)-\([0-9]*\)@\(.*\)/.\1\3q(\4)*10r\5\2-\6@\7/ + x + b loop + } +# no digits to shift on + s/^\.[/]q\([^r]*\)r[^@]*@.*/\1/ + s/^\.[%]q[^r]*r0*\([0-9][0-9]*\)-[^@]*@.*/\1/ + /^\./{ + i\ +divide error + q + } + x + s/^\.[/%]q[^r]*r[^@]*@\(.*\)/\1/ + x + b loop + } +} +/^()/{ + s/// + x + G + s/\(.*\)\n\([^@]*\)@\([^@]*\)@\(.*\)/\2\1\3/ + x + s/[^@]*@[^@]*@\(.*\)/\1/ + x + b loop +} +i\ +help, stack problem - the hold space +p +x +i\ +and the pat space +p +i\ +quit +q + +# turn mul into add until 1*x -> x, 0*x -> 0 +: mul +/^00*\*.*/{ + s//0/ + b loop +} +/^0*1\*/{ + s/// +: leading + s/^0*\([0-9][0-9]*\)/\1/ + b loop +} +s/^\([0-9]*\)0\*\([0-9]*\)/\1*\20/ +s/^\([0-9]*\)1\*\([0-9]*\)/\1*\20+\2/ +s/^\([0-9]*\)2\*\([0-9]*\)/\1*\20+(\2+\2)/ +s/^\([0-9]*\)3\*\([0-9]*\)/\1*\20+(\2+\2+\2)/ +s/^\([0-9]*\)4\*\([0-9]*\)/\1*\20+(\2+\2+\2+\2)/ +s/^\([0-9]*\)5\*\([0-9]*\)/\1*\20+(\2+\2+\2+\2+\2)/ +s/^\([0-9]*\)6\*\([0-9]*\)/\1*\20+(\2+\2+\2+\2+\2+\2)/ +s/^\([0-9]*\)7\*\([0-9]*\)/\1*\20+(\2+\2+\2+\2+\2+\2+\2)/ +s/^\([0-9]*\)8\*\([0-9]*\)/\1*\20+(\2+\2+\2+\2+\2+\2+\2+\2)/ +s/^\([0-9]*\)9\*\([0-9]*\)/\1*\20+(\2+\2+\2+\2+\2+\2+\2+\2+\2)/ +/^0*\*[0-9]*[+]*\(.*\)/{ + s//\1/ + b loop +} +b mul + +# get rid of a plus term until 0+x -> x +: add +/^[+]\([0-9+*]*\)=/{ + s//\1/ + b leading +} +/^\([0-9*]*\)[+]=/{ + s//\1/ + b loop +} +/^\([0-9]*\)0[+]\([0-9]*\)\([0-9]\)=/{ + s//\1+\2=\3/ + b add +} +/^\([0-9]*\)\([0-9]\)[+]\([0-9]*\)0=/{ + s//\1+\3=\2/ + b add +} +s/^\([0-9]*\)1[+]/\10+/ +s/^\([0-9]*\)2[+]/\11+/ +s/^\([0-9]*\)3[+]/\12+/ +s/^\([0-9]*\)4[+]/\13+/ +s/^\([0-9]*\)5[+]/\14+/ +s/^\([0-9]*\)6[+]/\15+/ +s/^\([0-9]*\)7[+]/\16+/ +s/^\([0-9]*\)8[+]/\17+/ +s/^\([0-9]*\)9[+]/\18+/ + 
+s/9=\([0-9]*\)$/_=\1/ +s/8=\([0-9]*\)$/9=\1/ +s/7=\([0-9]*\)$/8=\1/ +s/6=\([0-9]*\)$/7=\1/ +s/5=\([0-9]*\)$/6=\1/ +s/4=\([0-9]*\)$/5=\1/ +s/3=\([0-9]*\)$/4=\1/ +s/2=\([0-9]*\)$/3=\1/ +s/1=\([0-9]*\)$/2=\1/ +/_/{ + s//_0/ + : inc + s/9_/_0/ + s/8_/9/ + s/7_/8/ + s/6_/7/ + s/5_/6/ + s/4_/5/ + s/3_/4/ + s/2_/3/ + s/1_/2/ + s/0_/1/ + s/[+]_/+1/ + /_/b inc +} +b add + +# get rid of a sub term until /-0*=/ or underflow +: sub +/^\([0-9]*\)-0*=/{ + s//\1/ + x + s/\(.*\)\n.*$/\1/ + x + b leading +} +/^-\([0-9].*\)=/{ +: under + g + s/.*\n\([0-9]*\)-\([0-9]*\).*/-(\2-\1)/ + x + s/\(.*\)\n.*/\1/ + x + b loop +} +/^\([0-9]*\)\([0-9]\)-\([0-9]*\)0=/{ + s//\1-\3=\2/ + b sub +} +s/1=/0=/ +s/2=/1=/ +s/3=/2=/ +s/4=/3=/ +s/5=/4=/ +s/6=/5=/ +s/7=/6=/ +s/8=/7=/ +s/9=/8=/ + +s/^\([0-9]*\)1-/\1_-/ +s/^\([0-9]*\)2-/\11-/ +s/^\([0-9]*\)3-/\12-/ +s/^\([0-9]*\)4-/\13-/ +s/^\([0-9]*\)5-/\14-/ +s/^\([0-9]*\)6-/\15-/ +s/^\([0-9]*\)7-/\16-/ +s/^\([0-9]*\)8-/\17-/ +s/^\([0-9]*\)9-/\18-/ +s/^\([0-9]*\)0-/\1'9-/ +s/_/0/ + +: scarry +/0'/{ + s//'9/ + b scarry +} +/^'/{ + b under +} +s/1'/0/ +s/2'/1/ +s/3'/2/ +s/4'/3/ +s/5'/4/ +s/6'/5/ +s/7'/6/ +s/8'/7/ +s/9'/8/ + +b sub From 4be5600f63a9257f25bf5a113f11be671d528f66 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 09:22:07 +0300 Subject: [PATCH 38/85] Add tests for multiple commands These demonstrate that command threading works. 
--- src/uu/sed/src/compiler.rs | 2 +- tests/by-util/test_sed.rs | 23 ++++++------------- .../sed/output/addr_three_lines_semicolon | 3 +++ .../sed/output/addr_two_lines_newline | 2 ++ .../sed/output/addr_two_lines_semicolon | 2 ++ 5 files changed, 15 insertions(+), 17 deletions(-) create mode 100644 tests/fixtures/sed/output/addr_three_lines_semicolon create mode 100644 tests/fixtures/sed/output/addr_two_lines_newline create mode 100644 tests/fixtures/sed/output/addr_two_lines_semicolon diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 6e4a85d1..4e0b6fc0 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -444,7 +444,6 @@ fn compile_command( line.eat_spaces(); if !line.eol() && line.current() == ';' { line.advance(); - // TODO: update link return Ok(ContinueAction::NextChar); } if !line.eol() { @@ -456,6 +455,7 @@ fn compile_command( } } CommandArgs::NonSelect => { // ! + // Implemented at a higher level. } // TODO CommandArgs::Text => { // a c i diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index cb67b2c9..21bdaac5 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -158,19 +158,10 @@ check_output!(addr_found_space, ["-n", " /l1_7/ p", LINES1]); check_output!(addr_escaped_delimiter, ["-n", "\\_l1\\_7_p", LINES1]); check_output!(addr_range_numeric, ["-n", "1,4p", LINES1]); check_output!(addr_range_to_last, ["-n", "1,$p", LINES1, LINES2]); -check_output!( - addr_range_to_pattern, - ["-n", "1,/l2_9/p", LINES1, LINES2] -); +check_output!(addr_range_to_pattern, ["-n", "1,/l2_9/p", LINES1, LINES2]); check_output!(addr_pattern_to_last, ["-n", "/4/,$p", LINES1, LINES2]); -check_output!( - addr_pattern_to_straddle, - ["-n", "/4/,20p", LINES1, LINES2] -); -check_output!( - addr_pattern_to_pattern, - ["-n", "/4/,/10/p", LINES1, LINES2] -); +check_output!(addr_pattern_to_straddle, ["-n", "/4/,20p", LINES1, LINES2]); +check_output!(addr_pattern_to_pattern, ["-n", "/4/,/10/p", LINES1,
LINES2]); check_output!( addr_pattern_straddle, ["-n", "/l2_3/,/l1_8/p", LINES1, LINES2] @@ -180,10 +171,7 @@ check_output!( addr_pattern_range_reverse, ["-n", "/l1_7/,3p", LINES1, LINES2] ); -check_output!( - addr_numeric_to_relative, - ["-n", "13,+4p", LINES1, LINES2] -); +check_output!(addr_numeric_to_relative, ["-n", "13,+4p", LINES1, LINES2]); check_output!( addr_pattern_to_relative, ["-n", "/l1_6/,+2p", LINES1, LINES2] @@ -194,3 +182,6 @@ check_output!( ["-n", "--separate", "1p", LINES1, LINES2] ); check_output!(addr_last_separate, ["-ns", "$p", LINES1, LINES2]); +check_output!(addr_two_lines_semicolon, ["-n", "-e", "4p;8p", LINES1]); +check_output!(addr_two_lines_newline, ["-n", "-e", "4p\n8p", LINES1]); +check_output!(addr_three_lines_semicolon, ["-n", "-e", "4p;8p;1p", LINES1]); diff --git a/tests/fixtures/sed/output/addr_three_lines_semicolon b/tests/fixtures/sed/output/addr_three_lines_semicolon new file mode 100644 index 00000000..d01034e5 --- /dev/null +++ b/tests/fixtures/sed/output/addr_three_lines_semicolon @@ -0,0 +1,3 @@ +l1_1 +l1_4 +l1_8 diff --git a/tests/fixtures/sed/output/addr_two_lines_newline b/tests/fixtures/sed/output/addr_two_lines_newline new file mode 100644 index 00000000..f209cac5 --- /dev/null +++ b/tests/fixtures/sed/output/addr_two_lines_newline @@ -0,0 +1,2 @@ +l1_4 +l1_8 diff --git a/tests/fixtures/sed/output/addr_two_lines_semicolon b/tests/fixtures/sed/output/addr_two_lines_semicolon new file mode 100644 index 00000000..f209cac5 --- /dev/null +++ b/tests/fixtures/sed/output/addr_two_lines_semicolon @@ -0,0 +1,2 @@ +l1_4 +l1_8 From b98a9d4da36553fa3310bfece5a443fab0bcf212 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 11:54:41 +0300 Subject: [PATCH 39/85] Add regular expression replacement template This allows more readable code and probably faster replacements than the original BSD string-based representation. 
--- src/uu/sed/src/command.rs | 199 ++++++++++++++++++++++++++++++++------ 1 file changed, 168 insertions(+), 31 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index f8d6e278..ae5620d4 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -11,12 +11,14 @@ // TODO: remove when compile is implemented #![allow(dead_code)] +use regex::Captures; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; use std::fs::File; use std::path::PathBuf; // For file descriptors and equivalent use std::rc::Rc; +use uucore::error::{UResult, USimpleError}; // Compilation and processing options provided mostly through the // command-line interface @@ -48,17 +50,15 @@ pub struct ProcessingContext { pub last_file: bool, } -// The specification of a script: through a string or a file #[derive(Debug, PartialEq)] +/// The specification of a script: through a string or a file pub enum ScriptValue { StringVal(String), PathVal(PathBuf), } -/* - * Types of address specifications - */ #[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Types of address specifications that precede commands pub enum AddressType { Re, // Line that matches regex Line, // Specific line @@ -66,10 +66,8 @@ pub enum AddressType { Last, // Last line } -/* - * Format of an address - */ #[derive(Debug)] +/// Format of an address pub struct Address { pub atype: AddressType, // Address type pub value: AddressValue, // Line number or regex @@ -81,33 +79,83 @@ pub enum AddressValue { Regex(Regex), } -/* - * Substitution command - */ #[derive(Debug)] +/// A single part of an RE replacement +pub enum ReplacementPart { + Literal(String), // Normal text + WholeMatch, // & + Group(u32), // \1 to \9 +} + +#[derive(Debug, Default)] +/// All specified replacements for an RE +pub struct ReplacementTemplate { + pub parts: Vec<ReplacementPart>, +} + +impl ReplacementTemplate { + /// Create an empty template.
+ pub fn default() -> Self { + ReplacementTemplate { parts: Vec::new() } + } + + /// Apply the template to the given RE captures. + /// Example: + /// let result = regex.replace_all(input, |caps: ®ex::Captures| { + /// template.apply(caps) }); + /// Returns an error if a backreference in the template was not matched by the RE. + pub fn apply(&self, caps: &Captures) -> UResult { + let mut result = String::new(); + + for part in &self.parts { + match part { + ReplacementPart::Literal(s) => result.push_str(s), + + ReplacementPart::WholeMatch => { + result.push_str(caps.get(0).map_or("", |m| m.as_str())); + } + + ReplacementPart::Group(n) => { + let group_index = *n as usize; + if group_index >= caps.len() { + return Err(USimpleError::new( + 2, + // TODO: Provide code location info + format!("\\{} not defined in the regular expression", n), + )); + } + + result.push_str(caps.get(group_index).map_or("", |m| m.as_str())); + } + } + } + + Ok(result) + } +} + +#[derive(Debug)] +/// Substitution command pub struct Substitution { - pub occurrence: usize, // Which occurrence to substitute - pub print_flag: bool, // True if 'p' flag - pub ignore_case: bool, // True if 'I' flag - pub write_file: Option, // Path to file if 'w' flag is used - pub file_descriptor: Option, // Cached file descriptor - pub regex: Regex, // Regular expression - pub max_backref: u32, // Largest backreference - pub line_number: usize, // Line number - pub replacement: String, // Replacement text -} - -// Transliteration command (y) + pub occurrence: usize, // Which occurrence to substitute + pub print_flag: bool, // True if 'p' flag + pub ignore_case: bool, // True if 'I' flag + pub write_file: Option, // Path to file if 'w' flag is used + pub file_descriptor: Option, // Cached file descriptor + pub regex: Regex, // Regular expression + pub line_number: usize, // Line number + pub replacement: ReplacementTemplate, // Specified broken-down replacement +} + #[derive(Debug)] +/// Transliteration command 
(y) pub struct Transliteration { pub byte_table: [u8; 256], // Byte translation table pub multi_map: HashMap, // Direct mapping from one char to another } -/* - * An internally compiled command. - */ #[derive(Debug)] +/// An internally compiled command. pub struct Command { pub code: char, // Command code pub addr1: Option
, // Start address @@ -135,6 +183,7 @@ impl Default for Command { } #[derive(Debug)] +/// Command-specific data pub enum CommandData { None, Subcommand(Rc>), // Commands for 'b', 't', '{' @@ -167,18 +216,14 @@ pub enum AppendType { File, } -/* - * Special flag for space modifications - */ +/// Flag for space modifications #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SpaceFlag { Append, // Append to contents Replace, // Replace contents } -/* - * Structure for a processing space (process, hold, otherwise). - */ +/// Structure for a processing space (process, hold, otherwise). #[derive(Debug)] pub struct Space { pub current: String, // Current space content @@ -186,3 +231,95 @@ pub struct Space { pub append_newline: bool, // Whether originally terminated by \n pub backup: String, // Backing memory } + +#[cfg(test)] +mod tests { + use super::*; + use regex::Regex; + + // Return the captures for the RE applied to the specified string + fn caps_for<'a>(re: &str, input: &'a str) -> regex::Captures<'a> { + Regex::new(re).unwrap().captures(input).unwrap() + } + + #[test] + // s/foo// + fn test_empty_template() { + let template = ReplacementTemplate::default(); + let caps = caps_for("foo", "foo"); + + let result = template.apply(&caps).unwrap(); + assert_eq!(result, ""); + } + + #[test] + // s/abc/hello/ + fn test_literal_only() { + let template = ReplacementTemplate { + parts: vec![ReplacementPart::Literal("hello".into())], + }; + let caps = caps_for("abc", "abc"); + + let result = template.apply(&caps).unwrap(); + assert_eq!(result, "hello"); + } + + #[test] + // s/foo\d+/got: &/ + fn test_whole_match() { + let template = ReplacementTemplate { + parts: vec![ + ReplacementPart::Literal("got: ".into()), + ReplacementPart::WholeMatch, + ], + }; + let caps = caps_for(r"foo\d+", "foo42"); + + let result = template.apply(&caps).unwrap(); + assert_eq!(result, "got: foo42"); + } + + #[test] + // s/foo(\d+)/number: \1/ + fn test_backreference() { + let template = 
ReplacementTemplate { + parts: vec![ + ReplacementPart::Literal("number: ".into()), + ReplacementPart::Group(1), + ], + }; + let caps = caps_for(r"foo(\d+)", "foo42"); + + let result = template.apply(&caps).unwrap(); + assert_eq!(result, "number: 42"); + } + + #[test] + // s/(\w+):(\d+)/key: \1, value: \2/ + fn test_multiple_parts() { + let template = ReplacementTemplate { + parts: vec![ + ReplacementPart::Literal("key: ".into()), + ReplacementPart::Group(1), + ReplacementPart::Literal(", value: ".into()), + ReplacementPart::Group(2), + ], + }; + let caps = caps_for(r"(\w+):(\d+)", "x:123"); + + let result = template.apply(&caps).unwrap(); + assert_eq!(result, "key: x, value: 123"); + } + + #[test] + // s/(a)(b)/\3/ + fn test_invalid_backreference() { + let template = ReplacementTemplate { + parts: vec![ReplacementPart::Group(3)], + }; + let caps = caps_for(r"(a)(b)", "ab"); // only groups 1 and 2 exist + + let err = template.apply(&caps).unwrap_err(); + assert!(err.to_string().contains(r"\3 not defined")); + } +} From fa7a9061d677523d0601f667fd6763ea0d7042eb Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 13:53:37 +0300 Subject: [PATCH 40/85] Implement compilation for the s command --- README.md | 2 + src/uu/sed/src/command.rs | 25 +- src/uu/sed/src/compiler.rs | 500 ++++++++++++++++++++++++++++- src/uu/sed/src/delimited_parser.rs | 2 +- src/uu/sed/src/processor.rs | 1 - 5 files changed, 519 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 64e6525e..905d14e1 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,8 @@ cargo run --release ### Spported GNU extensions * Command-line arguments can be specified in long (`--`) form. * Spaces can precede a regular expression modifier. +* `I` can be used in as a synonym for the `i` (case insensitive) substitution + flag. ### Supported BSD and GNU extensions * The second address in a range can be specified as a relative address with +N. 
diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index ae5620d4..4d54a399 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -87,18 +87,20 @@ pub enum ReplacementPart { Group(u32), // \1 to \9 } -#[derive(Debug, Default)] +#[derive(Debug)] /// All specified replacements for an RE pub struct ReplacementTemplate { pub parts: Vec, } -impl ReplacementTemplate { - /// Create an empty tamplate. - pub fn default() -> Self { +impl Default for ReplacementTemplate { + /// Create an empty template. + fn default() -> Self { ReplacementTemplate { parts: Vec::new() } } +} +impl ReplacementTemplate { /// Apply the template to the given RE captures. /// Example: /// let result = regex.replace_all(input, |caps: ®ex::Captures| { @@ -147,6 +149,21 @@ pub struct Substitution { pub replacement: ReplacementTemplate, // Specified broken-down replacement } +impl Default for Substitution { + fn default() -> Self { + Substitution { + occurrence: 1, + print_flag: false, + ignore_case: false, + write_file: None, + file_descriptor: None, + regex: Regex::new("").unwrap(), // safe dummy regex + line_number: 0, + replacement: ReplacementTemplate::default(), + } + } +} + #[derive(Debug)] /// Transliteration command (y) pub struct Transliteration { diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 4e0b6fc0..6cc2488d 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -8,14 +8,18 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-use crate::command::{Address, AddressType, AddressValue, Command, ProcessingContext, ScriptValue}; -use crate::delimited_parser::{compilation_error, parse_regex}; +use crate::command::{ + Address, AddressType, AddressValue, Command, CommandData, ProcessingContext, ReplacementPart, + ReplacementTemplate, ScriptValue, Substitution, +}; +use crate::delimited_parser::{compilation_error, parse_char_escape, parse_regex}; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use once_cell::sync::Lazy; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; +use std::path::PathBuf; use std::rc::Rc; use uucore::error::UResult; @@ -202,7 +206,7 @@ fn build_command_map() -> HashMap { // How to continue after processing a command #[derive(Debug)] -enum ContinueAction { +pub enum ContinueAction { NextLine, NextChar, } @@ -427,6 +431,266 @@ fn compile_regex( } } +/// Compile a regular expression replacement string. +pub fn compile_replacement( + lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, +) -> UResult { + let mut parts = Vec::new(); + let mut literal = String::new(); + + let delimiter = line.current(); + line.advance(); + + loop { + while !line.eol() { + match line.current() { + '\\' => { + line.advance(); + + // Line continuation + if line.eol() { + if let Some(next_line_string) = lines.next_line()? 
{ + *line = ScriptCharProvider::new(&next_line_string); + continue; + } else { + return compilation_error( + lines, + line, + "unterminated substitute replacement (unexpected EOF)", + ); + } + } + + match line.current() { + // \1 - \9 + c @ '1'..='9' => { + let ref_num = c.to_digit(10).unwrap(); + + if !literal.is_empty() { + parts.push(ReplacementPart::Literal(std::mem::take(&mut literal))); + } + parts.push(ReplacementPart::Group(ref_num)); + line.advance(); + } + + // \& or \\ + '&' => { + if !literal.is_empty() { + parts.push(ReplacementPart::Literal(std::mem::take(&mut literal))); + } + parts.push(ReplacementPart::WholeMatch); + line.advance(); + } + '\\' => { + literal.push('\\'); + line.advance(); + } + + // other escape sequences + _ => match parse_char_escape(line) { + Some(decoded) => literal.push(decoded), + None => { + literal.push('\\'); + literal.push(line.current()); + line.advance(); + } + }, + } + } + + '\n' => { + return compilation_error( + lines, + line, + "unescaped newline inside substitute replacement", + ); + } + + c if c == delimiter => { + line.advance(); // skip closing delimiter + if !literal.is_empty() { + parts.push(ReplacementPart::Literal(literal)); + } + return Ok(ReplacementTemplate { parts }); + } + + c => { + literal.push(c); + line.advance(); + } + } + } + + // Fetch next line for continued replacement string + if let Some(next_line_string) = lines.next_line()? 
{ + *line = ScriptCharProvider::new(&next_line_string); + } else { + return compilation_error(lines, line, "unterminated substitute replacement"); + } + } +} + +pub fn compile_subst_command( + lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult { + line.advance(); // move past 's' + + let delimiter = line.current(); + if delimiter == '\0' || delimiter == '\\' { + return compilation_error( + lines, + line, + "substitute pattern cannot be delimited by newline or backslash", + ); + } + + let pattern = parse_regex(lines, line)?; + if pattern.is_empty() { + return compilation_error(lines, line, "unterminated substitute pattern"); + } + + let mut subst = Box::new(Substitution { + occurrence: 0, + print_flag: false, + ignore_case: false, + write_file: None, + file_descriptor: None, + regex: compile_regex(lines, line, &pattern, false)?, // temp compile + line_number: lines.get_line_number(), + replacement: ReplacementTemplate::default(), + }); + + subst.replacement = compile_replacement(lines, line)?; + compile_subst_flags(lines, line, &mut subst)?; + + // Recompile regex with actual ignore_case flag + subst.regex = compile_regex(lines, line, &pattern, subst.ignore_case)?; + + line.eat_spaces(); + if !line.eol() && line.current() == ';' { + line.advance(); + cmd.data = CommandData::Substitution(subst); + return Ok(ContinueAction::NextChar); + } + + if !line.eol() { + return compilation_error( + lines, + line, + format!("extra characters at the end of the {} command", cmd.code), + ); + } + + cmd.data = CommandData::Substitution(subst); + Ok(ContinueAction::NextLine) +} + +/// Parse the substitution command's optional flags +pub fn compile_subst_flags( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + subst: &mut Substitution, +) -> UResult<()> { + let mut seen_g_or_n = false; + + subst.occurrence = 1; // default + subst.print_flag = false; + subst.ignore_case = false; + subst.write_file = None; + + while 
!line.eol() { + line.eat_spaces(); + + match line.current() { + 'g' => { + if seen_g_or_n { + return compilation_error( + lines, + line, + "multiple 'g' or numeric flags in substitute command", + ); + } + seen_g_or_n = true; + subst.occurrence = 0; + line.advance(); + } + + 'p' => { + subst.print_flag = true; + line.advance(); + } + + 'i' | 'I' => { + subst.ignore_case = true; + line.advance(); + } + + _c @ '1'..='9' => { + if seen_g_or_n { + return compilation_error( + lines, + line, + "multiple 'g' or numeric flags in substitute command", + ); + } + + let mut number = 0usize; + while !line.eol() && line.current().is_ascii_digit() { + number = number + .checked_mul(10) + .and_then(|n| n.checked_add(line.current().to_digit(10).unwrap() as usize)) + .ok_or_else(|| { + compilation_error::<()>( + lines, + line, + "overflow in numeric substitute flag", + ) + .unwrap_err() + })?; + line.advance(); + } + + subst.occurrence = number; + seen_g_or_n = true; + } + + 'w' => { + line.advance(); + line.eat_spaces(); + + let mut path = String::new(); + while !line.eol() && line.current() != ';' { + path.push(line.current()); + line.advance(); + } + + if path.is_empty() { + return compilation_error(lines, line, "missing filename after 'w' flag"); + } + + subst.write_file = Some(PathBuf::from(path)); + // NOTE: subst.file_descriptor is resolved later at runtime + return Ok(()); // 'w' is the last flag allowed + } + + ';' | '\n' => break, + + other => { + return compilation_error( + lines, + line, + format!("invalid substitute flag: '{}'", other), + ); + } + } + } + + Ok(()) +} + // Compile the specified command fn compile_command( lines: &mut ScriptLineProvider, @@ -457,6 +721,10 @@ fn compile_command( CommandArgs::NonSelect => { // ! // Implemented at a heigher level. 
} + CommandArgs::Substitute => { + // s + return compile_subst_command(lines, line, &mut cmd); + } // TODO CommandArgs::Text => { // a c i } @@ -474,8 +742,6 @@ fn compile_command( } CommandArgs::WriteFile => { // w } - CommandArgs::Substitute => { // s - } CommandArgs::Translate => { // y } } @@ -1163,4 +1429,228 @@ mod tests { assert!(cmd.next.is_none()); } + + // compile_replacement + #[test] + fn test_compile_replacement_literal() { + let (mut lines, mut chars) = make_providers("/hello/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + dbg!(&template); + + assert_eq!(template.parts.len(), 1); + assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "hello")); + } + + #[test] + fn test_compile_replacement_backrefs_and_literal() { + let (mut lines, mut chars) = make_providers("/prefix \\1 and \\2/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 4); + assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "prefix ")); + assert!(matches!(&template.parts[1], ReplacementPart::Group(1))); + assert!(matches!(&template.parts[2], ReplacementPart::Literal(s) if s == " and ")); + assert!(matches!(&template.parts[3], ReplacementPart::Group(2))); + } + + #[test] + fn test_compile_replacement_whole_match() { + let (mut lines, mut chars) = make_providers("/The match was: \\&/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 2); + assert!( + matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "The match was: ") + ); + assert!(matches!(&template.parts[1], ReplacementPart::WholeMatch)); + } + + #[test] + fn test_compile_replacement_escape_sequences() { + let (mut lines, mut chars) = make_providers("/line\\nnewline\\tend/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 1); + assert!(matches!( + &template.parts[0], + 
ReplacementPart::Literal(s) if s == "line\nnewline\tend" + )); + } + + #[test] + fn test_compile_replacement_line_continuation() { + let script = vec![ + ScriptValue::StringVal("/first line\\".to_string()), + ScriptValue::StringVal(" continued/".to_string()), + ]; + let mut provider = ScriptLineProvider::new(script); + let first_line = provider.next_line().unwrap().unwrap(); + let mut chars = ScriptCharProvider::new(&first_line); + + let template = compile_replacement(&mut provider, &mut chars).unwrap(); + assert_eq!(template.parts.len(), 1); + assert!(matches!( + &template.parts[0], + ReplacementPart::Literal(s) if s == "first line continued" + )); + } + + // compile_subst_flags + #[test] + fn test_compile_subst_flag_g() { + let (lines, mut chars) = make_providers("g"); + let mut subst = Substitution::default(); + + compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); + assert_eq!(subst.occurrence, 0); // 'g' means all occurrences + } + + #[test] + fn test_compile_subst_flag_p() { + let (lines, mut chars) = make_providers("p"); + let mut subst = Substitution::default(); + + compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); + assert!(subst.print_flag); + } + + #[test] + fn test_compile_subst_flag_uppercase_i() { + let (lines, mut chars) = make_providers("I"); + let mut subst = Substitution::default(); + + compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); + assert!(subst.ignore_case); + } + + #[test] + fn test_compile_subst_flag_i_lowercase() { + let (lines, mut chars) = make_providers("i"); + let mut subst = Substitution::default(); + + compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); + assert!(subst.ignore_case); + } + + #[test] + fn test_compile_subst_flag_number() { + let (lines, mut chars) = make_providers("3"); + let mut subst = Substitution::default(); + + compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); + assert_eq!(subst.occurrence, 3); + } + + #[test] + fn 
test_compile_subst_flag_g_and_number_should_fail() { + let (lines, mut chars) = make_providers("g3"); + let mut subst = Substitution::default(); + + let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); + assert!(err + .to_string() + .contains("multiple 'g' or numeric flags in substitute command")); + } + + #[test] + fn test_compile_subst_flag_number_and_g_should_fail() { + let (lines, mut chars) = make_providers("2g"); + let mut subst = Substitution::default(); + + let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); + assert!(err + .to_string() + .contains("multiple 'g' or numeric flags in substitute command")); + } + + #[test] + fn test_compile_subst_flag_w_missing_filename() { + let (lines, mut chars) = make_providers("w "); + let mut subst = Substitution::default(); + + let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); + assert!(err.to_string().contains("missing filename")); + } + + #[test] + fn test_compile_subst_flag_w_with_filename() { + let (lines, mut chars) = make_providers("w out.txt"); + let mut subst = Substitution::default(); + + compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); + assert_eq!(subst.write_file, Some(std::path::PathBuf::from("out.txt"))); + } + + #[test] + fn test_compile_subst_flag_invalid_flag() { + let (lines, mut chars) = make_providers("z"); + let mut subst = Substitution::default(); + + let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); + assert!(err.to_string().contains("invalid substitute flag")); + } + + // compile_subst_command + #[test] + fn test_compile_subst_invalid_delimiter_backslash() { + let (mut lines, mut chars) = make_providers("s\\foo\\bar\\"); + let mut cmd = Command::default(); + + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + assert!(err + .to_string() + .contains("substitute pattern cannot be delimited")); + } + + #[test] + fn test_compile_subst_empty_pattern() { + let 
(mut lines, mut chars) = make_providers("s//bar/"); + let mut cmd = Command::default(); + + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + assert!(err.to_string().contains("unterminated substitute pattern")); + } + + #[test] + fn test_compile_subst_extra_characters_at_end() { + let (mut lines, mut chars) = make_providers("s/foo/bar/x"); + let mut cmd = Command::default(); + + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + assert!(err.to_string().contains("invalid substitute flag")); + } + + #[test] + fn test_compile_subst_semicolon_indicates_continue() { + let (mut lines, mut chars) = make_providers("s/foo/bar/;"); + let mut cmd = Command::default(); + + let result = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap(); + assert!(matches!(result, ContinueAction::NextChar)); + + if let CommandData::Substitution(subst) = &cmd.data { + assert_eq!(subst.replacement.parts.len(), 1); + } else { + panic!("Expected CommandData::Substitution"); + } + } + + #[test] + fn test_compile_subst_sets_command_data() { + let (mut lines, mut chars) = make_providers("s/foo/bar/"); + let mut cmd = Command::default(); + + let result = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap(); + assert!(matches!(result, ContinueAction::NextLine)); + + match &cmd.data { + CommandData::Substitution(subst) => { + assert_eq!(subst.replacement.parts.len(), 1); + assert!( + matches!(&subst.replacement.parts[0], ReplacementPart::Literal(s) if s == "bar") + ); + } + _ => panic!("Expected CommandData::Substitution"), + } + } } diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index ff71653c..625e655e 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -101,7 +101,7 @@ fn create_control_char(x: char) -> Option { /// At entry line.current() must have advanced after the `\\`. /// Advance line to the first character not part of the escape. 
/// Return `None` if an invalid escape has been specified. -fn parse_char_escape(line: &mut ScriptCharProvider) -> Option { +pub fn parse_char_escape(line: &mut ScriptCharProvider) -> Option { match line.current() { 'a' => { line.advance(); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index fe9e8a02..be648aba 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -17,7 +17,6 @@ use std::path::PathBuf; use std::rc::Rc; use uucore::error::UResult; -#[allow(dead_code)] /// Return true if the passed address matches the current I/O context. fn match_address( addr: &Address, From cc4b3788d186c9eaef042b53e91b14642a0c0092 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 13:58:07 +0300 Subject: [PATCH 41/85] Abstract code into compile_empty_command() --- src/uu/sed/src/compiler.rs | 40 +++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 6cc2488d..95fb40be 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -691,6 +691,32 @@ pub fn compile_subst_flags( Ok(()) } +/// Compile a command that doesn't take any arguments +// Handles d D g G h H l n N p P q x = +pub fn compile_empty_command( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult { + line.advance(); // Skip the command character + line.eat_spaces(); // Skip any trailing whitespace + + if !line.eol() && line.current() == ';' { + line.advance(); + return Ok(ContinueAction::NextChar); + } + + if !line.eol() { + return compilation_error( + lines, + line, + format!("extra characters at the end of the {} command", cmd.code), + ); + } + + Ok(ContinueAction::NextLine) +} + // Compile the specified command fn compile_command( lines: &mut ScriptLineProvider, @@ -704,19 +730,7 @@ fn compile_command( match cmd_spec.args { CommandArgs::Empty => { // d D g G h H l n N p P q x = - 
line.advance(); - line.eat_spaces(); - if !line.eol() && line.current() == ';' { - line.advance(); - return Ok(ContinueAction::NextChar); - } - if !line.eol() { - return compilation_error( - lines, - line, - format!("extra characters at the end of the {} command", cmd.code), - ); - } + return compile_empty_command(lines, line, &mut cmd); } CommandArgs::NonSelect => { // ! // Implemented at a heigher level. From 2aaa6beb9a7539cc48cce5b9d6714eaa3ece3bc3 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 14:01:31 +0300 Subject: [PATCH 42/85] Add link to original implementation --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 905d14e1..595a7b40 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,8 @@ cargo run --release - The last line (`$`) address is interpreted as the last non-empty line of the last file. If files specified in subsequent arguments until the last one are empty, then the last line condition will never be triggered. + This behavior is consistent with the + [original implementation](https://github.com/dspinellis/unix-history-repo/blob/Research-V7/usr/src/cmd/sed/sed1.c#L665). 
## License From 878de49121ab51ada978e9fb912399e39575a355 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 14:50:59 +0300 Subject: [PATCH 43/85] Add negated address tests --- tests/by-util/test_sed.rs | 6 ++++++ tests/fixtures/sed/output/addr_one_line_negate | 13 +++++++++++++ .../sed/output/addr_pattern_to_pattern_negate | 7 +++++++ tests/fixtures/sed/output/addr_range_numeric_negate | 10 ++++++++++ 4 files changed, 36 insertions(+) create mode 100644 tests/fixtures/sed/output/addr_one_line_negate create mode 100644 tests/fixtures/sed/output/addr_pattern_to_pattern_negate create mode 100644 tests/fixtures/sed/output/addr_range_numeric_negate diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 21bdaac5..7c6dac2a 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -185,3 +185,9 @@ check_output!(addr_last_separate, ["-ns", "$p", LINES1, LINES2]); check_output!(addr_two_lines_semicolon, ["-n", "-e", "4p;8p", LINES1]); check_output!(addr_two_lines_newline, ["-n", "-e", "4p\n8p", LINES1]); check_output!(addr_three_lines_semicolon, ["-n", "-e", "4p;8p;1p", LINES1]); +check_output!(addr_one_line_negate, ["-n", "-e", "4!p", LINES1]); +check_output!(addr_range_numeric_negate, ["-n", "1,4!p", LINES1]); +check_output!( + addr_pattern_to_pattern_negate, + ["-n", "/1_4/,/10/!p", LINES1] +); diff --git a/tests/fixtures/sed/output/addr_one_line_negate b/tests/fixtures/sed/output/addr_one_line_negate new file mode 100644 index 00000000..77e5cc0e --- /dev/null +++ b/tests/fixtures/sed/output/addr_one_line_negate @@ -0,0 +1,13 @@ +l1_1 +l1_2 +l1_3 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/addr_pattern_to_pattern_negate b/tests/fixtures/sed/output/addr_pattern_to_pattern_negate new file mode 100644 index 00000000..4b859514 --- /dev/null +++ b/tests/fixtures/sed/output/addr_pattern_to_pattern_negate @@ -0,0 +1,7 @@ +l1_1 +l1_2 +l1_3 +l1_11 +l1_12 +l1_13 
+l1_14 diff --git a/tests/fixtures/sed/output/addr_range_numeric_negate b/tests/fixtures/sed/output/addr_range_numeric_negate new file mode 100644 index 00000000..18476253 --- /dev/null +++ b/tests/fixtures/sed/output/addr_range_numeric_negate @@ -0,0 +1,10 @@ +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 From 1024eaaec4092bd1a1a69ce47cce9a1d5853b48e Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 16:08:43 +0300 Subject: [PATCH 44/85] Implement the 's' command processing TODO: Testing at the integration level --- src/uu/sed/src/command.rs | 4 +- src/uu/sed/src/compiler.rs | 4 +- src/uu/sed/src/fast_io.rs | 23 ++++++++++- src/uu/sed/src/processor.rs | 82 +++++++++++++++++++++++++++++++++++-- 4 files changed, 105 insertions(+), 8 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 4d54a399..16e1dd6d 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -143,7 +143,7 @@ pub struct Substitution { pub print_flag: bool, // True if 'p' flag pub ignore_case: bool, // True if 'I' flag pub write_file: Option, // Path to file if 'w' flag is used - pub file_descriptor: Option, // Cached file descriptor + pub write_handle: Option, // Cached open file pub regex: Regex, // Regular expression pub line_number: usize, // Line number pub replacement: ReplacementTemplate, // Specified broken-down replacement @@ -156,7 +156,7 @@ impl Default for Substitution { print_flag: false, ignore_case: false, write_file: None, - file_descriptor: None, + write_handle: None, regex: Regex::new("").unwrap(), // safe dummy regex line_number: 0, replacement: ReplacementTemplate::default(), diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 95fb40be..42aec9a5 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -557,7 +557,7 @@ pub fn compile_subst_command( print_flag: false, ignore_case: false, write_file: None, - file_descriptor: None, + write_handle: None, 
regex: compile_regex(lines, line, &pattern, false)?, // temp compile line_number: lines.get_line_number(), replacement: ReplacementTemplate::default(), @@ -672,7 +672,7 @@ pub fn compile_subst_flags( } subst.write_file = Some(PathBuf::from(path)); - // NOTE: subst.file_descriptor is resolved later at runtime + // NOTE: subst.write_handle is resolved later at runtime return Ok(()); // 'w' is the last flag allowed } diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index 45cf24fd..c5236cb8 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -148,7 +148,7 @@ impl<'a> IOChunk<'a> { } } - /// Clear the object's contents, converting it it Owned if needed. + /// Clear the object's contents, converting it into Owned if needed. pub fn clear(&mut self) { self.utf8_verified = true; match &mut self.content { @@ -167,6 +167,27 @@ impl<'a> IOChunk<'a> { } } + /// Set the object's contents to the specified string. + /// Convert it into Owned if needed. + pub fn set_to_string(&mut self, new_content: String, add_newline: bool) { + self.utf8_verified = true; + // TODO: Default newline to true and remove argumnt if always true. + match &mut self.content { + IOChunkContent::Owned { + content, + has_newline, + .. + } => { + *content = new_content; + *has_newline = add_newline; + } + #[cfg(unix)] + _ => { + self.content = IOChunkContent::new_owned(new_content, add_newline); + } + } + } + /// Return the content as a string. pub fn try_as_str(&mut self) -> Result<&str, Box> { match &self.content { diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index be648aba..a3507972 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -8,14 +8,18 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-use crate::command::{Address, AddressType, AddressValue, Command, ProcessingContext}; +use crate::command::{ + Address, AddressType, AddressValue, Command, CommandData, ProcessingContext, Substitution, +}; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; use atty::Stream; use std::cell::RefCell; +use std::fs::OpenOptions; +use std::io::Write; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::UResult; +use uucore::error::{UResult, USimpleError}; /// Return true if the passed address matches the current I/O context. fn match_address( @@ -157,6 +161,73 @@ fn write_chunk( Ok(()) } +/// Perform the specified RE replacement in the provided pattern space. +fn substitute( + pattern: &mut IOChunk, + sub: &mut Substitution, + context: &ProcessingContext, + output: &mut OutputBuffer, +) -> UResult<()> { + let mut count = 0; + let mut last_end = 0; + let mut result = String::new(); + let mut replaced = false; + + let text = pattern.try_as_str()?; + + for caps in sub.regex.captures_iter(text) { + count += 1; + let m = caps.get(0).unwrap(); + + // Always write the unmatched text before this match. + result.push_str(&text[last_end..m.start()]); + + if sub.occurrence == 0 || count == sub.occurrence { + let replacement = sub.replacement.apply(&caps)?; + result.push_str(&replacement); + replaced = true; + } else { + // Not the target match — leave the match unchanged. + result.push_str(m.as_str()); + } + + last_end = m.end(); + } + + // Handle substitution success. + if replaced { + result.push_str(&text[last_end..]); + + pattern.set_to_string(result, true); + + if sub.print_flag { + write_chunk(output, context, pattern)?; + } + + // Write to file if needed. + if let Some(ref path) = sub.write_file { + // Check and cache the file handle if not already done. 
+ let handle = if let Some(ref mut file) = sub.write_handle { + file + } else { + let file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(path) + .map_err(|e| { + USimpleError::new(2, format!("Failed to open {}: {}", path.display(), e)) + })?; + sub.write_handle.get_or_insert(file) + }; + + writeln!(handle, "{}", pattern.try_as_str()?)?; + } + } + + Ok(()) +} + /// Process a single input file fn process_file( commands: &Option>>, @@ -239,7 +310,12 @@ fn process_file( // TODO } 's' => { - // TODO + let subst = match &mut command.data { + CommandData::Substitution(subst) => subst, + _ => panic!("Expected Substitution command data"), + }; + + substitute(&mut pattern, &mut *subst, context, output)?; } 't' => { // TODO From e1e5bb115ff8890dc8dc2a1002b19c11e254237b Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 10 May 2025 16:17:15 +0300 Subject: [PATCH 45/85] Fix handling of & in replacement string --- src/uu/sed/src/compiler.rs | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 42aec9a5..553203f9 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -474,16 +474,9 @@ pub fn compile_replacement( line.advance(); } - // \& or \\ - '&' => { - if !literal.is_empty() { - parts.push(ReplacementPart::Literal(std::mem::take(&mut literal))); - } - parts.push(ReplacementPart::WholeMatch); - line.advance(); - } - '\\' => { - literal.push('\\'); + // literal \ and & + '\\' | '&' => { + literal.push(line.current()); line.advance(); } @@ -499,6 +492,14 @@ pub fn compile_replacement( } } + '&' => { + if !literal.is_empty() { + parts.push(ReplacementPart::Literal(std::mem::take(&mut literal))); + } + parts.push(ReplacementPart::WholeMatch); + line.advance(); + } + '\n' => { return compilation_error( lines, @@ -1469,7 +1470,7 @@ mod tests { #[test] fn test_compile_replacement_whole_match() { - let (mut lines, 
mut chars) = make_providers("/The match was: \\&/"); + let (mut lines, mut chars) = make_providers("/The match was: &/"); let template = compile_replacement(&mut lines, &mut chars).unwrap(); assert_eq!(template.parts.len(), 2); @@ -1479,6 +1480,17 @@ mod tests { assert!(matches!(&template.parts[1], ReplacementPart::WholeMatch)); } + #[test] + fn test_compile_replacement_ampersand() { + let (mut lines, mut chars) = make_providers("/Simon \\& Garfunkel/"); + let template = compile_replacement(&mut lines, &mut chars).unwrap(); + + assert_eq!(template.parts.len(), 1); + assert!( + matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "Simon & Garfunkel") + ); + } + #[test] fn test_compile_replacement_escape_sequences() { let (mut lines, mut chars) = make_providers("/line\\nnewline\\tend/"); From fb41cf305c9a505832dd8b68d70a642913042cc9 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 00:12:55 +0300 Subject: [PATCH 46/85] Improve implementation of output files - Verify flushing on termination. - Open all files at entry, as required by POSIX. 
--- src/uu/sed/src/command.rs | 19 +++++---- src/uu/sed/src/compiler.rs | 9 +++-- src/uu/sed/src/named_writer.rs | 73 ++++++++++++++++++++++++++++++++++ src/uu/sed/src/processor.rs | 27 ++++--------- src/uu/sed/src/sed.rs | 1 + 5 files changed, 96 insertions(+), 33 deletions(-) create mode 100644 src/uu/sed/src/named_writer.rs diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 16e1dd6d..a020c8dd 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -11,11 +11,12 @@ // TODO: remove when compile is implemented #![allow(dead_code)] +use crate::named_writer::NamedWriter; + use regex::Captures; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; -use std::fs::File; use std::path::PathBuf; // For file descriptors and equivalent use std::rc::Rc; use uucore::error::{UResult, USimpleError}; @@ -139,13 +140,12 @@ impl ReplacementTemplate { #[derive(Debug)] /// Substitution command pub struct Substitution { - pub occurrence: usize, // Which occurrence to substitute - pub print_flag: bool, // True if 'p' flag - pub ignore_case: bool, // True if 'I' flag - pub write_file: Option, // Path to file if 'w' flag is used - pub write_handle: Option, // Cached open file - pub regex: Regex, // Regular expression - pub line_number: usize, // Line number + pub occurrence: usize, // Which occurrence to substitute + pub print_flag: bool, // True if 'p' flag + pub ignore_case: bool, // True if 'I' flag + pub write_file: Option>>, // Writer to file if 'w' flag is used + pub regex: Regex, // Regular expression + pub line_number: usize, // Line number pub replacement: ReplacementTemplate, // Specified broken-down replacement } @@ -156,7 +156,6 @@ impl Default for Substitution { print_flag: false, ignore_case: false, write_file: None, - write_handle: None, regex: Regex::new("").unwrap(), // safe dummy regex line_number: 0, replacement: ReplacementTemplate::default(), @@ -206,7 +205,7 @@ pub enum CommandData { Subcommand(Rc>), // 
Commands for 'b', 't', '{' Substitution(Box), // Substitute command 's' Transliteration(Box), // Transliteration command 'y' - WriteFileDescriptor(File), // File descriptor for 'w' + NamedWriter(Box), // File descriptor for 'w' } impl CommandData { diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 553203f9..f126855e 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -13,6 +13,7 @@ use crate::command::{ ReplacementTemplate, ScriptValue, Substitution, }; use crate::delimited_parser::{compilation_error, parse_char_escape, parse_regex}; +use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; use once_cell::sync::Lazy; @@ -558,7 +559,6 @@ pub fn compile_subst_command( print_flag: false, ignore_case: false, write_file: None, - write_handle: None, regex: compile_regex(lines, line, &pattern, false)?, // temp compile line_number: lines.get_line_number(), replacement: ReplacementTemplate::default(), @@ -672,7 +672,7 @@ pub fn compile_subst_flags( return compilation_error(lines, line, "missing filename after 'w' flag"); } - subst.write_file = Some(PathBuf::from(path)); + subst.write_file = Some(NamedWriter::new(PathBuf::from(path))?); // NOTE: subst.write_handle is resolved later at runtime return Ok(()); // 'w' is the last flag allowed } @@ -1604,7 +1604,10 @@ mod tests { let mut subst = Substitution::default(); compile_subst_flags(&lines, &mut chars, &mut subst).unwrap(); - assert_eq!(subst.write_file, Some(std::path::PathBuf::from("out.txt"))); + assert_eq!( + subst.write_file.as_ref().map(|w| w.borrow().path.clone()), + Some(std::path::PathBuf::from("out.txt")) + ); } #[test] diff --git a/src/uu/sed/src/named_writer.rs b/src/uu/sed/src/named_writer.rs new file mode 100644 index 00000000..b145abda --- /dev/null +++ b/src/uu/sed/src/named_writer.rs @@ -0,0 +1,73 @@ +// An abstraction for output files created on entry and flushed on exit 
+// +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Diomidis Spinellis +// +// This file is part of the uutils sed package. +// It is licensed under the MIT License. +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::cell::RefCell; +use std::fs::{File, OpenOptions}; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; +use std::rc::Rc; + +use uucore::error::{UResult, USimpleError}; + +thread_local! { + /// Global list of all writers that should be flushed at shutdown + static FLUSH_LIST: RefCell>>> = const { RefCell::new(Vec::new()) }; +} + +#[derive(Debug)] +/// Writer that tracks its file name for better error messages +pub struct NamedWriter { + pub path: PathBuf, + pub writer: BufWriter, +} + +impl NamedWriter { + /// Create a new writer, truncate the file, and register it for flushing. + pub fn new(path: PathBuf) -> UResult>> { + let file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&path) + .map_err(|e| USimpleError::new(2, format!("{}: {}", path.display(), e)))?; + + let writer = Rc::new(RefCell::new(NamedWriter { + path, + writer: BufWriter::new(file), + })); + + FLUSH_LIST.with(|list| list.borrow_mut().push(Rc::clone(&writer))); + Ok(writer) + } + + /// Write a line to the file with a newline, returning descriptive errors. + pub fn write_line(&mut self, line: &str) -> UResult<()> { + writeln!(self.writer, "{}", line) + .map_err(|e| USimpleError::new(2, format!("{}: {}", self.path.display(), e))) + } + + /// Flush the writer, returning a descriptive error. + pub fn flush(&mut self) -> UResult<()> { + self.writer + .flush() + .map_err(|e| USimpleError::new(2, format!("{}: {}", self.path.display(), e))) + } +} + +/// Flush buffered content to the file, returning descriptive errors. 
+pub fn flush_all() -> UResult<()> { + FLUSH_LIST.with(|cell| { + for handle in cell.borrow().iter() { + handle.borrow_mut().flush()?; + } + + Ok(()) + }) +} diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index a3507972..c1c603c1 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -13,13 +13,12 @@ use crate::command::{ }; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; +use crate::named_writer; use atty::Stream; use std::cell::RefCell; -use std::fs::OpenOptions; -use std::io::Write; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; /// Return true if the passed address matches the current I/O context. fn match_address( @@ -205,23 +204,8 @@ fn substitute( } // Write to file if needed. - if let Some(ref path) = sub.write_file { - // Check and cache the file handle if not already done. - let handle = if let Some(ref mut file) = sub.write_handle { - file - } else { - let file = OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(path) - .map_err(|e| { - USimpleError::new(2, format!("Failed to open {}: {}", path.display(), e)) - })?; - sub.write_handle.get_or_insert(file) - }; - - writeln!(handle, "{}", pattern.try_as_str()?)?; + if let Some(ref writer) = sub.write_file { + writer.borrow_mut().write_line(pattern.try_as_str()?)?; } } @@ -376,5 +360,8 @@ pub fn process_all_files( in_place.end()?; } + // Flush all output files + named_writer::flush_all()?; + Ok(()) } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index c5ab4b9e..50099fa7 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -13,6 +13,7 @@ pub mod compiler; pub mod delimited_parser; pub mod fast_io; pub mod in_place; +pub mod named_writer; pub mod processor; pub mod script_char_provider; pub mod script_line_provider; From 0c22c02ec179c24ffe03d4805324bdcd52bbb154 Mon Sep 17 00:00:00 2001 From: Diomidis 
Spinellis Date: Sun, 11 May 2025 12:25:43 +0300 Subject: [PATCH 47/85] Add and fix integration tests for the s command - Implement handling for basic regular expressions - Fix handling of continuation lines --- src/uu/sed/src/command.rs | 2 +- src/uu/sed/src/compiler.rs | 204 +++++++++++++----- src/uu/sed/src/delimited_parser.rs | 8 + src/uu/sed/src/script_char_provider.rs | 1 + src/uu/sed/src/script_line_provider.rs | 29 ++- src/uu/sed/src/sed.rs | 6 +- tests/by-util/test_sed.rs | 28 +++ tests/fixtures/sed/output/subst_any | 14 ++ tests/fixtures/sed/output/subst_any_global | 14 ++ tests/fixtures/sed/output/subst_brace | 14 ++ .../sed/output/subst_ere_numerical_groups | 14 ++ .../sed/output/subst_escaped_braced_separator | 14 ++ .../sed/output/subst_escaped_magic_separator | 14 ++ .../sed/output/subst_escaped_separator | 14 ++ .../output/subst_escaped_whole_match_group | 14 ++ tests/fixtures/sed/output/subst_multiline | 42 ++++ .../sed/output/subst_numbered_replacement | 14 ++ .../sed/output/subst_numerical_groups | 14 ++ .../sed/output/subst_whole_match_group | 14 ++ tests/fixtures/sed/output/subst_write_file | 28 +++ 20 files changed, 444 insertions(+), 58 deletions(-) create mode 100644 tests/fixtures/sed/output/subst_any create mode 100644 tests/fixtures/sed/output/subst_any_global create mode 100644 tests/fixtures/sed/output/subst_brace create mode 100644 tests/fixtures/sed/output/subst_ere_numerical_groups create mode 100644 tests/fixtures/sed/output/subst_escaped_braced_separator create mode 100644 tests/fixtures/sed/output/subst_escaped_magic_separator create mode 100644 tests/fixtures/sed/output/subst_escaped_separator create mode 100644 tests/fixtures/sed/output/subst_escaped_whole_match_group create mode 100644 tests/fixtures/sed/output/subst_multiline create mode 100644 tests/fixtures/sed/output/subst_numbered_replacement create mode 100644 tests/fixtures/sed/output/subst_numerical_groups create mode 100644 
tests/fixtures/sed/output/subst_whole_match_group create mode 100644 tests/fixtures/sed/output/subst_write_file diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index a020c8dd..fdc86702 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -28,7 +28,7 @@ pub struct ProcessingContext { // Command-line flags with corresponding names pub all_output_files: bool, pub debug: bool, - pub regexp_extended: bool, + pub regex_extended: bool, pub follow_symlinks: bool, pub in_place: bool, pub in_place_suffix: Option, diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index f126855e..7811dd90 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -226,7 +226,7 @@ pub fn compile( // Compile provided scripts into a thread of commands fn compile_thread( lines: &mut ScriptLineProvider, - _processing_context: &mut ProcessingContext, + context: &ProcessingContext, ) -> UResult>>> { let mut head: Option>> = None; // A mutable reference to the place we’ll insert next @@ -252,7 +252,7 @@ fn compile_thread( } let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(lines, &mut line, &mut cmd)?; + let n_addr = compile_address_range(lines, &mut line, &mut cmd, context)?; let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; // The ! 
command shall be followed by another one @@ -264,7 +264,7 @@ fn compile_thread( } // Move cmd into next_p, transferring its ownership - let action = compile_command(lines, &mut line, &mut cmd, cmd_spec)?; + let action = compile_command(lines, &mut line, &mut cmd, cmd_spec, context)?; *next_p = Some(cmd); // Intermediate let binding to avoid the temporary drop @@ -296,13 +296,14 @@ fn compile_address_range( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Rc>, + context: &ProcessingContext, ) -> UResult { let mut n_addr = 0; let mut cmd = cmd.borrow_mut(); line.eat_spaces(); if !line.eol() && is_address_char(line.current()) { - if let Ok(addr1) = compile_address(lines, line) { + if let Ok(addr1) = compile_address(lines, line, context) { cmd.addr1 = Some(addr1); n_addr += 1; } @@ -313,7 +314,7 @@ fn compile_address_range( line.advance(); line.eat_spaces(); if !line.eol() { - if let Ok(addr2) = compile_address(lines, line) { + if let Ok(addr2) = compile_address(lines, line, context) { cmd.addr2 = Some(addr2); n_addr += 1; } @@ -324,7 +325,11 @@ fn compile_address_range( } /// Compile and return a single range address specification. -fn compile_address(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UResult
{ +fn compile_address( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + context: &ProcessingContext, +) -> UResult
{ let mut icase = false; if line.eol() { @@ -350,7 +355,7 @@ fn compile_address(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> Ok(Address { atype: AddressType::Re, - value: AddressValue::Regex(compile_regex(lines, line, &re, icase)?), + value: AddressValue::Regex(compile_regex(lines, line, &re, context, icase)?), }) } '$' => { @@ -395,11 +400,54 @@ fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UR .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) } +/// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. +/// - Translates `\(` and `\)` into `(` and `)` +/// - Escapes ERE-only metacharacters: `+ ? { } | ( )` +/// - Leaves all other characters as-is +fn bre_to_ere(pattern: &str) -> String { + let mut result = String::with_capacity(pattern.len()); + let mut chars = pattern.chars().peekable(); + + while let Some(c) = chars.next() { + if c == '\\' { + match chars.peek() { + Some('(') => { + chars.next(); + result.push('('); // group start + } + Some(')') => { + chars.next(); + result.push(')'); // group end + } + Some(&next) => { + chars.next(); + result.push('\\'); + result.push(next); // preserve other escaped characters + } + None => { + result.push('\\'); // trailing backslash, keep it + } + } + } else { + match c { + '+' | '?' | '{' | '}' | '|' | '(' | ')' => { + result.push('\\'); // escape unsupported ERE metacharacters + result.push(c); + } + _ => result.push(c), + } + } + } + + result +} + /// Compile the provided regular expression string into a corresponding engine. fn compile_regex( lines: &ScriptLineProvider, line: &ScriptCharProvider, pattern: &str, + context: &ProcessingContext, icase: bool, ) -> UResult { if pattern.is_empty() { @@ -411,15 +459,24 @@ fn compile_regex( } }) } else { + // Convert basic to extended regular expression if needed. 
+ let ere_pattern = if context.regex_extended { + pattern + } else { + &bre_to_ere(pattern) + }; + + // Add case-insensitive modifier if needed. let full_pattern = if icase { - if pattern.is_empty() { + if ere_pattern.is_empty() { return compilation_error(lines, line, "cannot specify a modifier on an empty RE"); } - format!("(?i){}", pattern) + format!("(?i){}", ere_pattern) } else { - pattern.to_string() + ere_pattern.to_string() }; + // Compile into engine. let compiled = Regex::new(&full_pattern).map_err(|e| { compilation_error::(lines, line, format!("invalid regex '{}': {}", pattern, e)) .unwrap_err() @@ -452,6 +509,7 @@ pub fn compile_replacement( // Line continuation if line.eol() { if let Some(next_line_string) = lines.next_line()? { + literal.push('\n'); *line = ScriptCharProvider::new(&next_line_string); continue; } else { @@ -537,6 +595,7 @@ pub fn compile_subst_command( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, + context: &ProcessingContext, ) -> UResult { line.advance(); // move past 's' @@ -559,7 +618,7 @@ pub fn compile_subst_command( print_flag: false, ignore_case: false, write_file: None, - regex: compile_regex(lines, line, &pattern, false)?, // temp compile + regex: compile_regex(lines, line, &pattern, context, false)?, // temp compile line_number: lines.get_line_number(), replacement: ReplacementTemplate::default(), }); @@ -568,7 +627,7 @@ pub fn compile_subst_command( compile_subst_flags(lines, line, &mut subst)?; // Recompile regex with actual ignore_case flag - subst.regex = compile_regex(lines, line, &pattern, subst.ignore_case)?; + subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; line.eat_spaces(); if !line.eol() && line.current() == ';' { @@ -724,6 +783,7 @@ fn compile_command( line: &mut ScriptCharProvider, cmd: &mut Rc>, cmd_spec: &'static CommandSpec, + context: &ProcessingContext, ) -> UResult { let mut cmd = cmd.borrow_mut(); cmd.code = line.current(); @@ -738,7 
+798,7 @@ fn compile_command( } CommandArgs::Substitute => { // s - return compile_subst_command(lines, line, &mut cmd); + return compile_subst_command(lines, line, &mut cmd, context); } // TODO CommandArgs::Text => { // a c i @@ -812,6 +872,15 @@ mod tests { (lines, line) } + /// Return a default ProcessingContext for use in tests. + + fn ctx() -> &'static ProcessingContext { + use std::sync::OnceLock; + + static CONTEXT: OnceLock = OnceLock::new(); + CONTEXT.get_or_init(ProcessingContext::default) + } + // lookup_command #[test] fn test_lookup_empty_command() { @@ -1015,7 +1084,7 @@ mod tests { #[test] fn test_compile_re_basic() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", false).unwrap(); + let regex = compile_regex(&lines, &chars, "abc", ctx(), false).unwrap(); assert!(regex.is_match("abc")); assert!(!regex.is_match("ABC")); } @@ -1023,7 +1092,7 @@ mod tests { #[test] fn test_compile_re_case_insensitive() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", true).unwrap(); + let regex = compile_regex(&lines, &chars, "abc", ctx(), true).unwrap(); assert!(regex.is_match("abc")); assert!(regex.is_match("ABC")); assert!(regex.is_match("AbC")); @@ -1033,11 +1102,11 @@ mod tests { fn test_compile_re_saved_and_reuse() { // Save a regex let (lines1, chars1) = dummy_providers(); - let _ = compile_regex(&lines1, &chars1, "abc", false).unwrap(); + let _ = compile_regex(&lines1, &chars1, "abc", ctx(), false).unwrap(); // Now try to reuse it let (lines2, chars2) = dummy_providers(); - let reused = compile_regex(&lines2, &chars2, "", false).unwrap(); + let reused = compile_regex(&lines2, &chars2, "", ctx(), false).unwrap(); assert!(reused.is_match("abc")); } @@ -1050,14 +1119,14 @@ mod tests { }); let (lines, chars) = dummy_providers(); - let result = compile_regex(&lines, &chars, "", false); + let result = compile_regex(&lines, &chars, "", ctx(), false); assert!(result.is_err()); // 
Should fail because nothing was saved } #[test] fn test_compile_re_invalid() { let (lines, chars) = dummy_providers(); - let result = compile_regex(&lines, &chars, "a[d", false); + let result = compile_regex(&lines, &chars, "a[d", ctx(), false); assert!(result.is_err()); // Should fail due to open bracketed expression } @@ -1065,7 +1134,7 @@ mod tests { #[test] fn test_compile_addr_line_number() { let (lines, mut chars) = make_providers("42"); - let addr = compile_address(&lines, &mut chars).unwrap(); + let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Line)); if let AddressValue::LineNumber(n) = addr.value { assert_eq!(n, 42); @@ -1077,7 +1146,7 @@ mod tests { #[test] fn test_compile_addr_relative_line() { let (lines, mut chars) = make_providers("+7"); - let addr = compile_address(&lines, &mut chars).unwrap(); + let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::RelLine)); if let AddressValue::LineNumber(n) = addr.value { assert_eq!(n, 7); @@ -1089,14 +1158,14 @@ mod tests { #[test] fn test_compile_addr_last_line() { let (lines, mut chars) = make_providers("$"); - let addr = compile_address(&lines, &mut chars).unwrap(); + let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Last)); } #[test] fn test_compile_addr_regex() { let (lines, mut chars) = make_providers("/hello/"); - let addr = compile_address(&lines, &mut chars).unwrap(); + let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("hello")); @@ -1108,7 +1177,7 @@ mod tests { #[test] fn test_compile_addr_regex_other_delimiter() { let (lines, mut chars) = make_providers("\\#hello#"); - let addr = compile_address(&lines, &mut chars).unwrap(); + let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); 
assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("hello")); @@ -1120,7 +1189,7 @@ mod tests { #[test] fn test_compile_addr_regex_with_modifier() { let (lines, mut chars) = make_providers("/hello/I"); - let addr = compile_address(&lines, &mut chars).unwrap(); + let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("HELLO")); // case-insensitive @@ -1133,11 +1202,11 @@ mod tests { fn test_compile_addr_empty_regex_saved() { // First save a regex let (lines1, mut chars1) = make_providers("/saved/"); - let _ = compile_address(&lines1, &mut chars1).unwrap(); + let _ = compile_address(&lines1, &mut chars1, ctx()).unwrap(); // Then reuse it with empty regex let (lines2, mut chars2) = make_providers("//"); - let addr = compile_address(&lines2, &mut chars2).unwrap(); + let addr = compile_address(&lines2, &mut chars2, ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("saved")); @@ -1151,7 +1220,7 @@ mod tests { fn test_compile_single_line_address() { let (lines, mut chars) = make_providers("42"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1164,7 +1233,7 @@ mod tests { fn test_compile_relative_address_range() { let (lines, mut chars) = make_providers("2,+3"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 2); @@ -1193,7 +1262,7 @@ mod tests { fn test_compile_last_address() { let (lines, mut chars) = 
make_providers("$"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1206,7 +1275,7 @@ mod tests { fn test_compile_absolute_address_range() { let (lines, mut chars) = make_providers("5,10"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 2); assert!(matches!( @@ -1223,7 +1292,7 @@ mod tests { fn test_compile_regex_address() { let (lines, mut chars) = make_providers("/foo/"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1242,7 +1311,7 @@ mod tests { fn test_compile_regex_address_range_other_delimiter() { let (lines, mut chars) = make_providers("\\#foo# , \\|bar|"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 2); @@ -1273,7 +1342,7 @@ mod tests { fn test_compile_regex_with_modifier() { let (lines, mut chars) = make_providers("/foo/I"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1293,12 +1362,12 @@ mod tests { // First save a regex let (lines1, mut chars1) = make_providers("/abc/"); let mut cmd1 = Rc::new(RefCell::new(Command::default())); - 
compile_address_range(&lines1, &mut chars1, &mut cmd1).unwrap(); + compile_address_range(&lines1, &mut chars1, &mut cmd1, ctx()).unwrap(); // Now reuse it let (lines2, mut chars2) = make_providers("//"); let mut cmd2 = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2).unwrap(); + let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2, ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1319,14 +1388,10 @@ mod tests { ScriptLineProvider::new(input) } - fn make_processing_context() -> ProcessingContext { - ProcessingContext::default() - } - #[test] fn test_compile_thread_empty_input() { let mut provider = make_provider(&[]); - let mut opts = make_processing_context(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1335,7 +1400,7 @@ mod tests { #[test] fn test_compile_thread_comment_only() { let mut provider = make_provider(&["# comment", " ", ";;"]); - let mut opts = make_processing_context(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1344,7 +1409,7 @@ mod tests { #[test] fn test_compile_thread_single_command() { let mut provider = make_provider(&["42q"]); - let mut opts = make_processing_context(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1368,7 +1433,7 @@ mod tests { #[test] fn test_compile_thread_non_selected_single_command() { let mut provider = make_provider(&["42!p"]); - let mut opts = make_processing_context(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1392,7 +1457,7 @@ mod tests { #[test] fn test_compile_thread_multiple_lines() { let mut provider = make_provider(&["1q", "2d"]); - let mut opts = make_processing_context(); + let mut opts = ctx(); let result = 
compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1408,7 +1473,7 @@ mod tests { #[test] fn test_compile_thread_single_line_multiple_commands() { let mut provider = make_provider(&["1q;2d"]); - let mut opts = make_processing_context(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1450,7 +1515,6 @@ mod tests { fn test_compile_replacement_literal() { let (mut lines, mut chars) = make_providers("/hello/"); let template = compile_replacement(&mut lines, &mut chars).unwrap(); - dbg!(&template); assert_eq!(template.parts.len(), 1); assert!(matches!(&template.parts[0], ReplacementPart::Literal(s) if s == "hello")); @@ -1517,7 +1581,7 @@ mod tests { assert_eq!(template.parts.len(), 1); assert!(matches!( &template.parts[0], - ReplacementPart::Literal(s) if s == "first line continued" + ReplacementPart::Literal(s) if s == "first line\n continued" )); } @@ -1625,7 +1689,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s\\foo\\bar\\"); let mut cmd = Command::default(); - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap_err(); assert!(err .to_string() .contains("substitute pattern cannot be delimited")); @@ -1636,7 +1700,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s//bar/"); let mut cmd = Command::default(); - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap_err(); assert!(err.to_string().contains("unterminated substitute pattern")); } @@ -1645,7 +1709,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/x"); let mut cmd = Command::default(); - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, 
ctx()).unwrap_err(); assert!(err.to_string().contains("invalid substitute flag")); } @@ -1654,7 +1718,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/;"); let mut cmd = Command::default(); - let result = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap(); + let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap(); assert!(matches!(result, ContinueAction::NextChar)); if let CommandData::Substitution(subst) = &cmd.data { @@ -1669,7 +1733,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/"); let mut cmd = Command::default(); - let result = compile_subst_command(&mut lines, &mut chars, &mut cmd).unwrap(); + let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap(); assert!(matches!(result, ContinueAction::NextLine)); match &cmd.data { @@ -1682,4 +1746,38 @@ mod tests { _ => panic!("Expected CommandData::Substitution"), } } + + // bre_to_ere + #[test] + fn test_bre_group_translation() { + assert_eq!(bre_to_ere(r"\(abc\)"), "(abc)"); + assert_eq!(bre_to_ere(r"a\(b\)c"), "a(b)c"); + } + + #[test] + fn test_ere_metacharacters_escaped() { + assert_eq!(bre_to_ere(r"a+b?c{1}|(d)"), r"a\+b\?c\{1\}\|\(d\)"); + } + + #[test] + fn test_literal_backslashes_preserved() { + assert_eq!(bre_to_ere(r"foo\\bar"), r"foo\\bar"); + assert_eq!(bre_to_ere(r"\."), r"\."); + } + + #[test] + fn test_character_classes_unchanged() { + assert_eq!(bre_to_ere(r"[a-z]"), "[a-z]"); + assert_eq!(bre_to_ere(r"[^0-9]"), "[^0-9]"); + } + + #[test] + fn test_anchors_and_dot_and_star() { + assert_eq!(bre_to_ere(r"^a.*b$"), "^a.*b$"); + } + + #[test] + fn test_trailing_backslash_is_preserved() { + assert_eq!(bre_to_ere(r"abc\"), r"abc\"); + } } diff --git a/src/uu/sed/src/delimited_parser.rs b/src/uu/sed/src/delimited_parser.rs index 625e655e..cd754d4e 100644 --- a/src/uu/sed/src/delimited_parser.rs +++ b/src/uu/sed/src/delimited_parser.rs @@ -783,6 +783,14 @@ mod tests { 
assert_eq!(line.current(), '/'); } + #[test] + fn test_regex_with_capture() { + let (lines, mut line) = make_providers(r"/\(.\)/c/"); + let parsed = parse_regex(&lines, &mut line).unwrap(); + assert_eq!(parsed, r"\(.\)"); + assert_eq!(line.current(), '/'); + } + #[test] fn test_regex_with_escape_sequence() { let (lines, mut line) = make_providers("/ab\\n/"); diff --git a/src/uu/sed/src/script_char_provider.rs b/src/uu/sed/src/script_char_provider.rs index 52b54e01..a3e3a85f 100644 --- a/src/uu/sed/src/script_char_provider.rs +++ b/src/uu/sed/src/script_char_provider.rs @@ -8,6 +8,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +#[derive(Debug)] pub struct ScriptCharProvider { line: Vec, pos: usize, diff --git a/src/uu/sed/src/script_line_provider.rs b/src/uu/sed/src/script_line_provider.rs index 24841d5d..e5876a5a 100644 --- a/src/uu/sed/src/script_line_provider.rs +++ b/src/uu/sed/src/script_line_provider.rs @@ -9,9 +9,11 @@ // file that was distributed with this source code. use crate::command::ScriptValue; +use std::fmt; use std::fs::File; use std::io::{self, BufRead, BufReader}; +#[derive(Debug)] /// The provider of script lines across all specified scripts /// Scripts can be specified to sed as files or as strings. 
pub struct ScriptLineProvider { @@ -19,7 +21,7 @@ pub struct ScriptLineProvider { state: State, } -// Encapsulation of the script line provider's state +/// Encapsulation of the script line provider's state enum State { NotStarted, // Processing has not yet started Active { @@ -75,6 +77,10 @@ impl ScriptLineProvider { Some(*index + 1) // finished reading this source } else { *line_number += 1; + // Remove trailing newline + if line.ends_with('\n') { + line.pop(); + } return Ok(Some(line)); } } @@ -139,6 +145,27 @@ impl ScriptLineProvider { } } +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + State::NotStarted => f.debug_struct("NotStarted").finish(), + State::Done => f.debug_struct("Done").finish(), + State::Active { + index, + input_name, + line_number, + .. + } => f + .debug_struct("Active") + .field("index", index) + .field("input_name", input_name) + .field("line_number", line_number) + .field("reader", &"") + .finish(), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 50099fa7..a75ecdc3 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -177,7 +177,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { ProcessingContext { all_output_files: matches.get_flag("all-output-files"), debug: matches.get_flag("debug"), - regexp_extended: matches.get_flag("regexp-extended"), + regex_extended: matches.get_flag("regexp-extended"), follow_symlinks: matches.get_flag("follow-symlinks"), in_place: matches.contains_id("in-place"), in_place_suffix: matches.get_one::("in-place").and_then(|s| { @@ -316,7 +316,7 @@ mod tests { assert!(!ctx.all_output_files); assert!(!ctx.debug); - assert!(!ctx.regexp_extended); + assert!(!ctx.regex_extended); assert!(!ctx.follow_symlinks); assert!(!ctx.in_place); assert_eq!(ctx.in_place_suffix, None); @@ -351,7 +351,7 @@ mod tests { assert!(ctx.all_output_files); assert!(ctx.debug); - 
assert!(ctx.regexp_extended); + assert!(ctx.regex_extended); assert!(ctx.follow_symlinks); assert!(ctx.in_place); assert!(ctx.in_place_suffix.is_none()); diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 7c6dac2a..02827453 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -191,3 +191,31 @@ check_output!( addr_pattern_to_pattern_negate, ["-n", "/1_4/,/10/!p", LINES1] ); + + +// Test substitutions +check_output!(subst_any, ["-e", r"s/./X/g", LINES1]); +check_output!(subst_any_global, ["-e", r"s,.,X,g", LINES1]); +check_output!(subst_escaped_magic_separator, ["-e", r"s.\..X.g", LINES1]); +check_output!(subst_escaped_braced_separator, ["-e", r"s/[\/]/Q/", LINES1]); +check_output!(subst_escaped_separator, ["-e", r"s_\__X_", LINES1]); +check_output!(subst_whole_match_group, ["-e", r"s/./(&)/g", LINES1]); +check_output!( + subst_escaped_whole_match_group, + ["-e", r"s/./(\&)/g", LINES1] +); +check_output!( + subst_numerical_groups, + ["-e", r"s/\(.\)\(.\)\(.\)/x\3x\2x\1/g", LINES1] +); +check_output!( + subst_ere_numerical_groups, + ["--regexp-extended", "-e", r"s/(.)(.)(.)/x\3x\2x\1/g", LINES1] +); +check_output!(subst_multiline, ["-e", "s/_/u0\\\nu1\\\nu2/g", LINES1]); +check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); + +#[cfg(unix)] +check_output!(subst_write_file, ["-e", r"s/1/X/w /dev/stdout", LINES1]); + +check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); diff --git a/tests/fixtures/sed/output/subst_any b/tests/fixtures/sed/output/subst_any new file mode 100644 index 00000000..dcf124b4 --- /dev/null +++ b/tests/fixtures/sed/output/subst_any @@ -0,0 +1,14 @@ +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXXX +XXXXX +XXXXX +XXXXX +XXXXX diff --git a/tests/fixtures/sed/output/subst_any_global b/tests/fixtures/sed/output/subst_any_global new file mode 100644 index 00000000..dcf124b4 --- /dev/null +++ b/tests/fixtures/sed/output/subst_any_global @@ -0,0 +1,14 @@ +XXXX +XXXX +XXXX 
+XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXXX +XXXXX +XXXXX +XXXXX +XXXXX diff --git a/tests/fixtures/sed/output/subst_brace b/tests/fixtures/sed/output/subst_brace new file mode 100644 index 00000000..ded4e981 --- /dev/null +++ b/tests/fixtures/sed/output/subst_brace @@ -0,0 +1,14 @@ +lX_X +lX_X +lX_X +lX_4 +lX_5 +lX_6 +lX_7 +lX_8 +lX_9 +lX_X0 +lX_XX +lX_XX +lX_XX +lX_X4 diff --git a/tests/fixtures/sed/output/subst_ere_numerical_groups b/tests/fixtures/sed/output/subst_ere_numerical_groups new file mode 100644 index 00000000..08bbf8cf --- /dev/null +++ b/tests/fixtures/sed/output/subst_ere_numerical_groups @@ -0,0 +1,14 @@ +x_x1xl1 +x_x1xl2 +x_x1xl3 +x_x1xl4 +x_x1xl5 +x_x1xl6 +x_x1xl7 +x_x1xl8 +x_x1xl9 +x_x1xl10 +x_x1xl11 +x_x1xl12 +x_x1xl13 +x_x1xl14 diff --git a/tests/fixtures/sed/output/subst_escaped_braced_separator b/tests/fixtures/sed/output/subst_escaped_braced_separator new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/output/subst_escaped_braced_separator @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/subst_escaped_magic_separator b/tests/fixtures/sed/output/subst_escaped_magic_separator new file mode 100644 index 00000000..dcf124b4 --- /dev/null +++ b/tests/fixtures/sed/output/subst_escaped_magic_separator @@ -0,0 +1,14 @@ +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXX +XXXXX +XXXXX +XXXXX +XXXXX +XXXXX diff --git a/tests/fixtures/sed/output/subst_escaped_separator b/tests/fixtures/sed/output/subst_escaped_separator new file mode 100644 index 00000000..833e1ba4 --- /dev/null +++ b/tests/fixtures/sed/output/subst_escaped_separator @@ -0,0 +1,14 @@ +l1X1 +l1X2 +l1X3 +l1X4 +l1X5 +l1X6 +l1X7 +l1X8 +l1X9 +l1X10 +l1X11 +l1X12 +l1X13 +l1X14 diff --git a/tests/fixtures/sed/output/subst_escaped_whole_match_group b/tests/fixtures/sed/output/subst_escaped_whole_match_group new file mode 100644 index 00000000..0ff0b9e7 --- 
/dev/null +++ b/tests/fixtures/sed/output/subst_escaped_whole_match_group @@ -0,0 +1,14 @@ +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&) +(&)(&)(&)(&)(&) +(&)(&)(&)(&)(&) +(&)(&)(&)(&)(&) +(&)(&)(&)(&)(&) +(&)(&)(&)(&)(&) diff --git a/tests/fixtures/sed/output/subst_multiline b/tests/fixtures/sed/output/subst_multiline new file mode 100644 index 00000000..e0cd8b50 --- /dev/null +++ b/tests/fixtures/sed/output/subst_multiline @@ -0,0 +1,42 @@ +l1u0 +u1 +u21 +l1u0 +u1 +u22 +l1u0 +u1 +u23 +l1u0 +u1 +u24 +l1u0 +u1 +u25 +l1u0 +u1 +u26 +l1u0 +u1 +u27 +l1u0 +u1 +u28 +l1u0 +u1 +u29 +l1u0 +u1 +u210 +l1u0 +u1 +u211 +l1u0 +u1 +u212 +l1u0 +u1 +u213 +l1u0 +u1 +u214 diff --git a/tests/fixtures/sed/output/subst_numbered_replacement b/tests/fixtures/sed/output/subst_numbered_replacement new file mode 100644 index 00000000..86f75c15 --- /dev/null +++ b/tests/fixtures/sed/output/subst_numbered_replacement @@ -0,0 +1,14 @@ +l1_X +l1_X +l1_X +l1_X +l1_X +l1_X +l1_X +l1_X +l1_X +l1_X0 +l1_X1 +l1_X2 +l1_X3 +l1_X4 diff --git a/tests/fixtures/sed/output/subst_numerical_groups b/tests/fixtures/sed/output/subst_numerical_groups new file mode 100644 index 00000000..08bbf8cf --- /dev/null +++ b/tests/fixtures/sed/output/subst_numerical_groups @@ -0,0 +1,14 @@ +x_x1xl1 +x_x1xl2 +x_x1xl3 +x_x1xl4 +x_x1xl5 +x_x1xl6 +x_x1xl7 +x_x1xl8 +x_x1xl9 +x_x1xl10 +x_x1xl11 +x_x1xl12 +x_x1xl13 +x_x1xl14 diff --git a/tests/fixtures/sed/output/subst_whole_match_group b/tests/fixtures/sed/output/subst_whole_match_group new file mode 100644 index 00000000..47c08aeb --- /dev/null +++ b/tests/fixtures/sed/output/subst_whole_match_group @@ -0,0 +1,14 @@ +(l)(1)(_)(1) +(l)(1)(_)(2) +(l)(1)(_)(3) +(l)(1)(_)(4) +(l)(1)(_)(5) +(l)(1)(_)(6) +(l)(1)(_)(7) +(l)(1)(_)(8) +(l)(1)(_)(9) +(l)(1)(_)(1)(0) +(l)(1)(_)(1)(1) +(l)(1)(_)(1)(2) +(l)(1)(_)(1)(3) +(l)(1)(_)(1)(4) diff --git a/tests/fixtures/sed/output/subst_write_file 
b/tests/fixtures/sed/output/subst_write_file new file mode 100644 index 00000000..70a0f11d --- /dev/null +++ b/tests/fixtures/sed/output/subst_write_file @@ -0,0 +1,28 @@ +lX_1 +lX_2 +lX_3 +lX_4 +lX_5 +lX_6 +lX_7 +lX_8 +lX_9 +lX_10 +lX_11 +lX_12 +lX_13 +lX_14 +lX_1 +lX_2 +lX_3 +lX_4 +lX_5 +lX_6 +lX_7 +lX_8 +lX_9 +lX_10 +lX_11 +lX_12 +lX_13 +lX_14 From ca5a7dafff6e5353e322d4edf792c1439b8b1bd7 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 12:36:18 +0300 Subject: [PATCH 48/85] Add test for empty RE reuse --- tests/by-util/test_sed.rs | 1 + tests/fixtures/sed/output/addr_empty_re_reuse | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 tests/fixtures/sed/output/addr_empty_re_reuse diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 02827453..fa65ee59 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -191,6 +191,7 @@ check_output!( addr_pattern_to_pattern_negate, ["-n", "/1_4/,/10/!p", LINES1] ); +check_output!(addr_empty_re_reuse, ["-n", "/_2/,//p", LINES1, LINES2]); // Test substitutions diff --git a/tests/fixtures/sed/output/addr_empty_re_reuse b/tests/fixtures/sed/output/addr_empty_re_reuse new file mode 100644 index 00000000..feeeea41 --- /dev/null +++ b/tests/fixtures/sed/output/addr_empty_re_reuse @@ -0,0 +1,15 @@ +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 From a90bfe618d214814accd450a33cd39cea312c752 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 12:50:04 +0300 Subject: [PATCH 49/85] Test extended regular expression use --- tests/by-util/test_sed.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index fa65ee59..dd40fff9 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -193,7 +193,6 @@ check_output!( ); check_output!(addr_empty_re_reuse, ["-n", "/_2/,//p", LINES1, LINES2]); - // Test 
substitutions check_output!(subst_any, ["-e", r"s/./X/g", LINES1]); check_output!(subst_any_global, ["-e", r"s,.,X,g", LINES1]); @@ -211,7 +210,12 @@ check_output!( ); check_output!( subst_ere_numerical_groups, - ["--regexp-extended", "-e", r"s/(.)(.)(.)/x\3x\2x\1/g", LINES1] + [ + "--regexp-extended", + "-e", + r"s/(.)(.)(.)/x\3x\2x\1/g", + LINES1 + ] ); check_output!(subst_multiline, ["-e", "s/_/u0\\\nu1\\\nu2/g", LINES1]); check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); From e37ac8fc92068c505f618c5f74c5234a984ffd20 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 12:50:36 +0300 Subject: [PATCH 50/85] Improve I/O file access error reporting --- src/uu/sed/src/compiler.rs | 2 +- src/uu/sed/src/named_writer.rs | 7 ++++++- src/uu/sed/src/processor.rs | 9 +++++++-- src/uu/sed/src/script_line_provider.rs | 12 +++++++++--- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 7811dd90..f38d8354 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -233,7 +233,7 @@ fn compile_thread( let mut next_p = &mut head; 'next_line: loop { - match lines.next_line().unwrap() { + match lines.next_line()? 
{ None => { // TODO: Error if stack isn't empty return Ok(head); diff --git a/src/uu/sed/src/named_writer.rs b/src/uu/sed/src/named_writer.rs index b145abda..26482acf 100644 --- a/src/uu/sed/src/named_writer.rs +++ b/src/uu/sed/src/named_writer.rs @@ -36,7 +36,12 @@ impl NamedWriter { .write(true) .truncate(true) .open(&path) - .map_err(|e| USimpleError::new(2, format!("{}: {}", path.display(), e)))?; + .map_err(|e| { + USimpleError::new( + 2, + format!("Error opening output file {}: {}", path.display(), e), + ) + })?; let writer = Rc::new(RefCell::new(NamedWriter { path, diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index c1c603c1..23964d7f 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -18,7 +18,7 @@ use atty::Stream; use std::cell::RefCell; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::UResult; +use uucore::error::{UResult, USimpleError}; /// Return true if the passed address matches the current I/O context. fn match_address( @@ -349,7 +349,12 @@ pub fn process_all_files( for (index, path) in files.iter().enumerate() { context.last_file = index == last_file_index; - let mut reader = LineReader::open(path)?; + let mut reader = LineReader::open(path).map_err(|e| { + USimpleError::new( + 2, + format!("Error opening input file {}: {}", path.display(), e), + ) + })?; let output = in_place.begin(path)?; if context.separate { diff --git a/src/uu/sed/src/script_line_provider.rs b/src/uu/sed/src/script_line_provider.rs index e5876a5a..f287e982 100644 --- a/src/uu/sed/src/script_line_provider.rs +++ b/src/uu/sed/src/script_line_provider.rs @@ -12,6 +12,7 @@ use crate::command::ScriptValue; use std::fmt; use std::fs::File; use std::io::{self, BufRead, BufReader}; +use uucore::error::{UResult, USimpleError}; #[derive(Debug)] /// The provider of script lines across all specified scripts @@ -59,7 +60,7 @@ impl ScriptLineProvider { } /// Return the next script line to process across all scripts. 
- pub fn next_line(&mut self) -> io::Result> { + pub fn next_line(&mut self) -> UResult> { let mut line = String::new(); loop { @@ -96,7 +97,7 @@ impl ScriptLineProvider { } // Move to the next available script source. - fn advance_source(&mut self, next_index: usize) -> io::Result<()> { + fn advance_source(&mut self, next_index: usize) -> UResult<()> { if next_index >= self.sources.len() { self.state = State::Done; return Ok(()); @@ -130,7 +131,12 @@ impl ScriptLineProvider { line_number: 0, }; } else { - let file = File::open(p)?; + let file = File::open(p).map_err(|e| { + USimpleError::new( + 2, + format!("Error opening script file {}: {}", p.display(), e), + ) + })?; self.state = State::Active { index: next_index, reader: Box::new(BufReader::new(file)), From 9e589d1d13787861353a6a71752ba9f0789c2a08 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 13:09:23 +0300 Subject: [PATCH 51/85] Move saved regex into processing context This simplifies its handling. --- src/uu/sed/src/command.rs | 2 + src/uu/sed/src/compiler.rs | 107 ++++++++++++++++--------------------- src/uu/sed/src/sed.rs | 2 + 3 files changed, 51 insertions(+), 60 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index fdc86702..7e0bc792 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -49,6 +49,8 @@ pub struct ProcessingContext { pub last_line: bool, /// True if the file is the last file of the ones specified pub last_file: bool, + /// Previously compiled RE, saved for reuse when specifying an empty RE + pub saved_regex: RefCell>, } #[derive(Debug, PartialEq)] diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index f38d8354..56fd7550 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -24,11 +24,6 @@ use std::path::PathBuf; use std::rc::Rc; use uucore::error::UResult; -thread_local! { - /// The previously saved RE. It is reused when specifying an empty one. 
- static SAVED_REGEX: RefCell> = const { RefCell::new(None) }; -} - // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); @@ -451,13 +446,12 @@ fn compile_regex( icase: bool, ) -> UResult { if pattern.is_empty() { - SAVED_REGEX.with(|cell| { - if let Some(existing) = &*cell.borrow() { - Ok(existing.clone()) - } else { - compilation_error(lines, line, "no previously compiled regex available") - } - }) + let maybe_existing = context.saved_regex.borrow(); + if let Some(existing) = &*maybe_existing { + Ok(existing.clone()) + } else { + compilation_error(lines, line, "no previously compiled regex available") + } } else { // Convert basic to extended regular expression if needed. let ere_pattern = if context.regex_extended { @@ -482,9 +476,8 @@ fn compile_regex( .unwrap_err() })?; - SAVED_REGEX.with(|cell| { - *cell.borrow_mut() = Some(compiled.clone()); - }); + *context.saved_regex.borrow_mut() = Some(compiled.clone()); + Ok(compiled) } } @@ -873,12 +866,8 @@ mod tests { } /// Return a default ProcessingContext for use in tests. 
- - fn ctx() -> &'static ProcessingContext { - use std::sync::OnceLock; - - static CONTEXT: OnceLock = OnceLock::new(); - CONTEXT.get_or_init(ProcessingContext::default) + pub fn ctx() -> ProcessingContext { + ProcessingContext::default() } // lookup_command @@ -1084,7 +1073,7 @@ mod tests { #[test] fn test_compile_re_basic() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", ctx(), false).unwrap(); + let regex = compile_regex(&lines, &chars, "abc", &ctx(), false).unwrap(); assert!(regex.is_match("abc")); assert!(!regex.is_match("ABC")); } @@ -1092,7 +1081,7 @@ mod tests { #[test] fn test_compile_re_case_insensitive() { let (lines, chars) = dummy_providers(); - let regex = compile_regex(&lines, &chars, "abc", ctx(), true).unwrap(); + let regex = compile_regex(&lines, &chars, "abc", &ctx(), true).unwrap(); assert!(regex.is_match("abc")); assert!(regex.is_match("ABC")); assert!(regex.is_match("AbC")); @@ -1100,33 +1089,29 @@ mod tests { #[test] fn test_compile_re_saved_and_reuse() { + let context = ctx(); // Save a regex let (lines1, chars1) = dummy_providers(); - let _ = compile_regex(&lines1, &chars1, "abc", ctx(), false).unwrap(); + let _ = compile_regex(&lines1, &chars1, "abc", &context, false).unwrap(); // Now try to reuse it let (lines2, chars2) = dummy_providers(); - let reused = compile_regex(&lines2, &chars2, "", ctx(), false).unwrap(); + let reused = compile_regex(&lines2, &chars2, "", &context, false).unwrap(); assert!(reused.is_match("abc")); } #[test] fn test_compile_re_empty_and_not_saved() { - // Clear saved regex - SAVED_REGEX.with(|cell| { - *cell.borrow_mut() = None; - }); - let (lines, chars) = dummy_providers(); - let result = compile_regex(&lines, &chars, "", ctx(), false); + let result = compile_regex(&lines, &chars, "", &ctx(), false); assert!(result.is_err()); // Should fail because nothing was saved } #[test] fn test_compile_re_invalid() { let (lines, chars) = dummy_providers(); - let result = 
compile_regex(&lines, &chars, "a[d", ctx(), false); + let result = compile_regex(&lines, &chars, "a[d", &ctx(), false); assert!(result.is_err()); // Should fail due to open bracketed expression } @@ -1134,7 +1119,7 @@ mod tests { #[test] fn test_compile_addr_line_number() { let (lines, mut chars) = make_providers("42"); - let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Line)); if let AddressValue::LineNumber(n) = addr.value { assert_eq!(n, 42); @@ -1146,7 +1131,7 @@ mod tests { #[test] fn test_compile_addr_relative_line() { let (lines, mut chars) = make_providers("+7"); - let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::RelLine)); if let AddressValue::LineNumber(n) = addr.value { assert_eq!(n, 7); @@ -1158,14 +1143,14 @@ mod tests { #[test] fn test_compile_addr_last_line() { let (lines, mut chars) = make_providers("$"); - let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Last)); } #[test] fn test_compile_addr_regex() { let (lines, mut chars) = make_providers("/hello/"); - let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("hello")); @@ -1177,7 +1162,7 @@ mod tests { #[test] fn test_compile_addr_regex_other_delimiter() { let (lines, mut chars) = make_providers("\\#hello#"); - let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { 
assert!(re.is_match("hello")); @@ -1189,7 +1174,7 @@ mod tests { #[test] fn test_compile_addr_regex_with_modifier() { let (lines, mut chars) = make_providers("/hello/I"); - let addr = compile_address(&lines, &mut chars, ctx()).unwrap(); + let addr = compile_address(&lines, &mut chars, &ctx()).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("HELLO")); // case-insensitive @@ -1200,13 +1185,14 @@ mod tests { #[test] fn test_compile_addr_empty_regex_saved() { + let context = ctx(); // First save a regex let (lines1, mut chars1) = make_providers("/saved/"); - let _ = compile_address(&lines1, &mut chars1, ctx()).unwrap(); + let _ = compile_address(&lines1, &mut chars1, &context).unwrap(); // Then reuse it with empty regex let (lines2, mut chars2) = make_providers("//"); - let addr = compile_address(&lines2, &mut chars2, ctx()).unwrap(); + let addr = compile_address(&lines2, &mut chars2, &context).unwrap(); assert!(matches!(addr.atype, AddressType::Re)); if let AddressValue::Regex(re) = addr.value { assert!(re.is_match("saved")); @@ -1220,7 +1206,7 @@ mod tests { fn test_compile_single_line_address() { let (lines, mut chars) = make_providers("42"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1233,7 +1219,7 @@ mod tests { fn test_compile_relative_address_range() { let (lines, mut chars) = make_providers("2,+3"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 2); @@ -1262,7 +1248,7 @@ mod tests { fn test_compile_last_address() { let (lines, mut chars) = make_providers("$"); let mut 
cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1275,7 +1261,7 @@ mod tests { fn test_compile_absolute_address_range() { let (lines, mut chars) = make_providers("5,10"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 2); assert!(matches!( @@ -1292,7 +1278,7 @@ mod tests { fn test_compile_regex_address() { let (lines, mut chars) = make_providers("/foo/"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1311,7 +1297,7 @@ mod tests { fn test_compile_regex_address_range_other_delimiter() { let (lines, mut chars) = make_providers("\\#foo# , \\|bar|"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 2); @@ -1342,7 +1328,7 @@ mod tests { fn test_compile_regex_with_modifier() { let (lines, mut chars) = make_providers("/foo/I"); let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, ctx()).unwrap(); + let n_addr = compile_address_range(&lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1359,15 +1345,16 @@ mod tests { #[test] fn test_compile_re_reuse_saved() { + let context = ctx(); // First save a regex let (lines1, mut chars1) = 
make_providers("/abc/"); let mut cmd1 = Rc::new(RefCell::new(Command::default())); - compile_address_range(&lines1, &mut chars1, &mut cmd1, ctx()).unwrap(); + compile_address_range(&lines1, &mut chars1, &mut cmd1, &context).unwrap(); // Now reuse it let (lines2, mut chars2) = make_providers("//"); let mut cmd2 = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2, ctx()).unwrap(); + let n_addr = compile_address_range(&lines2, &mut chars2, &mut cmd2, &context).unwrap(); assert_eq!(n_addr, 1); assert!(matches!( @@ -1391,7 +1378,7 @@ mod tests { #[test] fn test_compile_thread_empty_input() { let mut provider = make_provider(&[]); - let mut opts = ctx(); + let mut opts = &ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1400,7 +1387,7 @@ mod tests { #[test] fn test_compile_thread_comment_only() { let mut provider = make_provider(&["# comment", " ", ";;"]); - let mut opts = ctx(); + let mut opts = &ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); assert!(result.is_none()); @@ -1409,7 +1396,7 @@ mod tests { #[test] fn test_compile_thread_single_command() { let mut provider = make_provider(&["42q"]); - let mut opts = ctx(); + let mut opts = &ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1433,7 +1420,7 @@ mod tests { #[test] fn test_compile_thread_non_selected_single_command() { let mut provider = make_provider(&["42!p"]); - let mut opts = ctx(); + let mut opts = &ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1457,7 +1444,7 @@ mod tests { #[test] fn test_compile_thread_multiple_lines() { let mut provider = make_provider(&["1q", "2d"]); - let mut opts = ctx(); + let mut opts = &ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1473,7 +1460,7 @@ mod tests { #[test] fn 
test_compile_thread_single_line_multiple_commands() { let mut provider = make_provider(&["1q;2d"]); - let mut opts = ctx(); + let mut opts = &ctx(); let result = compile_thread(&mut provider, &mut opts).unwrap(); let binding = result.unwrap(); @@ -1689,7 +1676,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s\\foo\\bar\\"); let mut cmd = Command::default(); - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap_err(); + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); assert!(err .to_string() .contains("substitute pattern cannot be delimited")); @@ -1700,7 +1687,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s//bar/"); let mut cmd = Command::default(); - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap_err(); + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); assert!(err.to_string().contains("unterminated substitute pattern")); } @@ -1709,7 +1696,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/x"); let mut cmd = Command::default(); - let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap_err(); + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); assert!(err.to_string().contains("invalid substitute flag")); } @@ -1718,7 +1705,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/;"); let mut cmd = Command::default(); - let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, ctx()).unwrap(); + let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert!(matches!(result, ContinueAction::NextChar)); if let CommandData::Substitution(subst) = &cmd.data { @@ -1733,7 +1720,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/"); let mut cmd = Command::default(); - let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, 
ctx()).unwrap(); + let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap(); assert!(matches!(result, ContinueAction::NextLine)); match &cmd.data { diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index a75ecdc3..c56242a9 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -22,6 +22,7 @@ use crate::command::{ProcessingContext, ScriptValue}; use crate::compiler::compile; use crate::processor::process_all_files; use clap::{arg, Arg, ArgMatches, Command}; +use std::cell::RefCell; use std::path::PathBuf; use uucore::error::{UResult, UUsageError}; use uucore::format_usage; @@ -203,6 +204,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { last_address: false, last_line: false, last_file: false, + saved_regex: const { RefCell::new(None) }, } } From 13d067dafed731497b5be4922689bfa9eb01083d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 13:25:46 +0300 Subject: [PATCH 52/85] Improve substitution default handling --- src/uu/sed/src/command.rs | 2 +- src/uu/sed/src/compiler.rs | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 7e0bc792..cb4598e9 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -154,7 +154,7 @@ pub struct Substitution { impl Default for Substitution { fn default() -> Self { Substitution { - occurrence: 1, + occurrence: 0, print_flag: false, ignore_case: false, write_file: None, diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 56fd7550..0b6ef39a 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -607,19 +607,14 @@ pub fn compile_subst_command( } let mut subst = Box::new(Substitution { - occurrence: 0, - print_flag: false, - ignore_case: false, - write_file: None, - regex: compile_regex(lines, line, &pattern, context, false)?, // temp compile line_number: lines.get_line_number(), - replacement: 
ReplacementTemplate::default(), + ..Default::default() }); subst.replacement = compile_replacement(lines, line)?; compile_subst_flags(lines, line, &mut subst)?; - // Recompile regex with actual ignore_case flag + // Compile regex with now known ignore_case flag. subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; line.eat_spaces(); @@ -725,7 +720,6 @@ pub fn compile_subst_flags( } subst.write_file = Some(NamedWriter::new(PathBuf::from(path))?); - // NOTE: subst.write_handle is resolved later at runtime return Ok(()); // 'w' is the last flag allowed } From 5c2579c1027826763ce2c5290655b5a59cc793ef Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 13:49:30 +0300 Subject: [PATCH 53/85] Verify capture group references at compile time This is more efficient and provides better diagnostics. --- src/uu/sed/src/command.rs | 59 +++++++++++++++++++++++++++----------- src/uu/sed/src/compiler.rs | 29 +++++++++++++++++++ 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index cb4598e9..8b32ef52 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -19,7 +19,7 @@ use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; // For file descriptors and equivalent use std::rc::Rc; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; // Compilation and processing options provided mostly through the // command-line interface @@ -121,22 +121,32 @@ impl ReplacementTemplate { } ReplacementPart::Group(n) => { - let group_index = *n as usize; - if group_index >= caps.len() { - return Err(USimpleError::new( - 2, - // TODO: Provide code location info - format!("\\{} not defined in the regular expression", n), - )); - } - - result.push_str(caps.get(group_index).map_or("", |m| m.as_str())); + // Compilation guarantees we only get valid group numbers + result.push_str( + caps.get((*n).try_into().unwrap()) + .map_or("", 
|m| m.as_str()), + ); } } } Ok(result) } + + /// Returns the highest capture group number referenced in this template. + pub fn max_group_number(&self) -> u32 { + self.parts + .iter() + .filter_map(|part| { + if let ReplacementPart::Group(n) = part { + Some(*n) + } else { + None + } + }) + .max() + .unwrap_or(0) + } } #[derive(Debug)] @@ -329,15 +339,30 @@ mod tests { assert_eq!(result, "key: x, value: 123"); } + // max_group_number #[test] - // s/(a)(b)/\3/ - fn test_invalid_backreference() { + fn test_max_group_number_with_groups() { let template = ReplacementTemplate { - parts: vec![ReplacementPart::Group(3)], + parts: vec![ + ReplacementPart::Literal("a".into()), + ReplacementPart::Group(2), + ReplacementPart::WholeMatch, + ReplacementPart::Group(5), + ReplacementPart::Literal("z".into()), + ], }; - let caps = caps_for(r"(a)(b)", "ab"); // only groups 1 and 2 exist + assert_eq!(template.max_group_number(), 5); + } - let err = template.apply(&caps).unwrap_err(); - assert!(err.to_string().contains(r"\3 not defined")); + #[test] + fn test_max_group_number_without_groups() { + let template = ReplacementTemplate { + parts: vec![ + ReplacementPart::Literal("no".into()), + ReplacementPart::WholeMatch, + ReplacementPart::Literal("groups".into()), + ], + }; + assert_eq!(template.max_group_number(), 0); } } diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 0b6ef39a..51826d3f 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -617,6 +617,24 @@ pub fn compile_subst_command( // Compile regex with now known ignore_case flag. 
subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; + let re_captures = subst + .regex + .captures_len() + .saturating_sub(1) + .try_into() + .unwrap(); + let max_group_number = subst.replacement.max_group_number(); + if max_group_number > re_captures { + return compilation_error( + lines, + line, + format!( + "group number \\{} is larger than the {} available RE groups", + max_group_number, re_captures + ), + ); + } + line.eat_spaces(); if !line.eol() && line.current() == ';' { line.advance(); @@ -1728,6 +1746,17 @@ mod tests { } } + #[test] + fn test_compile_subst_invalid_group_number() { + let (mut lines, mut chars) = make_providers(r"s/\(.\)\(.\)/\3\2\1/"); + let mut cmd = Command::default(); + + let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); + assert!(err + .to_string() + .contains("group number \\3 is larger than the 2 available RE groups")); + } + // bre_to_ere #[test] fn test_bre_group_translation() { From 10a48acf2083b2f6dd272debba594cb27dce65dd Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 16:54:10 +0300 Subject: [PATCH 54/85] Add missing type annotation --- src/uu/sed/src/compiler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 51826d3f..8e0a3422 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -617,7 +617,7 @@ pub fn compile_subst_command( // Compile regex with now known ignore_case flag. 
subst.regex = compile_regex(lines, line, &pattern, context, subst.ignore_case)?; - let re_captures = subst + let re_captures: u32 = subst .regex .captures_len() .saturating_sub(1) From ab97623f20219b1b9f14c2cc9bf5a09d8def2cc7 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 16:55:30 +0300 Subject: [PATCH 55/85] Fix and make portable the subst_write_file test --- tests/by-util/test_sed.rs | 22 +++++++++++--- tests/fixtures/sed/output/subst_write_file | 34 ++++------------------ 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index dd40fff9..0639be1e 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -8,7 +8,8 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use std::io::Write; +use std::fs; +use std::io::{Read, Write}; use tempfile::NamedTempFile; use uutests::new_ucmd; use uutests::util::TestScenario; @@ -219,8 +220,21 @@ check_output!( ); check_output!(subst_multiline, ["-e", "s/_/u0\\\nu1\\\nu2/g", LINES1]); check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); +check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); -#[cfg(unix)] -check_output!(subst_write_file, ["-e", r"s/1/X/w /dev/stdout", LINES1]); +#[test] +fn subst_write_file() -> std::io::Result<()> { + let temp = NamedTempFile::new()?; + let path = temp.path(); + let cmd = format!("s/_1/S_1/w {}", path.display()); -check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); + new_ucmd!().args(&["-n", &cmd, LINES1]).succeeds(); + + let mut actual = String::new(); + temp.reopen()?.read_to_string(&mut actual)?; + + let expected = fs::read_to_string("tests/fixtures/sed/output/subst_write_file")?; + assert_eq!(actual, expected, "Output did not match fixture"); + + Ok(()) +} diff --git a/tests/fixtures/sed/output/subst_write_file b/tests/fixtures/sed/output/subst_write_file index 
70a0f11d..952ff972 100644 --- a/tests/fixtures/sed/output/subst_write_file +++ b/tests/fixtures/sed/output/subst_write_file @@ -1,28 +1,6 @@ -lX_1 -lX_2 -lX_3 -lX_4 -lX_5 -lX_6 -lX_7 -lX_8 -lX_9 -lX_10 -lX_11 -lX_12 -lX_13 -lX_14 -lX_1 -lX_2 -lX_3 -lX_4 -lX_5 -lX_6 -lX_7 -lX_8 -lX_9 -lX_10 -lX_11 -lX_12 -lX_13 -lX_14 +l1S_1 +l1S_10 +l1S_11 +l1S_12 +l1S_13 +l1S_14 From 8b4b43a4739d7b138e64707b88b36a2399f545dc Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 17:01:04 +0300 Subject: [PATCH 56/85] Add subst_ tests --- tests/by-util/test_sed.rs | 2 ++ tests/fixtures/sed/output/subst_case_insensitive | 14 ++++++++++++++ tests/fixtures/sed/output/subst_print | 6 ++++++ 3 files changed, 22 insertions(+) create mode 100644 tests/fixtures/sed/output/subst_case_insensitive create mode 100644 tests/fixtures/sed/output/subst_print diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 0639be1e..31b68e63 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -201,6 +201,7 @@ check_output!(subst_escaped_magic_separator, ["-e", r"s.\..X.g", LINES1]); check_output!(subst_escaped_braced_separator, ["-e", r"s/[\/]/Q/", LINES1]); check_output!(subst_escaped_separator, ["-e", r"s_\__X_", LINES1]); check_output!(subst_whole_match_group, ["-e", r"s/./(&)/g", LINES1]); +check_output!(subst_print, ["-ne", "s/1_1/S&/p", LINES1]); check_output!( subst_escaped_whole_match_group, ["-e", r"s/./(\&)/g", LINES1] @@ -221,6 +222,7 @@ check_output!( check_output!(subst_multiline, ["-e", "s/_/u0\\\nu1\\\nu2/g", LINES1]); check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); +check_output!(subst_case_insensitive, ["-e", r"s/L/Line/", LINES1]); #[test] fn subst_write_file() -> std::io::Result<()> { diff --git a/tests/fixtures/sed/output/subst_case_insensitive b/tests/fixtures/sed/output/subst_case_insensitive new file mode 100644 index 00000000..3bcc601e --- /dev/null 
+++ b/tests/fixtures/sed/output/subst_case_insensitive @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/subst_print b/tests/fixtures/sed/output/subst_print new file mode 100644 index 00000000..9f44d0c9 --- /dev/null +++ b/tests/fixtures/sed/output/subst_print @@ -0,0 +1,6 @@ +lS1_1 +lS1_10 +lS1_11 +lS1_12 +lS1_13 +lS1_14 From 9c5b956a4e8e1dea3ca7d2f000814c8da79e03a2 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 17:20:03 +0300 Subject: [PATCH 57/85] Update format to current formatter --- src/uu/sed/src/compiler.rs | 30 +++++++++++++++++------------- src/uu/sed/src/processor.rs | 2 +- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 8e0a3422..d8d3913d 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -799,7 +799,7 @@ fn compile_command( return compile_empty_command(lines, line, &mut cmd); } CommandArgs::NonSelect => { // ! - // Implemented at a heigher level. + // Implemented at a higher level. 
} CommandArgs::Substitute => { // s @@ -1636,9 +1636,10 @@ mod tests { let mut subst = Substitution::default(); let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); - assert!(err - .to_string() - .contains("multiple 'g' or numeric flags in substitute command")); + assert!( + err.to_string() + .contains("multiple 'g' or numeric flags in substitute command") + ); } #[test] @@ -1647,9 +1648,10 @@ mod tests { let mut subst = Substitution::default(); let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); - assert!(err - .to_string() - .contains("multiple 'g' or numeric flags in substitute command")); + assert!( + err.to_string() + .contains("multiple 'g' or numeric flags in substitute command") + ); } #[test] @@ -1689,9 +1691,10 @@ mod tests { let mut cmd = Command::default(); let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); - assert!(err - .to_string() - .contains("substitute pattern cannot be delimited")); + assert!( + err.to_string() + .contains("substitute pattern cannot be delimited") + ); } #[test] @@ -1752,9 +1755,10 @@ mod tests { let mut cmd = Command::default(); let err = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap_err(); - assert!(err - .to_string() - .contains("group number \\3 is larger than the 2 available RE groups")); + assert!( + err.to_string() + .contains("group number \\3 is larger than the 2 available RE groups") + ); } // bre_to_ere diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 23964d7f..9b05b985 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -325,7 +325,7 @@ fn process_file( // The compilation should supply only valid codes. _ => panic!("invalid command code"), } // match - // Advance to next command. + // Advance to next command. 
current = command.next.clone(); } From 305aa47d261a255f675756ba9876d61814498e38 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 17:26:08 +0300 Subject: [PATCH 58/85] Remove unsafe warning --- src/uu/sed/src/fast_io.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index c5236cb8..cf86fa8d 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -247,7 +247,9 @@ impl IOChunkContent<'_> { #[cfg(unix)] unsafe fn as_str_unchecked(&self) -> &str { match self { - IOChunkContent::MmapInput { content, .. } => std::str::from_utf8_unchecked(content), + IOChunkContent::MmapInput { content, .. } => unsafe { + std::str::from_utf8_unchecked(content) + }, IOChunkContent::Owned { content, .. } => content, } } From a799a97df223cc679f924cacec9ffdc2ba0fa8a2 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 17:52:17 +0300 Subject: [PATCH 59/85] Partially address \r on Windows (WIP) - Terminate lines with \r. - Remove \r when reading files. Pending: - Add \r when processing continuation lines. - Add \r on the w command and the s w flag. - More ... Some unit tests may have broken due to this. --- src/uu/sed/src/fast_io.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index cf86fa8d..ac3ce38a 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -122,10 +122,14 @@ impl ReadLineCursor { } // O(1) check whether it ended in '\n' let has_newline = self.buffer.ends_with('\n'); - // strip it if you don’t want to expose it to the caller + // Strip it if you don’t want to expose it to the caller. if has_newline { self.buffer.pop(); } + // Also strip \r on Windows. 
+ if cfg!(windows) && self.buffer.ends_with('\r') { + self.buffer.pop(); // remove '\r' + } let line = std::mem::take(&mut self.buffer); let is_last_line = self.reader.fill_buf()?.is_empty(); Ok(Some((line, has_newline, is_last_line))) @@ -499,7 +503,7 @@ impl OutputBuffer { } => { self.out.write_all(content.as_bytes())?; if *has_newline { - self.out.write_all(b"\n")?; + self.out.write_all(b"\r\n")?; } Ok(()) } From 39c9fccd44163cf155e55362460765307c2f3dc4 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 18:39:33 +0300 Subject: [PATCH 60/85] Revert "Partially address \r on Windows (WIP)" This reverts commit a799a97df223cc679f924cacec9ffdc2ba0fa8a2. Dealing with \r on Windows is a wormhole that brings more trouble than benefits. - Producing \r may confuse other tools. - Silently consuming \r is inconsistent. - Adjusting behavior according to input file conventions is messy and likely incorrect when processing hybrid input. --- src/uu/sed/src/fast_io.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index ac3ce38a..cf86fa8d 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -122,14 +122,10 @@ impl ReadLineCursor { } // O(1) check whether it ended in '\n' let has_newline = self.buffer.ends_with('\n'); - // Strip it if you don’t want to expose it to the caller. + // strip it if you don’t want to expose it to the caller if has_newline { self.buffer.pop(); } - // Also strip \r on Windows. 
- if cfg!(windows) && self.buffer.ends_with('\r') { - self.buffer.pop(); // remove '\r' - } let line = std::mem::take(&mut self.buffer); let is_last_line = self.reader.fill_buf()?.is_empty(); Ok(Some((line, has_newline, is_last_line))) @@ -503,7 +499,7 @@ impl OutputBuffer { } => { self.out.write_all(content.as_bytes())?; if *has_newline { - self.out.write_all(b"\r\n")?; + self.out.write_all(b"\n")?; } Ok(()) } From d7320523c80fb0bce20eeab31b597d294d152e36 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 18:42:58 +0300 Subject: [PATCH 61/85] Ensure no auto-crlf on Windows --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06116bfb..171a050c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,9 @@ jobs: os: [ubuntu-latest, macOS-latest, windows-latest] steps: - uses: actions/checkout@v4 + with: + # Force LF line endings + core.autocrlf: false - uses: dtolnay/rust-toolchain@stable - run: cargo test --all From abd54066850818603346d4bd08df4e4f54a3c5aa Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 11 May 2025 18:20:50 +0300 Subject: [PATCH 62/85] See and fix CR files before testing --- .github/workflows/ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 171a050c..220f98c6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,19 @@ jobs: # Force LF line endings core.autocrlf: false - uses: dtolnay/rust-toolchain@stable + - name: File type before conversion + run: file tests/fixtures/sed/*/* + - name: Remove carriage return + shell: bash + run: | + if command -v dos2unix >/dev/null 2>&1; then + echo "dos2unix is available; converting" + dos2unix tests/fixtures/sed/*/* + else + echo "dos2unix is NOT available" + fi + - name: File type after convesion + run: file tests/fixtures/sed/*/* - run: cargo test --all 
coverage: From fa853c027a4dc8a5ad631ac72cbe774062f29ef1 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Tue, 13 May 2025 20:24:56 +0300 Subject: [PATCH 63/85] Implement the transliteration command --- README.md | 3 + src/uu/sed/src/command.rs | 140 +++++++++++++++++++++- src/uu/sed/src/compiler.rs | 89 +++++++++++--- src/uu/sed/src/processor.rs | 31 ++++- tests/by-util/test_sed.rs | 6 + tests/fixtures/sed/output/trans_delimiter | 14 +++ tests/fixtures/sed/output/trans_simple | 14 +++ 7 files changed, 275 insertions(+), 22 deletions(-) create mode 100644 tests/fixtures/sed/output/trans_delimiter create mode 100644 tests/fixtures/sed/output/trans_simple diff --git a/README.md b/README.md index 06c328df..90cadc66 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,9 @@ cargo run --release * Spaces can precede a regular expression modifier. * `I` can be used in as a synonym for the `i` (case insensitive) substitution flag. +* In addition to `\n`, other escape sequences (octal, hex, C) are supported + in the strings of the `y` command. + Under POSIX these yield undefined behavior. ### Supported BSD and GNU extensions * The second address in a range can be specified as a relative address with +N. diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 8b32ef52..f99aa8c0 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -175,11 +175,61 @@ impl Default for Substitution { } } +/// The block of the first and most common Unicode characters: +/// ASCII, Latin Extended, Greek, Cyrillic, Coptic, Arabic, etc. +/// It covers all characters of up to two UTF-8 bytes. We use a fast lookup array for these. 
+const COMMON_UNICODE: usize = 2048; + #[derive(Debug)] /// Transliteration command (y) pub struct Transliteration { - pub byte_table: [u8; 256], // Byte translation table - pub multi_map: HashMap, // Direct mapping from one char to another + fast: [char; COMMON_UNICODE], + slow: HashMap, +} + +impl Default for Transliteration { + /// Create a new Transliteration with identity mapping for the fast-path. + fn default() -> Self { + let mut fast = ['\0'; COMMON_UNICODE]; + for (i, slot) in fast.iter_mut().enumerate() { + *slot = char::from_u32(i as u32).unwrap_or('\0'); + } + Self { + fast, + slow: HashMap::new(), + } + } +} + +impl Transliteration { + /// Create through character mappings from `source` to `target`. + pub fn from_strings(source: &str, target: &str) -> Self { + let mut result = Self::default(); + for (from, to) in source.chars().zip(target.chars()) { + result.insert(from, to); + } + result + } + + /// Set a transliteration mapping from one character to another. + fn insert(&mut self, from: char, to: char) { + let cp = from as usize; + if cp < COMMON_UNICODE { + self.fast[cp] = to; + } else { + self.slow.insert(from, to); + } + } + + /// Look up a character transliteration. 
+ pub fn lookup(&self, ch: char) -> char { + let cp = ch as usize; + if cp < COMMON_UNICODE { + self.fast[cp] + } else { + self.slow.get(&ch).copied().unwrap_or(ch) + } + } } #[derive(Debug)] @@ -365,4 +415,90 @@ mod tests { }; assert_eq!(template.max_group_number(), 0); } + + // Transliteration + // Creation and internal functions + #[test] + fn test_identity_lookup_fast_path() { + let t = Transliteration::default(); + assert_eq!(t.lookup('A'), 'A'); + assert_eq!(t.lookup('z'), 'z'); + assert_eq!(t.lookup('\u{07FF}'), '\u{07FF}'); // highest 2-byte UTF-8 char + } + + #[test] + fn test_identity_lookup_slow_path() { + let t = Transliteration::default(); + assert_eq!(t.lookup('\u{0800}'), '\u{0800}'); // just outside fast path + assert_eq!(t.lookup('\u{1F600}'), '\u{1F600}'); // 😀 + } + + #[test] + fn test_insert_and_lookup_fast_path() { + let mut t = Transliteration::default(); + t.insert('a', 'α'); + t.insert('b', 'β'); + assert_eq!(t.lookup('a'), 'α'); + assert_eq!(t.lookup('b'), 'β'); + assert_eq!(t.lookup('c'), 'c'); // unchanged + } + + #[test] + fn test_insert_and_lookup_slow_path() { + let mut t = Transliteration::default(); + t.insert('🦀', 'c'); // U+1F980 Crab emoji -> 'c' + assert_eq!(t.lookup('🦀'), 'c'); + assert_eq!(t.lookup('🦁'), '🦁'); // unchanged + } + + #[test] + fn test_overwrite_mapping() { + let mut t = Transliteration::default(); + t.insert('x', '1'); + assert_eq!(t.lookup('x'), '1'); + t.insert('x', '2'); + assert_eq!(t.lookup('x'), '2'); + } + + #[test] + fn test_all_fast_path_mapped_to_space() { + let mut t = Transliteration::default(); + for cp in 0..COMMON_UNICODE { + if let Some(ch) = char::from_u32(cp as u32) { + t.insert(ch, ' '); + } + } + assert_eq!(t.lookup('A'), ' '); + assert_eq!(t.lookup('\u{07FF}'), ' '); + } + + // from_strings + fn test_basic_transliteration() { + let t = Transliteration::from_strings("abcδ", "1234"); + + assert_eq!(t.lookup('a'), '1'); + assert_eq!(t.lookup('b'), '2'); + assert_eq!(t.lookup('c'), '3'); + 
assert_eq!(t.lookup('δ'), '4'); + assert_eq!(t.lookup('e'), 'e'); // not mapped, fallback + } + + #[test] + fn test_unicode_slow_path() { + let source = "é漢🦀"; + let target = "e文c"; + let t = Transliteration::from_strings(source, target); + + assert_eq!(t.lookup('é'), 'e'); + assert_eq!(t.lookup('漢'), '文'); + assert_eq!(t.lookup('🦀'), 'c'); + assert_eq!(t.lookup('x'), 'x'); // fast fallback + assert_eq!(t.lookup('文'), '文'); // slow fallback + } + + #[test] + fn test_overwrite_fast_path() { + let t = Transliteration::from_strings("aa", "12"); + assert_eq!(t.lookup('a'), '2'); // last mapping wins + } } diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index d8d3913d..305f248d 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -10,9 +10,11 @@ use crate::command::{ Address, AddressType, AddressValue, Command, CommandData, ProcessingContext, ReplacementPart, - ReplacementTemplate, ScriptValue, Substitution, + ReplacementTemplate, ScriptValue, Substitution, Transliteration, +}; +use crate::delimited_parser::{ + compilation_error, parse_char_escape, parse_regex, parse_transliteration, }; -use crate::delimited_parser::{compilation_error, parse_char_escape, parse_regex}; use crate::named_writer::NamedWriter; use crate::script_char_provider::ScriptCharProvider; use crate::script_line_provider::ScriptLineProvider; @@ -30,18 +32,18 @@ static CMD_MAP: Lazy> = Lazy::new(build_command_map); // Types of command arguments recognized by the parser #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum CommandArgs { - Empty, // d D g G h H l n N p P q x = \0 - Text, // a c i - NonSelect, // ! - Group, // { - EndGroup, // } - Comment, // # - Branch, // b t - Label, // : - ReadFile, // r - WriteFile, // w - Substitute, // s - Translate, // y + Empty, // d D g G h H l n N p P q x = \0 + Text, // a c i + NonSelect, // ! 
+ Group, // { + EndGroup, // } + Comment, // # + Branch, // b t + Label, // : + ReadFile, // r + WriteFile, // w + Substitute, // s + Transliterate, // y } // Command specification @@ -173,7 +175,7 @@ fn build_command_map() -> HashMap { CommandSpec { code: 'y', n_addr: 2, - args: CommandArgs::Translate, + args: CommandArgs::Transliterate, }, CommandSpec { code: '!', @@ -396,7 +398,7 @@ fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UR } /// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. -/// - Translates `\(` and `\)` into `(` and `)` +/// - Replaces `\(` and `\)` with `(` and `)` /// - Escapes ERE-only metacharacters: `+ ? { } | ( )` /// - Leaves all other characters as-is fn bre_to_ere(pattern: &str) -> String { @@ -654,6 +656,53 @@ pub fn compile_subst_command( Ok(ContinueAction::NextLine) } +pub fn compile_trans_command( + lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult { + line.advance(); // move past 'y' + + let delimiter = line.current(); + if delimiter == '\0' || delimiter == '\\' { + return compilation_error( + lines, + line, + "transliteration string cannot be delimited by newline or backslash", + ); + } + + let source = parse_transliteration(lines, line)?; + let target = parse_transliteration(lines, line)?; + if source.chars().count() != target.chars().count() { + return compilation_error( + lines, + line, + "transliteration strings are not the same length", + ); + } + + let transliteration = Box::new(Transliteration::from_strings(&source, &target)); + + line.advance(); // move past last delimiter + if !line.eol() && line.current() == ';' { + line.advance(); + cmd.data = CommandData::Transliteration(transliteration); + return Ok(ContinueAction::NextChar); + } + + if !line.eol() { + return compilation_error( + lines, + line, + format!("extra characters at the end of the {} command", cmd.code), + ); + } + + cmd.data = 
CommandData::Transliteration(transliteration); + Ok(ContinueAction::NextLine) +} + /// Parse the substitution command's optional flags pub fn compile_subst_flags( lines: &ScriptLineProvider, @@ -805,6 +854,10 @@ fn compile_command( // s return compile_subst_command(lines, line, &mut cmd, context); } + CommandArgs::Transliterate => { + // y + return compile_trans_command(lines, line, &mut cmd); + } // TODO CommandArgs::Text => { // a c i } @@ -822,8 +875,6 @@ fn compile_command( } CommandArgs::WriteFile => { // w } - CommandArgs::Translate => { // y - } } Ok(ContinueAction::NextLine) @@ -964,7 +1015,7 @@ mod tests { fn test_lookup_translate_command() { let cmd = lookup_command('y').unwrap(); assert_eq!(cmd.n_addr, 2); - assert_eq!(cmd.args, CommandArgs::Translate); + assert_eq!(cmd.args, CommandArgs::Transliterate); } #[test] diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 9b05b985..99ae6356 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -10,6 +10,7 @@ use crate::command::{ Address, AddressType, AddressValue, Command, CommandData, ProcessingContext, Substitution, + Transliteration, }; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; @@ -212,6 +213,29 @@ fn substitute( Ok(()) } +/// Apply the specified transliteration in the provided pattern space. +fn transliterate(pattern: &mut IOChunk, trans: &Transliteration) -> UResult<()> { + let text = pattern.try_as_str()?; + let mut result = String::with_capacity(text.len()); + let mut replaced = false; + + // Perform the transliteration. + for ch in text.chars() { + let mapped = trans.lookup(ch); + if mapped != ch { + replaced = true; + } + result.push(mapped); + } + + // Lazy replace. 
+ if replaced { + pattern.set_to_string(result, true); + } + + Ok(()) +} + /// Process a single input file fn process_file( commands: &Option>>, @@ -311,7 +335,12 @@ fn process_file( // TODO } 'y' => { - // TODO + let trans = match &mut command.data { + CommandData::Transliteration(trans) => trans, + _ => panic!("Expected Transliteration command data"), + }; + + transliterate(&mut pattern, trans)?; } ':' => { // TODO diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 31b68e63..ac58f7ba 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -240,3 +240,9 @@ fn subst_write_file() -> std::io::Result<()> { Ok(()) } + +check_output!(trans_simple, ["-e", r"y/0123456789/9876543210/", LINES1]); +check_output!( + trans_delimiter, + ["-e", r"y10\123456789198765432\101", LINES1] +); diff --git a/tests/fixtures/sed/output/trans_delimiter b/tests/fixtures/sed/output/trans_delimiter new file mode 100644 index 00000000..48646d1a --- /dev/null +++ b/tests/fixtures/sed/output/trans_delimiter @@ -0,0 +1,14 @@ +l8_8 +l8_7 +l8_6 +l8_5 +l8_4 +l8_3 +l8_2 +l8_1 +l8_0 +l8_89 +l8_88 +l8_87 +l8_86 +l8_85 diff --git a/tests/fixtures/sed/output/trans_simple b/tests/fixtures/sed/output/trans_simple new file mode 100644 index 00000000..48646d1a --- /dev/null +++ b/tests/fixtures/sed/output/trans_simple @@ -0,0 +1,14 @@ +l8_8 +l8_7 +l8_6 +l8_5 +l8_4 +l8_3 +l8_2 +l8_1 +l8_0 +l8_89 +l8_88 +l8_87 +l8_86 +l8_85 From 50b70910997b16e2f21047c88f63edbc4911e1d5 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 14 May 2025 13:33:00 +0300 Subject: [PATCH 64/85] Fix handling of files ending without a newline --- src/uu/sed/src/fast_io.rs | 57 ++++++++++++++++++++- src/uu/sed/src/processor.rs | 4 +- tests/by-util/test_sed.rs | 3 ++ tests/fixtures/sed/output/subst_no_new_line | 1 + tests/fixtures/sed/output/trans_no_new_line | 1 + 5 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 tests/fixtures/sed/output/subst_no_new_line create mode 
100644 tests/fixtures/sed/output/trans_no_new_line diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index cf86fa8d..f946f4cb 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -167,11 +167,25 @@ impl<'a> IOChunk<'a> { } } + /// Return true if the content ends with a newline. + pub fn is_newline_terminated(&self) -> bool { + match &self.content { + IOChunkContent::Owned { has_newline, .. } => *has_newline, + #[cfg(unix)] + IOChunkContent::MmapInput { full_span, .. } => { + if let Some(&last) = full_span.last() { + last == b'\n' + } else { + false + } + } + } + } + /// Set the object's contents to the specified string. - /// Convert it into Owned if needed. + /// Convert it into Owned if needed. pub fn set_to_string(&mut self, new_content: String, add_newline: bool) { self.utf8_verified = true; - // TODO: Default newline to true and remove argumnt if always true. match &mut self.content { IOChunkContent::Owned { content, @@ -978,4 +992,43 @@ mod tests { Ok(()) } + + #[test] + fn test_owned_newline_terminated() { + let chunk = IOChunk::from_content(IOChunkContent::new_owned("line".to_string(), true)); + assert!(chunk.is_newline_terminated()); + } + + #[test] + fn test_owned_not_newline_terminated() { + let chunk = IOChunk::from_content(IOChunkContent::new_owned("line".to_string(), false)); + assert!(!chunk.is_newline_terminated()); + } + + #[cfg(unix)] + #[test] + fn test_mmap_newline_terminated() { + let content = b"line"; + let full_span = b"line\n"; + let chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + assert!(chunk.is_newline_terminated()); + } + + #[cfg(unix)] + #[test] + fn test_mmap_not_newline_terminated() { + let content = b"line"; + let full_span = b"line"; + let chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + assert!(!chunk.is_newline_terminated()); + } + + #[cfg(unix)] + #[test] + fn test_mmap_empty() { + let content = b""; + let full_span = b""; 
+ let chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + assert!(!chunk.is_newline_terminated()); + } } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 99ae6356..21f68327 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -198,7 +198,7 @@ fn substitute( if replaced { result.push_str(&text[last_end..]); - pattern.set_to_string(result, true); + pattern.set_to_string(result, pattern.is_newline_terminated()); if sub.print_flag { write_chunk(output, context, pattern)?; @@ -230,7 +230,7 @@ fn transliterate(pattern: &mut IOChunk, trans: &Transliteration) -> UResult<()> // Lazy replace. if replaced { - pattern.set_to_string(result, true); + pattern.set_to_string(result, pattern.is_newline_terminated()); } Ok(()) diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index ac58f7ba..99bfcd96 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -129,6 +129,7 @@ macro_rules! check_output { // Input files const LINES1: &str = "input/lines1"; const LINES2: &str = "input/lines2"; +const NO_NEW_LINE: &str = "input/no-new-line.txt"; // Test address ranges check_output!(addr_one_line, ["-n", "-e", "4p", LINES1]); @@ -223,6 +224,7 @@ check_output!(subst_multiline, ["-e", "s/_/u0\\\nu1\\\nu2/g", LINES1]); check_output!(subst_numbered_replacement, ["-e", r"s/./X/4", LINES1]); check_output!(subst_brace, ["-e", r"s/[123]/X/g", LINES1]); check_output!(subst_case_insensitive, ["-e", r"s/L/Line/", LINES1]); +check_output!(subst_no_new_line, ["-e", r"s/l/L/g", NO_NEW_LINE]); #[test] fn subst_write_file() -> std::io::Result<()> { @@ -246,3 +248,4 @@ check_output!( trans_delimiter, ["-e", r"y10\123456789198765432\101", LINES1] ); +check_output!(trans_no_new_line, ["-e", r"y/l/L/", NO_NEW_LINE]); diff --git a/tests/fixtures/sed/output/subst_no_new_line b/tests/fixtures/sed/output/subst_no_new_line new file mode 100644 index 00000000..accf6276 --- /dev/null +++ 
b/tests/fixtures/sed/output/subst_no_new_line @@ -0,0 +1 @@ +HeLLo \ No newline at end of file diff --git a/tests/fixtures/sed/output/trans_no_new_line b/tests/fixtures/sed/output/trans_no_new_line new file mode 100644 index 00000000..accf6276 --- /dev/null +++ b/tests/fixtures/sed/output/trans_no_new_line @@ -0,0 +1 @@ +HeLLo \ No newline at end of file From 8f1e6e152bb70a9468ab2d4ed71cf2d75ccdf011 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 14 May 2025 14:45:38 +0300 Subject: [PATCH 65/85] Add ensure_owned and as_mut_string methods --- src/uu/sed/src/fast_io.rs | 176 +++++++++++++++++++++++++++++++++++++- 1 file changed, 175 insertions(+), 1 deletion(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index f946f4cb..e85b8bcf 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -202,7 +202,7 @@ impl<'a> IOChunk<'a> { } } - /// Return the content as a string. + /// Return the content as a str. pub fn try_as_str(&mut self) -> Result<&str, Box> { match &self.content { #[cfg(unix)] @@ -219,6 +219,38 @@ impl<'a> IOChunk<'a> { IOChunkContent::Owned { content, .. } => Ok(content), } } + + /// Convert content to the Owned variant if it's not already. + /// Fails if the conversion to UTF-8 fails. + pub fn ensure_owned(&mut self) -> Result<(), Box> { + match &self.content { + IOChunkContent::Owned { .. } => Ok(()), // already owned + #[cfg(unix)] + IOChunkContent::MmapInput { content, full_span } => { + match std::str::from_utf8(content) { + Ok(valid_str) => { + let has_newline = full_span.last().copied() == Some(b'\n'); + self.content = + IOChunkContent::new_owned(valid_str.to_string(), has_newline); + self.utf8_verified = true; + Ok(()) + } + Err(e) => Err(USimpleError::new(1, e.to_string())), + } + } + } + } + + /// Return mutable access to the content as a String. 
+ pub fn as_mut_string(&mut self) -> Result<&mut String, Box> { + self.ensure_owned()?; + + match &mut self.content { + IOChunkContent::Owned { content, .. } => Ok(content), + #[allow(unreachable_patterns)] + _ => unreachable!("ensure_owned should convert to Owned"), + } + } } /// Data to be written to a file. It can come from the mmapped @@ -993,6 +1025,7 @@ mod tests { Ok(()) } + // is_newline_terminated #[test] fn test_owned_newline_terminated() { let chunk = IOChunk::from_content(IOChunkContent::new_owned("line".to_string(), true)); @@ -1031,4 +1064,145 @@ mod tests { let chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); assert!(!chunk.is_newline_terminated()); } + + // ensure_owned() + #[test] + fn test_ensure_owned_on_owned() { + let mut chunk = + IOChunk::from_content(IOChunkContent::new_owned("already owned".to_string(), true)); + + let result = chunk.ensure_owned(); + assert!(result.is_ok()); + + // Content must be unchanged + match &chunk.content { + IOChunkContent::Owned { + content, + has_newline, + .. + } => { + assert_eq!(content, "already owned"); + assert!(*has_newline); + } + #[cfg(unix)] + _ => panic!("Expected Owned variant"), + } + } + + #[cfg(unix)] + #[test] + fn test_ensure_owned_on_mmap_valid_utf8() { + let content = b"mmap string"; + let full_span = b"mmap string\n"; + + let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + let result = chunk.ensure_owned(); + assert!(result.is_ok()); + + match &chunk.content { + IOChunkContent::Owned { + content, + has_newline, + .. 
+ } => { + assert_eq!(content, "mmap string"); + assert!(*has_newline); + } + _ => panic!("Expected Owned variant after ensure_owned"), + } + } + + #[cfg(unix)] + #[test] + fn test_ensure_owned_on_mmap_valid_utf8_no_newline() { + let content = b"no newline"; + let full_span = b"no newline"; + + let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + let result = chunk.ensure_owned(); + assert!(result.is_ok()); + + match &chunk.content { + IOChunkContent::Owned { + content, + has_newline, + .. + } => { + assert_eq!(content, "no newline"); + assert!(!*has_newline); + } + _ => panic!("Expected Owned variant after ensure_owned"), + } + } + + #[cfg(unix)] + #[test] + fn test_ensure_owned_on_mmap_invalid_utf8() { + let content = b"bad\xFFutf8"; + let full_span = b"bad\xFFutf8\n"; + + let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + let result = chunk.ensure_owned(); + assert!(result.is_err()); + let err_msg = format!("{}", result.unwrap_err()); + assert!( + err_msg.contains("invalid utf-8"), + "Unexpected error message: {}", + err_msg + ); + } + + // as_mut_string + #[test] + fn test_as_mut_string_on_owned() { + let mut chunk = + IOChunk::from_content(IOChunkContent::new_owned("hello".to_string(), false)); + + let s = chunk.as_mut_string().unwrap(); + s.push_str(" world"); + + assert_eq!(chunk.try_as_str().unwrap(), "hello world"); + } + + #[cfg(unix)] + #[test] + fn test_as_mut_string_on_mmap_input_valid_utf8() { + let content = b"foo"; + let full_span = b"foo\n"; + let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + { + let s = chunk.as_mut_string().unwrap(); + s.push_str("bar"); + } + + assert_eq!(chunk.try_as_str().unwrap(), "foobar"); + } + + #[cfg(unix)] + #[test] + fn test_as_mut_string_on_utf8_multibyte() { + let content = "λινe".as_bytes(); + let full_span = "λινe\n".as_bytes(); + let mut chunk = 
IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + chunk.as_mut_string().unwrap().push_str(" Δεδομένα"); + + assert_eq!(chunk.try_as_str().unwrap(), "λινe Δεδομένα"); + } + + #[cfg(unix)] + #[test] + fn test_as_mut_string_invalid_utf8() { + let content = b"abc\xFF"; // invalid UTF-8 + let full_span = b"abc\xFF\n"; + let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); + + let result = chunk.as_mut_string(); + assert!(result.is_err()); + assert!(format!("{}", result.unwrap_err()).contains("invalid utf-8")); + } } From 063ac983fe8f38e4f4ac3bdded69d6aa596234bf Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Wed, 14 May 2025 18:50:55 +0300 Subject: [PATCH 66/85] Implement the N and P commands --- src/uu/sed/src/command.rs | 13 +++++ src/uu/sed/src/compiler.rs | 2 +- src/uu/sed/src/processor.rs | 51 ++++++++++++++++--- src/uu/sed/src/sed.rs | 1 + tests/by-util/test_sed.rs | 15 ++++++ tests/fixtures/sed/output/pattern_next_print | 4 ++ tests/fixtures/sed/output/print_to_newline | 4 ++ tests/fixtures/sed/output/subst_newline_class | 1 + tests/fixtures/sed/output/subst_newline_re | 1 + tests/fixtures/sed/output/trans_newline | 13 +++++ 10 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 tests/fixtures/sed/output/pattern_next_print create mode 100644 tests/fixtures/sed/output/print_to_newline create mode 100644 tests/fixtures/sed/output/subst_newline_class create mode 100644 tests/fixtures/sed/output/subst_newline_re create mode 100644 tests/fixtures/sed/output/trans_newline diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index f99aa8c0..f4b73be0 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -51,6 +51,10 @@ pub struct ProcessingContext { pub last_file: bool, /// Previously compiled RE, saved for reuse when specifying an empty RE pub saved_regex: RefCell>, + /// Modification of input processing action + // This is required to avoid doubly 
borrowing the reader in the 'N' + // command. + pub input_action: Option, } #[derive(Debug, PartialEq)] @@ -310,6 +314,15 @@ pub struct Space { pub backup: String, // Backing memory } +#[derive(Debug, Clone)] +/// Action to execute after reading a new input line +pub struct InputAction { + /// Next command to execute (rather than commands from start) + pub next_command: Option>>, + /// Data to prepend to the read contents + pub prepend: String, +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 305f248d..06449321 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -501,7 +501,7 @@ pub fn compile_replacement( '\\' => { line.advance(); - // Line continuation + // Line input_action if line.eol() { if let Some(next_line_string) = lines.next_line()? { literal.push('\n'); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 21f68327..83702133 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -9,8 +9,8 @@ // file that was distributed with this source code. use crate::command::{ - Address, AddressType, AddressValue, Command, CommandData, ProcessingContext, Substitution, - Transliteration, + Address, AddressType, AddressValue, Command, CommandData, InputAction, ProcessingContext, + Substitution, Transliteration, }; use crate::fast_io::{IOChunk, LineReader, OutputBuffer}; use crate::in_place::InPlace; @@ -243,10 +243,28 @@ fn process_file( output: &mut OutputBuffer, context: &mut ProcessingContext, ) -> UResult<()> { + // Loop over the input lines while let Some((mut pattern, last_line)) = reader.get_line()? { context.last_line = last_line; context.line_number += 1; - let mut current: Option>> = commands.clone(); + + // Set the script command from which to start. + let mut current: Option>> = + if let Some(action) = context.input_action.take() { + // Continue processing the `N` command. 
+ let current_line = pattern.try_as_str()?; + let mut combined_lines = action.prepend; + combined_lines.push('\n'); + combined_lines.push_str(current_line); + + pattern.set_to_string(combined_lines, pattern.is_newline_terminated()); + action.next_command + } else { + // Start from the script top. + commands.clone() + }; + + // Loop over script commands. while let Some(command_rc) = current { let mut command = command_rc.borrow_mut(); @@ -256,7 +274,6 @@ fn process_file( continue; } - // TODO: continue if command doesn't apply match command.code { '{' => { current = Some(command.data.get_subcommand()); @@ -302,14 +319,32 @@ fn process_file( // TODO } 'N' => { - // TODO + // Append to pattern `\n` and the next line + // Rather than reading input here, which would result + // in a double borrow on reader, modify the action + // to perform when the next line is read. + context.input_action = Some(InputAction { + next_command: command.next.clone(), + prepend: pattern.try_as_str()?.to_string(), + }); + break; } 'p' => { // Write the pattern space to standard output. write_chunk(output, context, &pattern)?; } 'P' => { - // TODO + // Output pattern space, up to the first \n. 
+ let line = pattern.try_as_str()?; + match line.find('\n') { + Some(pos) => { + output.write_str(&line[..=pos])?; + } + None => { + output.write_str(line)?; + output.write_str("\n")?; + } + } } 'q' => { // TODO @@ -346,7 +381,7 @@ fn process_file( // TODO } '}' => { - // TODO + // Nothing to do here } '=' => { // TODO @@ -358,7 +393,7 @@ fn process_file( current = command.next.clone(); } - if !context.quiet { + if !context.quiet && context.input_action.is_none() { write_chunk(output, context, &pattern)?; } } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 76b907c2..d16dd6df 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -201,6 +201,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { last_line: false, last_file: false, saved_regex: const { RefCell::new(None) }, + input_action: None, } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 99bfcd96..55fba115 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -249,3 +249,18 @@ check_output!( ["-e", r"y10\123456789198765432\101", LINES1] ); check_output!(trans_no_new_line, ["-e", r"y/l/L/", NO_NEW_LINE]); +check_output!(trans_newline, ["-e", r"1N;2y/\n/X/", LINES1]); + +// TODO: Enable when "{}" is implemented. +#[cfg(any())] +check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); + +// TODO: Enable when "{}" is implemented. +#[cfg(any())] +check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); + +// TODO: Enable when "{}" is implemented. 
+#[cfg(any())] +check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); + +check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); diff --git a/tests/fixtures/sed/output/pattern_next_print b/tests/fixtures/sed/output/pattern_next_print new file mode 100644 index 00000000..8cf6fd76 --- /dev/null +++ b/tests/fixtures/sed/output/pattern_next_print @@ -0,0 +1,4 @@ +l1_1 +l1_4 +l1_7 +l1_10 diff --git a/tests/fixtures/sed/output/print_to_newline b/tests/fixtures/sed/output/print_to_newline new file mode 100644 index 00000000..67a31ba0 --- /dev/null +++ b/tests/fixtures/sed/output/print_to_newline @@ -0,0 +1,4 @@ +l1_1 +l1_1 +l1_1 +l1_2 diff --git a/tests/fixtures/sed/output/subst_newline_class b/tests/fixtures/sed/output/subst_newline_class new file mode 100644 index 00000000..0c5c10a5 --- /dev/null +++ b/tests/fixtures/sed/output/subst_newline_class @@ -0,0 +1 @@ +l1_1Xl1_2 diff --git a/tests/fixtures/sed/output/subst_newline_re b/tests/fixtures/sed/output/subst_newline_re new file mode 100644 index 00000000..0c5c10a5 --- /dev/null +++ b/tests/fixtures/sed/output/subst_newline_re @@ -0,0 +1 @@ +l1_1Xl1_2 diff --git a/tests/fixtures/sed/output/trans_newline b/tests/fixtures/sed/output/trans_newline new file mode 100644 index 00000000..f414bb53 --- /dev/null +++ b/tests/fixtures/sed/output/trans_newline @@ -0,0 +1,13 @@ +l1_1Xl1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 From 95abc508cff9181f203ee241b91c5c03d73b56f8 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 15 May 2025 17:55:08 +0300 Subject: [PATCH 67/85] Implement the D command --- src/uu/sed/src/processor.rs | 13 ++++++++++++- tests/by-util/test_sed.rs | 2 ++ tests/fixtures/sed/output/pattern_delete_no_newline | 13 +++++++++++++ tests/fixtures/sed/output/pattern_delete_to_newline | 0 4 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/sed/output/pattern_delete_no_newline create mode 100644 
tests/fixtures/sed/output/pattern_delete_to_newline diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 83702133..4fb3574b 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -295,7 +295,18 @@ fn process_file( break; } 'D' => { - // TODO + // Delete up to \n and start a new cycle without new input. + if let Some(pos) = pattern.try_as_str()?.find('\n') { + // Clone just the needed slice before mutating pattern. + let tail = pattern.try_as_str()?[pos + 1..].to_string(); + pattern.set_to_string(tail, pattern.is_newline_terminated()); + current = commands.clone(); + continue; + } else { + // Same as d + pattern.clear(); + break; + } } 'g' => { // TODO diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 55fba115..2f33de93 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -264,3 +264,5 @@ check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); +check_output!(pattern_delete_to_newline, ["-n", r"N;N;N;D", LINES1]); +check_output!(pattern_delete_no_newline, ["-e", r"2D", LINES1]); diff --git a/tests/fixtures/sed/output/pattern_delete_no_newline b/tests/fixtures/sed/output/pattern_delete_no_newline new file mode 100644 index 00000000..8c5285bb --- /dev/null +++ b/tests/fixtures/sed/output/pattern_delete_no_newline @@ -0,0 +1,13 @@ +l1_1 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_delete_to_newline b/tests/fixtures/sed/output/pattern_delete_to_newline new file mode 100644 index 00000000..e69de29b From 3759c2c494cd282b2aeb4b0b948ca7df53f17021 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 15 May 2025 18:27:45 +0300 Subject: [PATCH 68/85] Refactor duplicated command ending parsing --- src/uu/sed/src/compiler.rs | 79 
+++++++++++++++----------------------- 1 file changed, 30 insertions(+), 49 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 06449321..c9457c9c 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -397,6 +397,28 @@ fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UR .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) } +/// Parse the end of a command, returning the appropriate ContinueAction +fn parse_command_ending( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult { + if !line.eol() && line.current() == ';' { + line.advance(); + return Ok(ContinueAction::NextChar); + } + + if !line.eol() { + return compilation_error( + lines, + line, + format!("extra characters at the end of the {} command", cmd.code), + ); + } + + Ok(ContinueAction::NextLine) +} + /// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. /// - Replacces `\(` and `\)` with `(` and `)` /// - Escapes ERE-only metacharacters: `+ ? 
{ } | ( )` @@ -586,7 +608,7 @@ pub fn compile_replacement( } } -pub fn compile_subst_command( +fn compile_subst_command( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, @@ -637,26 +659,12 @@ pub fn compile_subst_command( ); } - line.eat_spaces(); - if !line.eol() && line.current() == ';' { - line.advance(); - cmd.data = CommandData::Substitution(subst); - return Ok(ContinueAction::NextChar); - } - - if !line.eol() { - return compilation_error( - lines, - line, - format!("extra characters at the end of the {} command", cmd.code), - ); - } - cmd.data = CommandData::Substitution(subst); - Ok(ContinueAction::NextLine) + + parse_command_ending(lines, line, cmd) } -pub fn compile_trans_command( +fn compile_trans_command( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, @@ -683,24 +691,10 @@ pub fn compile_trans_command( } let transliteration = Box::new(Transliteration::from_strings(&source, &target)); + cmd.data = CommandData::Transliteration(transliteration); line.advance(); // move past last delimiter - if !line.eol() && line.current() == ';' { - line.advance(); - cmd.data = CommandData::Transliteration(transliteration); - return Ok(ContinueAction::NextChar); - } - - if !line.eol() { - return compilation_error( - lines, - line, - format!("extra characters at the end of the {} command", cmd.code), - ); - } - - cmd.data = CommandData::Transliteration(transliteration); - Ok(ContinueAction::NextLine) + parse_command_ending(lines, line, cmd) } /// Parse the substitution command's optional flags @@ -807,7 +801,7 @@ pub fn compile_subst_flags( /// Compile a command that doesn't take any arguments // Handles d D g G h H l n N p P q x = -pub fn compile_empty_command( +fn compile_empty_command( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, @@ -815,20 +809,7 @@ pub fn compile_empty_command( line.advance(); // Skip the command character line.eat_spaces(); // Skip any trailing 
whitespace - if !line.eol() && line.current() == ';' { - line.advance(); - return Ok(ContinueAction::NextChar); - } - - if !line.eol() { - return compilation_error( - lines, - line, - format!("extra characters at the end of the {} command", cmd.code), - ); - } - - Ok(ContinueAction::NextLine) + parse_command_ending(lines, line, cmd) } // Compile the specified command From edd215bffae1ad7a76f91f54addd105931c492a2 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Thu, 15 May 2025 21:37:23 +0300 Subject: [PATCH 69/85] Optimize D implementation --- src/uu/sed/src/processor.rs | 5 ++--- tests/by-util/test_sed.rs | 2 +- tests/fixtures/sed/output/pattern_delete_to_newline | 3 +++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 4fb3574b..37000023 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -297,9 +297,8 @@ fn process_file( 'D' => { // Delete up to \n and start a new cycle without new input. if let Some(pos) = pattern.try_as_str()?.find('\n') { - // Clone just the needed slice before mutating pattern. 
- let tail = pattern.try_as_str()?[pos + 1..].to_string(); - pattern.set_to_string(tail, pattern.is_newline_terminated()); + let s = pattern.as_mut_string().unwrap(); + s.drain(..=pos); current = commands.clone(); continue; } else { diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 2f33de93..86a7fdb5 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -264,5 +264,5 @@ check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); -check_output!(pattern_delete_to_newline, ["-n", r"N;N;N;D", LINES1]); +check_output!(pattern_delete_to_newline, ["-n", r"2N;3p;3D;3p", LINES1]); check_output!(pattern_delete_no_newline, ["-e", r"2D", LINES1]); diff --git a/tests/fixtures/sed/output/pattern_delete_to_newline b/tests/fixtures/sed/output/pattern_delete_to_newline index e69de29b..b4b743ef 100644 --- a/tests/fixtures/sed/output/pattern_delete_to_newline +++ b/tests/fixtures/sed/output/pattern_delete_to_newline @@ -0,0 +1,3 @@ +l1_2 +l1_3 +l1_3 From bb864d446d3fc8a65d374b3ed8c30e2949e0943d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 12:23:36 +0300 Subject: [PATCH 70/85] Implement g G h H n N q x single-letter commands Also fix the N command. 
--- src/uu/sed/src/command.rs | 11 ++++ src/uu/sed/src/fast_io.rs | 35 +++++----- src/uu/sed/src/processor.rs | 66 +++++++++++++++---- src/uu/sed/src/sed.rs | 4 +- tests/by-util/test_sed.rs | 20 ++++++ .../fixtures/sed/output/pattern_append_delete | 10 +++ .../output/pattern_append_delete2_separate | 17 +++++ .../sed/output/pattern_append_delete_2 | 16 +++++ .../fixtures/sed/output/pattern_delete_print | 13 ++++ .../sed/output/pattern_hold_append_swap | 20 ++++++ .../sed/output/pattern_next_no_output | 0 tests/fixtures/sed/output/pattern_next_output | 14 ++++ .../sed/output/pattern_next_print_no_output | 13 ++++ .../sed/output/pattern_next_print_output | 27 ++++++++ tests/fixtures/sed/output/pattern_quit | 5 ++ tests/fixtures/sed/output/pattern_quit_2 | 5 ++ 16 files changed, 249 insertions(+), 27 deletions(-) create mode 100644 tests/fixtures/sed/output/pattern_append_delete create mode 100644 tests/fixtures/sed/output/pattern_append_delete2_separate create mode 100644 tests/fixtures/sed/output/pattern_append_delete_2 create mode 100644 tests/fixtures/sed/output/pattern_delete_print create mode 100644 tests/fixtures/sed/output/pattern_hold_append_swap create mode 100644 tests/fixtures/sed/output/pattern_next_no_output create mode 100644 tests/fixtures/sed/output/pattern_next_output create mode 100644 tests/fixtures/sed/output/pattern_next_print_no_output create mode 100644 tests/fixtures/sed/output/pattern_next_print_output create mode 100644 tests/fixtures/sed/output/pattern_quit create mode 100644 tests/fixtures/sed/output/pattern_quit_2 diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index f4b73be0..acdb40ee 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -49,12 +49,23 @@ pub struct ProcessingContext { pub last_line: bool, /// True if the file is the last file of the ones specified pub last_file: bool, + /// Stop processing further input. 
+ pub stop_processing: bool, /// Previously compiled RE, saved for reuse when specifying an empty RE pub saved_regex: RefCell>, /// Modification of input processing action // This is required to avoid doubly borrowing the reader in the 'N' // command. pub input_action: Option, + /// Hold space + pub hold: StringSpace, +} + +#[derive(Clone, Debug, Default, PartialEq)] +/// A space mirroring IOChunk, but only with a String +pub struct StringSpace { + pub content: String, // Line content without newline + pub has_newline: bool, // True if \n-terminated } #[derive(Debug, PartialEq)] diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index e85b8bcf..cf12e655 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -241,12 +241,16 @@ impl<'a> IOChunk<'a> { } } - /// Return mutable access to the content as a String. - pub fn as_mut_string(&mut self) -> Result<&mut String, Box> { + /// Return mutable access to the content and has_newline fields. + pub fn fields_mut(&mut self) -> Result<(&mut String, &mut bool), Box> { self.ensure_owned()?; match &mut self.content { - IOChunkContent::Owned { content, .. } => Ok(content), + IOChunkContent::Owned { + content, + has_newline, + .. 
+ } => Ok((content, has_newline)), #[allow(unreachable_patterns)] _ => unreachable!("ensure_owned should convert to Owned"), } @@ -1155,13 +1159,13 @@ mod tests { ); } - // as_mut_string + // fields_mut #[test] - fn test_as_mut_string_on_owned() { + fn test_fields_mut_on_owned() { let mut chunk = IOChunk::from_content(IOChunkContent::new_owned("hello".to_string(), false)); - let s = chunk.as_mut_string().unwrap(); + let (s, _) = chunk.fields_mut().unwrap(); s.push_str(" world"); assert_eq!(chunk.try_as_str().unwrap(), "hello world"); @@ -1169,13 +1173,13 @@ mod tests { #[cfg(unix)] #[test] - fn test_as_mut_string_on_mmap_input_valid_utf8() { + fn test_fields_mut_on_mmap_input_valid_utf8() { let content = b"foo"; let full_span = b"foo\n"; let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); { - let s = chunk.as_mut_string().unwrap(); + let (s, _) = chunk.fields_mut().unwrap(); s.push_str("bar"); } @@ -1184,24 +1188,25 @@ mod tests { #[cfg(unix)] #[test] - fn test_as_mut_string_on_utf8_multibyte() { - let content = "λινe".as_bytes(); - let full_span = "λινe\n".as_bytes(); + fn test_fields_mut_on_utf8_multibyte() { + let content = "Ζωντανά!".as_bytes(); + let full_span = "Ζωντανά!\n".as_bytes(); let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); - chunk.as_mut_string().unwrap().push_str(" Δεδομένα"); + let (s, _) = chunk.fields_mut().unwrap(); + s.push_str(" Δεδομένα"); - assert_eq!(chunk.try_as_str().unwrap(), "λινe Δεδομένα"); + assert_eq!(chunk.try_as_str().unwrap(), "Ζωντανά! 
Δεδομένα"); } #[cfg(unix)] #[test] - fn test_as_mut_string_invalid_utf8() { + fn test_fields_mut_invalid_utf8() { let content = b"abc\xFF"; // invalid UTF-8 let full_span = b"abc\xFF\n"; let mut chunk = IOChunk::from_content(IOChunkContent::MmapInput { content, full_span }); - let result = chunk.as_mut_string(); + let result = chunk.fields_mut(); assert!(result.is_err()); assert!(format!("{}", result.unwrap_err()).contains("invalid utf-8")); } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 37000023..9f50b229 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -244,7 +244,7 @@ fn process_file( context: &mut ProcessingContext, ) -> UResult<()> { // Loop over the input lines - while let Some((mut pattern, last_line)) = reader.get_line()? { + 'lines: while let Some((mut pattern, last_line)) = reader.get_line()? { context.last_line = last_line; context.line_number += 1; @@ -297,7 +297,7 @@ fn process_file( 'D' => { // Delete up to \n and start a new cycle without new input. if let Some(pos) = pattern.try_as_str()?.find('\n') { - let s = pattern.as_mut_string().unwrap(); + let (s, _) = pattern.fields_mut()?; s.drain(..=pos); current = commands.clone(); continue; @@ -308,16 +308,26 @@ fn process_file( } } 'g' => { - // TODO + // Replace pattern with the contents of the hold space. + pattern.set_to_string(context.hold.content.clone(), context.hold.has_newline); } 'G' => { - // TODO + // Append to pattern \n followed by hold space contents. + let (pat_content, pat_has_newline) = pattern.fields_mut()?; + pat_content.push('\n'); + pat_content.push_str(&context.hold.content); + *pat_has_newline = context.hold.has_newline; } 'h' => { - // TODO + // Replace hold with the contents of the pattern space. + context.hold.content = pattern.try_as_str()?.to_string(); + context.hold.has_newline = pattern.is_newline_terminated(); } 'H' => { - // TODO + // Append to hold \n followed by pattern space contents. 
+ context.hold.content.push('\n'); + context.hold.content.push_str(pattern.try_as_str()?); + context.hold.has_newline = pattern.is_newline_terminated(); } 'i' => { // TODO @@ -326,7 +336,7 @@ fn process_file( // TODO } 'n' => { - // TODO + break; } 'N' => { // Append to pattern `\n` and the next line @@ -337,7 +347,7 @@ fn process_file( next_command: command.next.clone(), prepend: pattern.try_as_str()?.to_string(), }); - break; + continue 'lines; } 'p' => { // Write the pattern space to standard output. @@ -357,7 +367,8 @@ fn process_file( } } 'q' => { - // TODO + context.stop_processing = true; + break; } 'r' => { // TODO @@ -377,7 +388,10 @@ fn process_file( // TODO } 'x' => { - // TODO + // Exchange the contents of the pattern and hold spaces. + let (pat_content, pat_has_newline) = pattern.fields_mut()?; + std::mem::swap(pat_content, &mut context.hold.content); + std::mem::swap(pat_has_newline, &mut context.hold.has_newline); } 'y' => { let trans = match &mut command.data { @@ -403,10 +417,27 @@ fn process_file( current = command.next.clone(); } - if !context.quiet && context.input_action.is_none() { + if !context.quiet { write_chunk(output, context, &pattern)?; } + + if context.stop_processing { + break; + } } + + // Handle any N command remains. + if context.separate && !context.quiet { + if let Some(action) = context.input_action.take() { + let mut pending = action.prepend; + pending.push('\n'); + output.write_str(pending)?; + if context.unbuffered { + output.flush()?; + } + } + } + Ok(()) } @@ -436,7 +467,20 @@ pub fn process_all_files( } process_file(&commands, &mut reader, output, &mut context)?; + // Handle any N command remains. 
+ if context.last_file && !context.separate && !context.quiet { + if let Some(action) = context.input_action.take() { + let mut pending = action.prepend; + pending.push('\n'); + output.write_str(pending)?; + } + } + in_place.end()?; + + if context.stop_processing { + break; + } } // Flush all output files diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index d16dd6df..577a435c 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -18,7 +18,7 @@ pub mod processor; pub mod script_char_provider; pub mod script_line_provider; -use crate::command::{ProcessingContext, ScriptValue}; +use crate::command::{ProcessingContext, ScriptValue, StringSpace}; use crate::compiler::compile; use crate::processor::process_all_files; use clap::{Arg, ArgMatches, Command, arg}; @@ -200,8 +200,10 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { last_address: false, last_line: false, last_file: false, + stop_processing: false, saved_regex: const { RefCell::new(None) }, input_action: None, + hold: StringSpace::default(), } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 86a7fdb5..9f101ff4 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -266,3 +266,23 @@ check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); check_output!(pattern_delete_to_newline, ["-n", r"2N;3p;3D;3p", LINES1]); check_output!(pattern_delete_no_newline, ["-e", r"2D", LINES1]); +check_output!(pattern_delete_print, ["-n", r"4d;p", LINES1]); + +// FreeBSD sed does not produce any output for the following two +check_output!(pattern_append_delete, ["-e", r"N;N;N;D", LINES1]); +check_output!(pattern_append_delete_2, ["-e", r"N;N;N;D", LINES1, LINES2]); + +check_output!( + pattern_append_delete2_separate, + ["-s", r"N;N;N;D", LINES1, LINES2] +); +check_output!( + pattern_hold_append_swap, + ["-e", r"2h;3H;4g;5G;6x;6p;6x;6p", LINES1] +); 
+check_output!(pattern_next_output, ["-e", r"4n", LINES1]); +check_output!(pattern_next_no_output, ["-n", "-e", r"4n", LINES1]); +check_output!(pattern_next_print_output, ["-e", r"4n;p", LINES1]); +check_output!(pattern_next_print_no_output, ["-n", "-e", r"4n;p", LINES1]); +check_output!(pattern_quit, [r"5q", LINES1]); +check_output!(pattern_quit_2, [r"5q", LINES1, LINES2]); diff --git a/tests/fixtures/sed/output/pattern_append_delete b/tests/fixtures/sed/output/pattern_append_delete new file mode 100644 index 00000000..18476253 --- /dev/null +++ b/tests/fixtures/sed/output/pattern_append_delete @@ -0,0 +1,10 @@ +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_append_delete2_separate b/tests/fixtures/sed/output/pattern_append_delete2_separate new file mode 100644 index 00000000..cc300c2f --- /dev/null +++ b/tests/fixtures/sed/output/pattern_append_delete2_separate @@ -0,0 +1,17 @@ +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/pattern_append_delete_2 b/tests/fixtures/sed/output/pattern_append_delete_2 new file mode 100644 index 00000000..ae3aa47f --- /dev/null +++ b/tests/fixtures/sed/output/pattern_append_delete_2 @@ -0,0 +1,16 @@ +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 +l2_1 +l2_2 +l2_3 +l2_4 +l2_5 +l2_6 +l2_7 +l2_8 +l2_9 diff --git a/tests/fixtures/sed/output/pattern_delete_print b/tests/fixtures/sed/output/pattern_delete_print new file mode 100644 index 00000000..77e5cc0e --- /dev/null +++ b/tests/fixtures/sed/output/pattern_delete_print @@ -0,0 +1,13 @@ +l1_1 +l1_2 +l1_3 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_hold_append_swap b/tests/fixtures/sed/output/pattern_hold_append_swap new file mode 100644 index 00000000..e9169dc7 --- /dev/null +++ b/tests/fixtures/sed/output/pattern_hold_append_swap @@ -0,0 +1,20 @@ +l1_1 +l1_2 
+l1_3 +l1_2 +l1_3 +l1_5 +l1_2 +l1_3 +l1_2 +l1_3 +l1_6 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_next_no_output b/tests/fixtures/sed/output/pattern_next_no_output new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures/sed/output/pattern_next_output b/tests/fixtures/sed/output/pattern_next_output new file mode 100644 index 00000000..3bcc601e --- /dev/null +++ b/tests/fixtures/sed/output/pattern_next_output @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_next_print_no_output b/tests/fixtures/sed/output/pattern_next_print_no_output new file mode 100644 index 00000000..77e5cc0e --- /dev/null +++ b/tests/fixtures/sed/output/pattern_next_print_no_output @@ -0,0 +1,13 @@ +l1_1 +l1_2 +l1_3 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_next_print_output b/tests/fixtures/sed/output/pattern_next_print_output new file mode 100644 index 00000000..30d6f04d --- /dev/null +++ b/tests/fixtures/sed/output/pattern_next_print_output @@ -0,0 +1,27 @@ +l1_1 +l1_1 +l1_2 +l1_2 +l1_3 +l1_3 +l1_4 +l1_5 +l1_5 +l1_6 +l1_6 +l1_7 +l1_7 +l1_8 +l1_8 +l1_9 +l1_9 +l1_10 +l1_10 +l1_11 +l1_11 +l1_12 +l1_12 +l1_13 +l1_13 +l1_14 +l1_14 diff --git a/tests/fixtures/sed/output/pattern_quit b/tests/fixtures/sed/output/pattern_quit new file mode 100644 index 00000000..7ffbba3e --- /dev/null +++ b/tests/fixtures/sed/output/pattern_quit @@ -0,0 +1,5 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 diff --git a/tests/fixtures/sed/output/pattern_quit_2 b/tests/fixtures/sed/output/pattern_quit_2 new file mode 100644 index 00000000..7ffbba3e --- /dev/null +++ b/tests/fixtures/sed/output/pattern_quit_2 @@ -0,0 +1,5 @@ +l1_1 +l1_2 +l1_3 +l1_4 +l1_5 From 0c06e66e1f2c0db6f504020e5b0681d5053f84a1 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 12:28:42 +0300 
Subject: [PATCH 71/85] Clean up defined structures --- src/uu/sed/src/command.rs | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index acdb40ee..ba356386 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -21,9 +21,9 @@ use std::path::PathBuf; // For file descriptors and equivalent use std::rc::Rc; use uucore::error::UResult; -// Compilation and processing options provided mostly through the -// command-line interface #[derive(Debug, Default, Clone)] +/// Compilation and processing options provided mostly through the +/// command-line interface pub struct ProcessingContext { // Command-line flags with corresponding names pub all_output_files: bool, @@ -294,10 +294,8 @@ impl CommandData { } } -/* - * Structure containing things to append before a line is read - */ #[derive(Debug)] +/// Text to append before a line is read pub struct AppendBuffer { append_type: AppendType, content: String, @@ -309,22 +307,13 @@ pub enum AppendType { File, } -/// Flag for space modifications #[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Flag for space modifications pub enum SpaceFlag { Append, // Append to contents Replace, // Replace contents } -/// Structure for a processing space (process, hold, otherwise). -#[derive(Debug)] -pub struct Space { - pub current: String, // Current space content - pub deleted: bool, // Whether content was deleted - pub append_newline: bool, // Whether originally terminated by \n - pub backup: String, // Backing memory -} - #[derive(Debug, Clone)] /// Action to execute after reading a new input line pub struct InputAction { From 3de177b9f487e5832b958a5b6492f37f37cc517e Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 12:36:40 +0300 Subject: [PATCH 72/85] Rename try_as_str method into as_str For the sake of consistency; we don't employ try in other methods with similar semantics. 
--- src/uu/sed/src/fast_io.rs | 14 +++++++------- src/uu/sed/src/processor.rs | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/uu/sed/src/fast_io.rs b/src/uu/sed/src/fast_io.rs index cf12e655..cb46c7f5 100644 --- a/src/uu/sed/src/fast_io.rs +++ b/src/uu/sed/src/fast_io.rs @@ -203,7 +203,7 @@ impl<'a> IOChunk<'a> { } /// Return the content as a str. - pub fn try_as_str(&mut self) -> Result<&str, Box> { + pub fn as_str(&mut self) -> Result<&str, Box> { match &self.content { #[cfg(unix)] IOChunkContent::MmapInput { content, .. } => { @@ -951,7 +951,7 @@ mod tests { } if let Some((mut content, last_line)) = reader.get_line()? { - assert_eq!(content.try_as_str().unwrap(), "last line"); + assert_eq!(content.as_str().unwrap(), "last line"); assert!(last_line); } else { panic!("Expected IOChunk"); @@ -1015,11 +1015,11 @@ mod tests { } if let Some((mut content, last_line)) = reader.get_line()? { - assert_eq!(content.try_as_str().unwrap(), "last line"); + assert_eq!(content.as_str().unwrap(), "last line"); assert!(content.utf8_verified); assert!(last_line); // Cached version - assert_eq!(content.try_as_str().unwrap(), "last line"); + assert_eq!(content.as_str().unwrap(), "last line"); } else { panic!("Expected IOChunk"); } @@ -1168,7 +1168,7 @@ mod tests { let (s, _) = chunk.fields_mut().unwrap(); s.push_str(" world"); - assert_eq!(chunk.try_as_str().unwrap(), "hello world"); + assert_eq!(chunk.as_str().unwrap(), "hello world"); } #[cfg(unix)] @@ -1183,7 +1183,7 @@ mod tests { s.push_str("bar"); } - assert_eq!(chunk.try_as_str().unwrap(), "foobar"); + assert_eq!(chunk.as_str().unwrap(), "foobar"); } #[cfg(unix)] @@ -1196,7 +1196,7 @@ mod tests { let (s, _) = chunk.fields_mut().unwrap(); s.push_str(" Δεδομένα"); - assert_eq!(chunk.try_as_str().unwrap(), "Ζωντανά! Δεδομένα"); + assert_eq!(chunk.as_str().unwrap(), "Ζωντανά! 
Δεδομένα"); } #[cfg(unix)] diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 9f50b229..0226afcc 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -30,7 +30,7 @@ fn match_address( match addr.atype { AddressType::Re => { if let AddressValue::Regex(ref re) = addr.value { - Ok(re.is_match(pattern.try_as_str()?)) + Ok(re.is_match(pattern.as_str()?)) } else { Ok(false) } @@ -173,7 +173,7 @@ fn substitute( let mut result = String::new(); let mut replaced = false; - let text = pattern.try_as_str()?; + let text = pattern.as_str()?; for caps in sub.regex.captures_iter(text) { count += 1; @@ -206,7 +206,7 @@ fn substitute( // Write to file if needed. if let Some(ref writer) = sub.write_file { - writer.borrow_mut().write_line(pattern.try_as_str()?)?; + writer.borrow_mut().write_line(pattern.as_str()?)?; } } @@ -215,7 +215,7 @@ fn substitute( /// Apply the specified transliteration in the provided pattern space. fn transliterate(pattern: &mut IOChunk, trans: &Transliteration) -> UResult<()> { - let text = pattern.try_as_str()?; + let text = pattern.as_str()?; let mut result = String::with_capacity(text.len()); let mut replaced = false; @@ -252,7 +252,7 @@ fn process_file( let mut current: Option>> = if let Some(action) = context.input_action.take() { // Continue processing the `N` command. - let current_line = pattern.try_as_str()?; + let current_line = pattern.as_str()?; let mut combined_lines = action.prepend; combined_lines.push('\n'); combined_lines.push_str(current_line); @@ -296,7 +296,7 @@ fn process_file( } 'D' => { // Delete up to \n and start a new cycle without new input. - if let Some(pos) = pattern.try_as_str()?.find('\n') { + if let Some(pos) = pattern.as_str()?.find('\n') { let (s, _) = pattern.fields_mut()?; s.drain(..=pos); current = commands.clone(); @@ -320,13 +320,13 @@ fn process_file( } 'h' => { // Replace hold with the contents of the pattern space. 
- context.hold.content = pattern.try_as_str()?.to_string(); + context.hold.content = pattern.as_str()?.to_string(); context.hold.has_newline = pattern.is_newline_terminated(); } 'H' => { // Append to hold \n followed by pattern space contents. context.hold.content.push('\n'); - context.hold.content.push_str(pattern.try_as_str()?); + context.hold.content.push_str(pattern.as_str()?); context.hold.has_newline = pattern.is_newline_terminated(); } 'i' => { @@ -345,7 +345,7 @@ fn process_file( // to perform when the next line is read. context.input_action = Some(InputAction { next_command: command.next.clone(), - prepend: pattern.try_as_str()?.to_string(), + prepend: pattern.as_str()?.to_string(), }); continue 'lines; } @@ -355,7 +355,7 @@ fn process_file( } 'P' => { // Output pattern space, up to the first \n. - let line = pattern.try_as_str()?; + let line = pattern.as_str()?; match line.find('\n') { Some(pos) => { output.write_str(&line[..=pos])?; From 9d060fff6770b68a7cbd176dc634e408a8fe0681 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 12:50:20 +0300 Subject: [PATCH 73/85] Remove useless elements --- .github/workflows/ci.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 220f98c6..37207417 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,13 +25,8 @@ jobs: os: [ubuntu-latest, macOS-latest, windows-latest] steps: - uses: actions/checkout@v4 - with: - # Force LF line endings - core.autocrlf: false - uses: dtolnay/rust-toolchain@stable - - name: File type before conversion - run: file tests/fixtures/sed/*/* - - name: Remove carriage return + - name: Remove carriage returns shell: bash run: | if command -v dos2unix >/dev/null 2>&1; then @@ -40,8 +35,6 @@ jobs: else echo "dos2unix is NOT available" fi - - name: File type after convesion - run: file tests/fixtures/sed/*/* - run: cargo test --all coverage: From 
635444fd3a6355f3d62c623e0dc298cd030f7e65 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 12:57:03 +0300 Subject: [PATCH 74/85] Ensure no CRs on Windows code coverage --- .github/workflows/ci.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37207417..2b45736c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,6 +49,15 @@ jobs: - { os: windows-latest , features: windows } steps: - uses: actions/checkout@v4 + - name: Remove carriage returns + shell: bash + run: | + if command -v dos2unix >/dev/null 2>&1; then + echo "dos2unix is available; converting" + dos2unix tests/fixtures/sed/*/* + else + echo "dos2unix is NOT available" + fi - name: Initialize workflow variables id: vars shell: bash From 7b3883dd2c2f434aeba9226740817ae73979480d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 13:10:42 +0300 Subject: [PATCH 75/85] Simplify LF setting See https://github.com/actions/checkout/issues/135 --- .github/workflows/ci.yml | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2b45736c..057fce14 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,17 +24,12 @@ jobs: matrix: os: [ubuntu-latest, macOS-latest, windows-latest] steps: + - name: Set Git to use LF, even on Windows + run: | + git config --global core.autocrlf false + git config --global core.eol lf - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - - name: Remove carriage returns - shell: bash - run: | - if command -v dos2unix >/dev/null 2>&1; then - echo "dos2unix is available; converting" - dos2unix tests/fixtures/sed/*/* - else - echo "dos2unix is NOT available" - fi - run: cargo test --all coverage: @@ -48,16 +43,11 @@ jobs: - { os: macos-latest , features: macos } - { os: windows-latest , features: windows } steps: - - uses: 
actions/checkout@v4 - - name: Remove carriage returns - shell: bash + - name: Set Git to use LF, even on Windows run: | - if command -v dos2unix >/dev/null 2>&1; then - echo "dos2unix is available; converting" - dos2unix tests/fixtures/sed/*/* - else - echo "dos2unix is NOT available" - fi + git config --global core.autocrlf false + git config --global core.eol lf + - uses: actions/checkout@v4 - name: Initialize workflow variables id: vars shell: bash From e4267bcbd85cce6887ba8dbcefe327b175802d59 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 19:22:23 +0300 Subject: [PATCH 76/85] Simplify command thread construction --- src/uu/sed/src/compiler.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index c9457c9c..f6430bbe 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -226,8 +226,7 @@ fn compile_thread( context: &ProcessingContext, ) -> UResult>>> { let mut head: Option>> = None; - // A mutable reference to the place we’ll insert next - let mut next_p = &mut head; + let mut tail: Option>> = None; 'next_line: loop { match lines.next_line()? 
{ @@ -260,18 +259,17 @@ fn compile_thread( cmd_spec = get_cmd_spec(lines, &line, n_addr)?; } - // Move cmd into next_p, transferring its ownership let action = compile_command(lines, &mut line, &mut cmd, cmd_spec, context)?; - - *next_p = Some(cmd); - // Intermediate let binding to avoid the temporary drop - let cmd_rc = next_p.as_mut().unwrap(); - let cmd_ptr = - &mut cmd_rc.borrow_mut().next as *mut Option>>; - unsafe { - next_p = &mut *cmd_ptr; + if let Some(ref t) = tail { + // there's already a tail: link it + t.borrow_mut().next = Some(cmd.clone()); + } else { + // first element: set head + head = Some(cmd.clone()); } + tail = Some(cmd); + match action { ContinueAction::NextLine => continue 'next_line, ContinueAction::NextChar => continue 'next_char, From f8221a8b4ff373c0ca47f38140735bf0846a5077 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Fri, 16 May 2025 20:53:24 +0300 Subject: [PATCH 77/85] Simplify line and char processing This will also make it easier to call compile_thread recursively. 
--- src/uu/sed/src/compiler.rs | 130 +++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 72 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index f6430bbe..0c6e5bdb 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -202,20 +202,14 @@ fn build_command_map() -> HashMap { formats.into_iter().map(|f| (f.code, f)).collect() } -// How to continue after processing a command -#[derive(Debug)] -pub enum ContinueAction { - NextLine, - NextChar, -} - pub fn compile( scripts: Vec, processing_context: &mut ProcessingContext, ) -> UResult>>> { let mut make_providers = ScriptLineProvider::new(scripts); - let result = compile_thread(&mut make_providers, processing_context)?; + let mut empty_line = ScriptCharProvider::new(""); + let result = compile_thread(&mut make_providers, &mut empty_line, processing_context)?; // TODO: fix-up labels, check used labels, setup append & match structures Ok(result) } @@ -223,60 +217,51 @@ pub fn compile( // Compile provided scripts into a thread of commands fn compile_thread( lines: &mut ScriptLineProvider, + line: &mut ScriptCharProvider, context: &ProcessingContext, ) -> UResult>>> { let mut head: Option>> = None; let mut tail: Option>> = None; - 'next_line: loop { - match lines.next_line()? { - None => { - // TODO: Error if stack isn't empty - return Ok(head); + loop { + line.eat_spaces(); + if line.eol() || line.current() == '#' { + // TODO: set processing_context.quiet for StringVal starting with #n + match lines.next_line()? 
{ + None => { + return Ok(head); + } + Some(line_string) => { + *line = ScriptCharProvider::new(&line_string); + } } - Some(line_string) => { - let mut line = ScriptCharProvider::new(&line_string); - - // TODO: set processing_context.quiet for StringVal starting with #n - 'next_char: loop { - line.eat_spaces(); - if line.eol() || line.current() == '#' { - continue 'next_line; - } else if line.current() == ';' { - line.advance(); - continue 'next_char; - } - - let mut cmd = Rc::new(RefCell::new(Command::default())); - let n_addr = compile_address_range(lines, &mut line, &mut cmd, context)?; - let mut cmd_spec = get_cmd_spec(lines, &line, n_addr)?; - - // The ! command shall be followed by another one - if cmd_spec.args == CommandArgs::NonSelect { - line.advance(); - line.eat_spaces(); - cmd.borrow_mut().non_select = true; - cmd_spec = get_cmd_spec(lines, &line, n_addr)?; - } + continue; + } else if line.current() == ';' { + line.advance(); + continue; + } - let action = compile_command(lines, &mut line, &mut cmd, cmd_spec, context)?; - if let Some(ref t) = tail { - // there's already a tail: link it - t.borrow_mut().next = Some(cmd.clone()); - } else { - // first element: set head - head = Some(cmd.clone()); - } + let mut cmd = Rc::new(RefCell::new(Command::default())); + let n_addr = compile_address_range(lines, line, &mut cmd, context)?; + let mut cmd_spec = get_cmd_spec(lines, line, n_addr)?; - tail = Some(cmd); + // The ! 
command shall be followed by another one + if cmd_spec.args == CommandArgs::NonSelect { + line.advance(); + line.eat_spaces(); + cmd.borrow_mut().non_select = true; + cmd_spec = get_cmd_spec(lines, line, n_addr)?; + } - match action { - ContinueAction::NextLine => continue 'next_line, - ContinueAction::NextChar => continue 'next_char, - } - } - } + compile_command(lines, line, &mut cmd, cmd_spec, context)?; + if let Some(ref t) = tail { + // there's already a tail: link it + t.borrow_mut().next = Some(cmd.clone()); + } else { + // first element: set head + head = Some(cmd.clone()); } + tail = Some(cmd); } } @@ -395,15 +380,15 @@ fn parse_number(lines: &ScriptLineProvider, line: &mut ScriptCharProvider) -> UR .map_err(|msg| compilation_error::(lines, line, msg).unwrap_err()) } -/// Parse the end of a command, returning the appropriate ContinueAction +/// Parse the end of a command, failing with an error on extra characters. fn parse_command_ending( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, -) -> UResult { +) -> UResult<()> { if !line.eol() && line.current() == ';' { line.advance(); - return Ok(ContinueAction::NextChar); + return Ok(()); } if !line.eol() { @@ -414,7 +399,7 @@ fn parse_command_ending( ); } - Ok(ContinueAction::NextLine) + Ok(()) } /// Convert a primitive BRE pattern to a safe ERE-compatible pattern string. 
@@ -611,7 +596,7 @@ fn compile_subst_command( line: &mut ScriptCharProvider, cmd: &mut Command, context: &ProcessingContext, -) -> UResult { +) -> UResult<()> { line.advance(); // move past 's' let delimiter = line.current(); @@ -666,7 +651,7 @@ fn compile_trans_command( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, -) -> UResult { +) -> UResult<()> { line.advance(); // move past 'y' let delimiter = line.current(); @@ -803,7 +788,7 @@ fn compile_empty_command( lines: &ScriptLineProvider, line: &mut ScriptCharProvider, cmd: &mut Command, -) -> UResult { +) -> UResult<()> { line.advance(); // Skip the command character line.eat_spaces(); // Skip any trailing whitespace @@ -817,7 +802,7 @@ fn compile_command( cmd: &mut Rc>, cmd_spec: &'static CommandSpec, context: &ProcessingContext, -) -> UResult { +) -> UResult<()> { let mut cmd = cmd.borrow_mut(); cmd.code = line.current(); @@ -856,7 +841,7 @@ fn compile_command( } } - Ok(ContinueAction::NextLine) + Ok(()) } // Return the specification for the command letter at the current line position @@ -1417,12 +1402,16 @@ mod tests { ScriptLineProvider::new(input) } + fn empty_line() -> ScriptCharProvider { + ScriptCharProvider::new("") + } + #[test] fn test_compile_thread_empty_input() { let mut provider = make_provider(&[]); let mut opts = &ctx(); - let result = compile_thread(&mut provider, &mut opts).unwrap(); + let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); assert!(result.is_none()); } @@ -1431,7 +1420,7 @@ mod tests { let mut provider = make_provider(&["# comment", " ", ";;"]); let mut opts = &ctx(); - let result = compile_thread(&mut provider, &mut opts).unwrap(); + let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); assert!(result.is_none()); } @@ -1440,7 +1429,7 @@ mod tests { let mut provider = make_provider(&["42q"]); let mut opts = &ctx(); - let result = compile_thread(&mut provider, &mut opts).unwrap(); + let 
result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let cmd = binding.borrow(); @@ -1464,7 +1453,7 @@ mod tests { let mut provider = make_provider(&["42!p"]); let mut opts = &ctx(); - let result = compile_thread(&mut provider, &mut opts).unwrap(); + let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let cmd = binding.borrow(); @@ -1488,7 +1477,7 @@ mod tests { let mut provider = make_provider(&["1q", "2d"]); let mut opts = &ctx(); - let result = compile_thread(&mut provider, &mut opts).unwrap(); + let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let first = binding.borrow(); @@ -1504,7 +1493,7 @@ mod tests { let mut provider = make_provider(&["1q;2d"]); let mut opts = &ctx(); - let result = compile_thread(&mut provider, &mut opts).unwrap(); + let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let first = binding.borrow(); @@ -1750,8 +1739,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/;"); let mut cmd = Command::default(); - let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap(); - assert!(matches!(result, ContinueAction::NextChar)); + compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap(); if let CommandData::Substitution(subst) = &cmd.data { assert_eq!(subst.replacement.parts.len(), 1); @@ -1765,9 +1753,7 @@ mod tests { let (mut lines, mut chars) = make_providers("s/foo/bar/"); let mut cmd = Command::default(); - let result = compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap(); - assert!(matches!(result, ContinueAction::NextLine)); - + compile_subst_command(&mut lines, &mut chars, &mut cmd, &ctx()).unwrap(); match &cmd.data { CommandData::Substitution(subst) => { assert_eq!(subst.replacement.parts.len(), 1); From 
3b88ac6e3541c2b97cd2642da295bf27d3ab5f8f Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 11:39:55 +0300 Subject: [PATCH 78/85] Implement command blocks --- src/uu/sed/src/command.rs | 21 ++--- src/uu/sed/src/compiler.rs | 80 +++++++++++------ src/uu/sed/src/processor.rs | 24 ++++-- src/uu/sed/src/sed.rs | 2 + tests/by-util/test_sed.rs | 85 +++++++++++++++++-- .../fixtures/sed/output/addr_simple_negation | 14 +++ .../fixtures/sed/output/block_negative_range | 14 +++ .../sed/output/block_negative_range_2 | 23 +++++ .../output/block_nested_negative_selection | 14 +++ .../sed/output/block_nested_selection | 14 +++ tests/fixtures/sed/output/block_simple_range | 14 +++ 11 files changed, 253 insertions(+), 52 deletions(-) create mode 100644 tests/fixtures/sed/output/addr_simple_negation create mode 100644 tests/fixtures/sed/output/block_negative_range create mode 100644 tests/fixtures/sed/output/block_negative_range_2 create mode 100644 tests/fixtures/sed/output/block_nested_negative_selection create mode 100644 tests/fixtures/sed/output/block_nested_selection create mode 100644 tests/fixtures/sed/output/block_simple_range diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index ba356386..876f81fb 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -59,6 +59,10 @@ pub struct ProcessingContext { pub input_action: Option, /// Hold space pub hold: StringSpace, + /// Nesting of { } at compile time + pub parsed_block_nesting: usize, + /// Nested blocks at run time + pub processing_block_stack: Vec>>>, } #[derive(Clone, Debug, Default, PartialEq)] @@ -279,19 +283,10 @@ impl Default for Command { /// Command-specific data pub enum CommandData { None, - Subcommand(Rc>), // Commands for 'b', 't', '{' - Substitution(Box), // Substitute command 's' - Transliteration(Box), // Transliteration command 'y' - NamedWriter(Box), // File descriptor for 'w' -} - -impl CommandData { - pub fn get_subcommand(&self) -> Rc> { - match 
self { - CommandData::Subcommand(rc) => Rc::clone(rc), - _ => panic!("Called get on non-Subcommand variant"), - } - } + Subcommand(Option>>), // Commands for 'b', 't', '{' + Substitution(Box), // Substitute command 's' + Transliteration(Box), // Transliteration command 'y' + NamedWriter(Box), // File descriptor for 'w' } #[derive(Debug)] diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 0c6e5bdb..d04abb97 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -24,7 +24,7 @@ use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; use std::rc::Rc; -use uucore::error::UResult; +use uucore::error::{UResult, USimpleError}; // A global, immutable map of command properties, initialized on first access static CMD_MAP: Lazy> = Lazy::new(build_command_map); @@ -35,7 +35,7 @@ enum CommandArgs { Empty, // d D g G h H l n N p P q x = \0 Text, // a c i NonSelect, // ! - Group, // { + BeginGroup, // { EndGroup, // } Comment, // # Branch, // b t @@ -60,7 +60,7 @@ fn build_command_map() -> HashMap { CommandSpec { code: '{', n_addr: 2, - args: CommandArgs::Group, + args: CommandArgs::BeginGroup, }, CommandSpec { code: '}', @@ -204,12 +204,21 @@ fn build_command_map() -> HashMap { pub fn compile( scripts: Vec, - processing_context: &mut ProcessingContext, + context: &mut ProcessingContext, ) -> UResult>>> { let mut make_providers = ScriptLineProvider::new(scripts); let mut empty_line = ScriptCharProvider::new(""); - let result = compile_thread(&mut make_providers, &mut empty_line, processing_context)?; + let result = compile_thread(&mut make_providers, &mut empty_line, context)?; + + if context.parsed_block_nesting > 0 { + return Err(USimpleError::new(1, "unmatched `{'")); + } + + // Comment-out the following to show the compiled script. 
+ #[cfg(any())] + dbg!(&result); + // TODO: fix-up labels, check used labels, setup append & match structures Ok(result) } @@ -218,7 +227,7 @@ pub fn compile( fn compile_thread( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, - context: &ProcessingContext, + context: &mut ProcessingContext, ) -> UResult>>> { let mut head: Option>> = None; let mut tail: Option>> = None; @@ -226,7 +235,7 @@ fn compile_thread( loop { line.eat_spaces(); if line.eol() || line.current() == '#' { - // TODO: set processing_context.quiet for StringVal starting with #n + // TODO: set context.quiet for StringVal starting with #n match lines.next_line()? { None => { return Ok(head); @@ -243,14 +252,29 @@ fn compile_thread( let mut cmd = Rc::new(RefCell::new(Command::default())); let n_addr = compile_address_range(lines, line, &mut cmd, context)?; + line.eat_spaces(); let mut cmd_spec = get_cmd_spec(lines, line, n_addr)?; // The ! command shall be followed by another one - if cmd_spec.args == CommandArgs::NonSelect { - line.advance(); - line.eat_spaces(); - cmd.borrow_mut().non_select = true; - cmd_spec = get_cmd_spec(lines, line, n_addr)?; + match cmd_spec.args { + CommandArgs::NonSelect => { + line.advance(); + line.eat_spaces(); + cmd.borrow_mut().non_select = true; + cmd_spec = get_cmd_spec(lines, line, n_addr)?; + } + CommandArgs::EndGroup => { + if context.parsed_block_nesting == 0 { + return compilation_error(lines, line, "unexpected `}'"); + } + context.parsed_block_nesting -= 1; + line.advance(); + line.eat_spaces(); + let mut cmd_ref = cmd.borrow_mut(); + parse_command_ending(lines, line, &mut cmd_ref)?; + return Ok(head); + } + _ => (), } compile_command(lines, line, &mut cmd, cmd_spec, context)?; @@ -693,8 +717,11 @@ pub fn compile_subst_flags( subst.ignore_case = false; subst.write_file = None; - while !line.eol() { + loop { line.eat_spaces(); + if line.eol() { + break; + } match line.current() { 'g' => { @@ -801,7 +828,7 @@ fn compile_command( line: &mut 
ScriptCharProvider, cmd: &mut Rc>, cmd_spec: &'static CommandSpec, - context: &ProcessingContext, + context: &mut ProcessingContext, ) -> UResult<()> { let mut cmd = cmd.borrow_mut(); cmd.code = line.current(); @@ -825,7 +852,12 @@ fn compile_command( // TODO CommandArgs::Text => { // a c i } - CommandArgs::Group => { // { + CommandArgs::BeginGroup => { + // { + line.advance(); // move past '{' + context.parsed_block_nesting += 1; + let block_body = compile_thread(lines, line, context)?; + cmd.data = CommandData::Subcommand(block_body); } CommandArgs::EndGroup => { // } } @@ -859,7 +891,7 @@ fn get_cmd_spec( let opt_cmd_spec = lookup_command(ch); if opt_cmd_spec.is_none() { - return compilation_error(lines, line, format!("invalid command code {}", ch)); + return compilation_error(lines, line, format!("invalid command code `{}'", ch)); } let cmd_spec = opt_cmd_spec.unwrap(); @@ -923,7 +955,7 @@ mod tests { fn test_lookup_group_command() { let cmd = lookup_command('{').unwrap(); assert_eq!(cmd.n_addr, 2); - assert_eq!(cmd.args, CommandArgs::Group); + assert_eq!(cmd.args, CommandArgs::BeginGroup); } #[test] @@ -1051,7 +1083,7 @@ mod tests { assert!(result.is_err()); let msg = result.unwrap_err().to_string(); - assert!(msg.contains("script.sed:2:0: error: invalid command code @")); + assert!(msg.contains("script.sed:2:0: error: invalid command code `@'")); } #[test] @@ -1409,7 +1441,7 @@ mod tests { #[test] fn test_compile_thread_empty_input() { let mut provider = make_provider(&[]); - let mut opts = &ctx(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); assert!(result.is_none()); @@ -1418,7 +1450,7 @@ mod tests { #[test] fn test_compile_thread_comment_only() { let mut provider = make_provider(&["# comment", " ", ";;"]); - let mut opts = &ctx(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); assert!(result.is_none()); @@ -1427,7 +1459,7 @@ mod tests { 
#[test] fn test_compile_thread_single_command() { let mut provider = make_provider(&["42q"]); - let mut opts = &ctx(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); @@ -1451,7 +1483,7 @@ mod tests { #[test] fn test_compile_thread_non_selected_single_command() { let mut provider = make_provider(&["42!p"]); - let mut opts = &ctx(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); @@ -1475,7 +1507,7 @@ mod tests { #[test] fn test_compile_thread_multiple_lines() { let mut provider = make_provider(&["1q", "2d"]); - let mut opts = &ctx(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); @@ -1491,7 +1523,7 @@ mod tests { #[test] fn test_compile_thread_single_line_multiple_commands() { let mut provider = make_provider(&["1q;2d"]); - let mut opts = &ctx(); + let mut opts = ctx(); let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 0226afcc..8fc2797f 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -276,15 +276,30 @@ fn process_file( match command.code { '{' => { - current = Some(command.data.get_subcommand()); + // Block begin; start processing the enclosed ones. + let block_body = { + match &mut command.data { + CommandData::Subcommand(block) => block.clone(), + _ => panic!("Expected Subcommand command data"), + } + }; + context.processing_block_stack.push(command.next.clone()); + current = block_body; + continue; + } + '}' => { + // Block end: continue with the block's next command. 
+ current = context + .processing_block_stack + .pop() + .expect("empty block command stack"); continue; } 'a' => { // TODO } 'b' => { - current = Some(command.data.get_subcommand()); - continue; + // TODO } 'c' => { // TODO @@ -404,9 +419,6 @@ fn process_file( ':' => { // TODO } - '}' => { - // Nothing to do here - } '=' => { // TODO } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 577a435c..412cd5a3 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -204,6 +204,8 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { saved_regex: const { RefCell::new(None) }, input_action: None, hold: StringSpace::default(), + parsed_block_nesting: 0, + processing_block_stack: Vec::new(), } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 9f101ff4..2f8034e5 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -194,6 +194,7 @@ check_output!( ["-n", "/1_4/,/10/!p", LINES1] ); check_output!(addr_empty_re_reuse, ["-n", "/_2/,//p", LINES1, LINES2]); +check_output!(addr_simple_negation, ["-e", r"4,12!s/^/^/", LINES1]); // Test substitutions check_output!(subst_any, ["-e", r"s/./X/g", LINES1]); @@ -250,17 +251,8 @@ check_output!( ); check_output!(trans_no_new_line, ["-e", r"y/l/L/", NO_NEW_LINE]); check_output!(trans_newline, ["-e", r"1N;2y/\n/X/", LINES1]); - -// TODO: Enable when "{}" is implemented. -#[cfg(any())] check_output!(subst_newline_class, ["-n", r"1{;N;s/[\n]/X/;p;}", LINES1]); - -// TODO: Enable when "{}" is implemented. -#[cfg(any())] check_output!(subst_newline_re, ["-n", r"1{;N;s/\n/X/;p;}", LINES1]); - -// TODO: Enable when "{}" is implemented. 
-#[cfg(any())] check_output!(print_to_newline, ["-n", r"1{;N;P;P;p;}", LINES1]); check_output!(pattern_next_print, ["-n", r"N;N;P", LINES1]); @@ -286,3 +278,78 @@ check_output!(pattern_next_print_output, ["-e", r"4n;p", LINES1]); check_output!(pattern_next_print_no_output, ["-n", "-e", r"4n;p", LINES1]); check_output!(pattern_quit, [r"5q", LINES1]); check_output!(pattern_quit_2, [r"5q", LINES1, LINES2]); + +check_output!( + block_simple_range, + [ + "-e", + r#" +4,12 { + s/^/^/ + s/$/$/ + s/_/T/ +}"#, + LINES1 + ] +); + +check_output!( + block_negative_range, + [ + "-e", + r#" +4,12 !{ + s/^/^/ + s/$/$/ + s/_/T/ +}"#, + LINES1 + ] +); + +check_output!( + block_negative_range_2, + [ + "-e", + r#" +4,12 !{ + s/^/^/ + s/$/$/ + s/_/T/ +}"#, + LINES1, + LINES2 + ] +); + +check_output!( + block_nested_selection, + [ + "-e", + r#" +4,12 { + s/^/^/ + /6/,/10/ { + s/$/$/ + /8/ s/_/T/ + } +}"#, + LINES1 + ] +); + +check_output!( + block_nested_negative_selection, + [ + "-e", + r#" +4,12 !{ + s/^/^/ + /6/,/10/ !{ + s/$/$/ + /8/ !s/_/T/ + } +}"#, + LINES1 + ] +); diff --git a/tests/fixtures/sed/output/addr_simple_negation b/tests/fixtures/sed/output/addr_simple_negation new file mode 100644 index 00000000..67f03ef6 --- /dev/null +++ b/tests/fixtures/sed/output/addr_simple_negation @@ -0,0 +1,14 @@ +^l1_1 +^l1_2 +^l1_3 +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +^l1_13 +^l1_14 diff --git a/tests/fixtures/sed/output/block_negative_range b/tests/fixtures/sed/output/block_negative_range new file mode 100644 index 00000000..5b15dae5 --- /dev/null +++ b/tests/fixtures/sed/output/block_negative_range @@ -0,0 +1,14 @@ +^l1T1$ +^l1T2$ +^l1T3$ +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +^l1T13$ +^l1T14$ diff --git a/tests/fixtures/sed/output/block_negative_range_2 b/tests/fixtures/sed/output/block_negative_range_2 new file mode 100644 index 00000000..d5484cb6 --- /dev/null +++ b/tests/fixtures/sed/output/block_negative_range_2 @@ -0,0 +1,23 @@ +^l1T1$ +^l1T2$ 
+^l1T3$ +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +^l1T13$ +^l1T14$ +^l2T1$ +^l2T2$ +^l2T3$ +^l2T4$ +^l2T5$ +^l2T6$ +^l2T7$ +^l2T8$ +^l2T9$ diff --git a/tests/fixtures/sed/output/block_nested_negative_selection b/tests/fixtures/sed/output/block_nested_negative_selection new file mode 100644 index 00000000..5b15dae5 --- /dev/null +++ b/tests/fixtures/sed/output/block_nested_negative_selection @@ -0,0 +1,14 @@ +^l1T1$ +^l1T2$ +^l1T3$ +l1_4 +l1_5 +l1_6 +l1_7 +l1_8 +l1_9 +l1_10 +l1_11 +l1_12 +^l1T13$ +^l1T14$ diff --git a/tests/fixtures/sed/output/block_nested_selection b/tests/fixtures/sed/output/block_nested_selection new file mode 100644 index 00000000..488e94f6 --- /dev/null +++ b/tests/fixtures/sed/output/block_nested_selection @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +^l1_4 +^l1_5 +^l1_6$ +^l1_7$ +^l1T8$ +^l1_9$ +^l1_10$ +^l1_11 +^l1_12 +l1_13 +l1_14 diff --git a/tests/fixtures/sed/output/block_simple_range b/tests/fixtures/sed/output/block_simple_range new file mode 100644 index 00000000..f963b03f --- /dev/null +++ b/tests/fixtures/sed/output/block_simple_range @@ -0,0 +1,14 @@ +l1_1 +l1_2 +l1_3 +^l1T4$ +^l1T5$ +^l1T6$ +^l1T7$ +^l1T8$ +^l1T9$ +^l1T10$ +^l1T11$ +^l1T12$ +l1_13 +l1_14 From 0281a12b2a27a8cdaf4caeecae72734d15b477ec Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 13:59:17 +0300 Subject: [PATCH 79/85] Replace runtime block stack with static links The runtime stack would not handle jumps within blocks. Also, patched links are more efficient at runtime. 
--- src/uu/sed/src/command.rs | 11 +-- src/uu/sed/src/compiler.rs | 187 +++++++++++++++++++++++++++++++++++- src/uu/sed/src/processor.rs | 17 +--- src/uu/sed/src/sed.rs | 1 - 4 files changed, 195 insertions(+), 21 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index 876f81fb..ace23f45 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -61,8 +61,6 @@ pub struct ProcessingContext { pub hold: StringSpace, /// Nesting of { } at compile time pub parsed_block_nesting: usize, - /// Nested blocks at run time - pub processing_block_stack: Vec>>>, } #[derive(Clone, Debug, Default, PartialEq)] @@ -283,10 +281,11 @@ impl Default for Command { /// Command-specific data pub enum CommandData { None, - Subcommand(Option>>), // Commands for 'b', 't', '{' - Substitution(Box), // Substitute command 's' - Transliteration(Box), // Transliteration command 'y' - NamedWriter(Box), // File descriptor for 'w' + Block(Option>>), // Commands for '{' + BranchTarget(Option>>), // Commands for 'b', 't' + Substitution(Box), // Substitute command 's' + Transliteration(Box), // Transliteration command 'y' + NamedWriter(Box), // File descriptor for 'w' } #[derive(Debug)] diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index d04abb97..44ae8647 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -214,6 +214,7 @@ pub fn compile( if context.parsed_block_nesting > 0 { return Err(USimpleError::new(1, "unmatched `{'")); } + patch_block_endings(result.clone()); // Comment-out the following to show the compiled script. #[cfg(any())] @@ -223,6 +224,57 @@ pub fn compile( Ok(result) } +/// For every Command in the top-level `head` chain, look for +/// `CommandData::Block(Some(sub_head))`. Recursively patch +/// the sub-chain, then splice its tail back to the original +/// “next” pointer of the *parent* (falling back to its own +/// parent_next if its own next was `None`). 
+fn patch_block_endings(head: Option>>) { + fn patch_block_endings_to_parent( + mut cur: Option>>, + parent_next: Option>>, + ) { + while let Some(rc_cmd) = cur { + // Borrow mutably just long enough to inspect/rewire this node + let cmd = rc_cmd.borrow_mut(); + // Save this node’s own next pointer + let own_next = cmd.next.clone(); + // Decide what “splice target” to use: + // - if this node has its own_next, use that + // - otherwise, fall back to parent_next + let splice_target = own_next.clone().or(parent_next.clone()); + + // If it has a sub-block, recurse and then patch its tail + if let CommandData::Block(Some(ref sub_head)) = cmd.data { + // 1) recurse into the sub-chain, passing along splice_target + patch_block_endings_to_parent(Some(sub_head.clone()), splice_target.clone()); + + // 2) find the tail of that sub-chain + let mut tail = sub_head.clone(); + loop { + let next_in_sub = tail.borrow().next.clone(); + match next_in_sub { + Some(n) => tail = n, + None => break, + } + } + + // 3) splice the tail’s `.next` to splice_target + tail.borrow_mut().next = splice_target.clone(); + } + + // drop the borrow before moving on + drop(cmd); + + // advance to the next sibling in this level + cur = own_next; + } + } + + // top-level has no parent, so pass None + patch_block_endings_to_parent(head, None); +} + // Compile provided scripts into a thread of commands fn compile_thread( lines: &mut ScriptLineProvider, @@ -857,7 +909,7 @@ fn compile_command( line.advance(); // move past '{' context.parsed_block_nesting += 1; let block_body = compile_thread(lines, line, context)?; - cmd.data = CommandData::Subcommand(block_body); + cmd.data = CommandData::Block(block_body); } CommandArgs::EndGroup => { // } } @@ -1842,4 +1894,137 @@ mod tests { fn test_trailing_backslash_is_preserved() { assert_eq!(bre_to_ere(r"abc\"), r"abc\"); } + + // patch_block_endings + + // Create a command with the specified code. 
+ fn cmd(code: char) -> Rc> { + Rc::new(RefCell::new(Command { + code, + ..Default::default() + })) + } + + // Link the vector of passed commands into a list, returning head. + fn link(cmds: Vec>>) -> Option>> { + for i in 0..cmds.len().saturating_sub(1) { + cmds[i].borrow_mut().next = Some(cmds[i + 1].clone()); + } + cmds.first().cloned() + } + + // Return the command codes along the passed linked list. + fn collect_codes(mut head: Option>>) -> Vec { + let mut result = Vec::new(); + while let Some(cmd) = head { + let cmd_ref = cmd.borrow(); + result.push(cmd_ref.code); + head = cmd_ref.next.clone(); + } + result + } + + #[test] + fn test_flat_chain() { + let a = cmd('a'); + let b = cmd('b'); + let head = link(vec![a.clone(), b.clone()]); + + patch_block_endings(head.clone()); + + assert_eq!(collect_codes(head), vec!['a', 'b']); + } + + #[test] + fn test_simple_block_relinks_tail() { + // a ; { x ; y ; } b + let a = cmd('a'); + let block = cmd('{'); + let x = cmd('x'); + let y = cmd('y'); + let b = cmd('b'); + + let head = link(vec![a.clone(), block.clone(), b.clone()]); + let sub_head = link(vec![x.clone(), y.clone()]); + block.borrow_mut().data = CommandData::Block(sub_head.clone()); + + patch_block_endings(head.clone()); + + // Expect x -> y -> b + assert_eq!(collect_codes(sub_head), vec!['x', 'y', 'b']); + // Expect a -> { -> b still valid + assert_eq!(collect_codes(Some(a)), vec!['a', '{', 'b']); + } + + #[test] + fn test_empty_block_no_panic() { + let a = cmd('a'); + a.borrow_mut().data = CommandData::Block(None); + + patch_block_endings(Some(a.clone())); + + assert_eq!(collect_codes(Some(a)), vec!['a']); + } + + #[test] + fn test_nested_blocks() { + // a + // { + // m + // { + // x + // y + // } + // n + // } + // b + let a = cmd('a'); + let b = cmd('b'); + let x = cmd('x'); + let y = cmd('y'); + let m = cmd('m'); + let n = cmd('n'); + let outer_block = cmd('{'); + let inner_block = cmd('{'); + + let head = link(vec![a.clone(), outer_block.clone(), 
b.clone()]); + let outer = link(vec![m.clone(), inner_block.clone(), n.clone()]); + let inner = link(vec![x.clone(), y.clone()]); + outer_block.borrow_mut().data = CommandData::Block(outer.clone()); + inner_block.borrow_mut().data = CommandData::Block(inner.clone()); + + patch_block_endings(head.clone()); + + assert_eq!(collect_codes(head), vec!['a', '{', 'b']); + assert_eq!(collect_codes(inner), vec!['x', 'y', 'n', 'b']); + assert_eq!(collect_codes(outer), vec!['m', '{', 'n', 'b']); + } + + #[test] + fn test_empty_nested_blocks() { + // a + // { + // { + // x + // } + // } + // b + let a = cmd('a'); + let b = cmd('b'); + let x = cmd('x'); + let outer_block = cmd('{'); + let inner_block = cmd('{'); + + let head = link(vec![a.clone(), outer_block.clone(), b.clone()]); + let outer = link(vec![inner_block.clone()]); + let inner = link(vec![x.clone()]); + outer_block.borrow_mut().data = CommandData::Block(outer.clone()); + inner_block.borrow_mut().data = CommandData::Block(inner.clone()); + + patch_block_endings(head.clone()); + + assert_eq!(collect_codes(head), vec!['a', '{', 'b']); + assert_eq!(collect_codes(outer), vec!['{', 'b']); + assert_eq!(collect_codes(inner), vec!['x', 'b']); + } } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index 8fc2797f..d11f4670 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -277,23 +277,14 @@ fn process_file( match command.code { '{' => { // Block begin; start processing the enclosed ones. - let block_body = { - match &mut command.data { - CommandData::Subcommand(block) => block.clone(), - _ => panic!("Expected Subcommand command data"), - } + let CommandData::Block(body) = &command.data else { + panic!("Expected Block command data"); }; - context.processing_block_stack.push(command.next.clone()); - current = block_body; + current = body.clone(); continue; } '}' => { - // Block end: continue with the block's next command. 
- current = context - .processing_block_stack - .pop() - .expect("empty block command stack"); - continue; + // Block end: continue with the block's patched next. } 'a' => { // TODO diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 412cd5a3..613685c6 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -205,7 +205,6 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { input_action: None, hold: StringSpace::default(), parsed_block_nesting: 0, - processing_block_stack: Vec::new(), } } From dce3d5d624a7f5b5a2a142aeca42f23da0a68ef6 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 14:08:51 +0300 Subject: [PATCH 80/85] Improve function name --- src/uu/sed/src/compiler.rs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 44ae8647..882bc09f 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -209,7 +209,7 @@ pub fn compile( let mut make_providers = ScriptLineProvider::new(scripts); let mut empty_line = ScriptCharProvider::new(""); - let result = compile_thread(&mut make_providers, &mut empty_line, context)?; + let result = compile_sequence(&mut make_providers, &mut empty_line, context)?; if context.parsed_block_nesting > 0 { return Err(USimpleError::new(1, "unmatched `{'")); @@ -276,7 +276,7 @@ fn patch_block_endings(head: Option>>) { } // Compile provided scripts into a thread of commands -fn compile_thread( +fn compile_sequence( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, context: &mut ProcessingContext, @@ -908,7 +908,7 @@ fn compile_command( // { line.advance(); // move past '{' context.parsed_block_nesting += 1; - let block_body = compile_thread(lines, line, context)?; + let block_body = compile_sequence(lines, line, context)?; cmd.data = CommandData::Block(block_body); } CommandArgs::EndGroup => { // } @@ -1477,7 +1477,7 @@ mod tests { }; } - // 
compile_thread + // compile_sequence fn make_provider(lines: &[&str]) -> ScriptLineProvider { let input = lines .iter() @@ -1491,29 +1491,29 @@ mod tests { } #[test] - fn test_compile_thread_empty_input() { + fn test_compile_sequence_empty_input() { let mut provider = make_provider(&[]); let mut opts = ctx(); - let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); + let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); assert!(result.is_none()); } #[test] - fn test_compile_thread_comment_only() { + fn test_compile_sequence_comment_only() { let mut provider = make_provider(&["# comment", " ", ";;"]); let mut opts = ctx(); - let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); + let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); assert!(result.is_none()); } #[test] - fn test_compile_thread_single_command() { + fn test_compile_sequence_single_command() { let mut provider = make_provider(&["42q"]); let mut opts = ctx(); - let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); + let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let cmd = binding.borrow(); @@ -1533,11 +1533,11 @@ mod tests { } #[test] - fn test_compile_thread_non_selected_single_command() { + fn test_compile_sequence_non_selected_single_command() { let mut provider = make_provider(&["42!p"]); let mut opts = ctx(); - let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); + let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let cmd = binding.borrow(); @@ -1557,11 +1557,11 @@ mod tests { } #[test] - fn test_compile_thread_multiple_lines() { + fn test_compile_sequence_multiple_lines() { let mut provider = make_provider(&["1q", "2d"]); let mut opts = ctx(); - let result = compile_thread(&mut provider, 
&mut empty_line(), &mut opts).unwrap(); + let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let first = binding.borrow(); @@ -1573,11 +1573,11 @@ mod tests { } #[test] - fn test_compile_thread_single_line_multiple_commands() { + fn test_compile_sequence_single_line_multiple_commands() { let mut provider = make_provider(&["1q;2d"]); let mut opts = ctx(); - let result = compile_thread(&mut provider, &mut empty_line(), &mut opts).unwrap(); + let result = compile_sequence(&mut provider, &mut empty_line(), &mut opts).unwrap(); let binding = result.unwrap(); let first = binding.borrow(); From 4c2ef5fa3c9a66d12e3146887ed79f515921e91d Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 16:22:21 +0300 Subject: [PATCH 81/85] Compile labels --- src/uu/sed/src/command.rs | 6 +- src/uu/sed/src/compiler.rs | 125 +++++++++++++++++++++++++------------ src/uu/sed/src/sed.rs | 2 + 3 files changed, 92 insertions(+), 41 deletions(-) diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index ace23f45..ac15dfe4 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -61,6 +61,8 @@ pub struct ProcessingContext { pub hold: StringSpace, /// Nesting of { } at compile time pub parsed_block_nesting: usize, + /// Command associated with each label + pub label_to_command_map: HashMap>>, } #[derive(Clone, Debug, Default, PartialEq)] @@ -279,13 +281,15 @@ impl Default for Command { #[derive(Debug)] /// Command-specific data +/// After parsing, t, b Label elements are converted into BranchTarget ones. 
pub enum CommandData { None, Block(Option>>), // Commands for '{' BranchTarget(Option>>), // Commands for 'b', 't' + Label(Option), // Label name for 'b', 't', ':' + NamedWriter(Box), // File descriptor for 'w' Substitution(Box), // Substitute command 's' Transliteration(Box), // Transliteration command 'y' - NamedWriter(Box), // File descriptor for 'w' } #[derive(Debug)] diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 882bc09f..777b378a 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -37,9 +37,7 @@ enum CommandArgs { NonSelect, // ! BeginGroup, // { EndGroup, // } - Comment, // # - Branch, // b t - Label, // : + Label, // b t : ReadFile, // r WriteFile, // w Substitute, // s @@ -75,7 +73,7 @@ fn build_command_map() -> HashMap { CommandSpec { code: 'b', n_addr: 2, - args: CommandArgs::Branch, + args: CommandArgs::Label, }, CommandSpec { code: 'c', @@ -160,7 +158,7 @@ fn build_command_map() -> HashMap { CommandSpec { code: 't', n_addr: 2, - args: CommandArgs::Branch, + args: CommandArgs::Label, }, CommandSpec { code: 'w', @@ -187,11 +185,6 @@ fn build_command_map() -> HashMap { n_addr: 0, args: CommandArgs::Label, }, - CommandSpec { - code: '#', - n_addr: 0, - args: CommandArgs::Comment, - }, CommandSpec { code: '=', n_addr: 1, @@ -874,6 +867,33 @@ fn compile_empty_command( parse_command_ending(lines, line, cmd) } +fn compile_label_command( + lines: &ScriptLineProvider, + line: &mut ScriptCharProvider, + cmd: &mut Command, +) -> UResult<()> { + line.advance(); // Skip the command character + line.eat_spaces(); // Skip any leading whitespace + + let mut label = String::new(); + while !line.eol() && line.current().is_ascii_alphabetic() { + label.push(line.current()); + line.advance(); + } + + if label.is_empty() { + if cmd.code == ':' { + return compilation_error(lines, line, "empty label"); + } + cmd.data = CommandData::Label(None); + } else { + cmd.data = CommandData::Label(Some(label)); + } + + 
line.eat_spaces(); // Skip any trailing whitespace + parse_command_ending(lines, line, cmd) +} + // Compile the specified command fn compile_command( lines: &mut ScriptLineProvider, @@ -886,10 +906,24 @@ fn compile_command( cmd.code = line.current(); match cmd_spec.args { + CommandArgs::BeginGroup => { + // { + line.advance(); // move past '{' + context.parsed_block_nesting += 1; + let block_body = compile_sequence(lines, line, context)?; + cmd.data = CommandData::Block(block_body); + } + CommandArgs::EndGroup => { // } + // Implemented at a higher level. + } CommandArgs::Empty => { // d D g G h H l n N p P q x = return compile_empty_command(lines, line, &mut cmd); } + CommandArgs::Label => { + // b t : + compile_label_command(lines, line, &mut cmd)?; + } CommandArgs::NonSelect => { // ! // Implemented at a higher level. } @@ -904,21 +938,6 @@ fn compile_command( // TODO CommandArgs::Text => { // a c i } - CommandArgs::BeginGroup => { - // { - line.advance(); // move past '{' - context.parsed_block_nesting += 1; - let block_body = compile_sequence(lines, line, context)?; - cmd.data = CommandData::Block(block_body); - } - CommandArgs::EndGroup => { // } - } - CommandArgs::Comment => { // # - } - CommandArgs::Branch => { // b t - } - CommandArgs::Label => { // : - } CommandArgs::ReadFile => { // r } CommandArgs::WriteFile => { // w @@ -1017,20 +1036,6 @@ mod tests { assert_eq!(cmd.args, CommandArgs::EndGroup); } - #[test] - fn test_lookup_comment_command() { - let cmd = lookup_command('#').unwrap(); - assert_eq!(cmd.n_addr, 0); - assert_eq!(cmd.args, CommandArgs::Comment); - } - - #[test] - fn test_lookup_branch_command() { - let cmd = lookup_command('b').unwrap(); - assert_eq!(cmd.n_addr, 2); - assert_eq!(cmd.args, CommandArgs::Branch); - } - #[test] fn test_lookup_label_command() { let cmd = lookup_command(':').unwrap(); @@ -1786,7 +1791,6 @@ mod tests { let err = compile_subst_flags(&lines, &mut chars, &mut subst).unwrap_err(); 
assert!(err.to_string().contains("invalid substitute flag")); } - // compile_subst_command #[test] fn test_compile_subst_invalid_delimiter_backslash() { @@ -2027,4 +2031,45 @@ mod tests { assert_eq!(collect_codes(outer), vec!['{', 'b']); assert_eq!(collect_codes(inner), vec!['x', 'b']); } + + // compile_label_command + #[test] + fn test_compile_label_command() { + let (mut lines, mut chars) = make_providers(": foo"); + let mut cmd = Command::default(); + + compile_label_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Label(label) => { + let name = label.clone().unwrap(); + assert_eq!(name, "foo"); + } + _ => panic!("Expected CommandData::Label"), + } + } + + #[test] + fn test_compile_missing_label_command() { + let (mut lines, mut chars) = make_providers(": ;"); + let mut cmd = Command::default(); + + cmd.code = ':'; + let err = compile_label_command(&mut lines, &mut chars, &mut cmd).unwrap_err(); + assert!(err.to_string().contains("empty label")); + } + + #[test] + fn test_compile_empty_label_command() { + let (mut lines, mut chars) = make_providers("b ;"); + let mut cmd = Command::default(); + + cmd.code = 'b'; + compile_label_command(&mut lines, &mut chars, &mut cmd).unwrap(); + match &cmd.data { + CommandData::Label(label) => { + assert!(label.is_none()); + } + _ => panic!("Expected CommandData::Label(None)"), + } + } } diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 613685c6..5ada44ed 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -23,6 +23,7 @@ use crate::compiler::compile; use crate::processor::process_all_files; use clap::{Arg, ArgMatches, Command, arg}; use std::cell::RefCell; +use std::collections::HashMap; use std::path::PathBuf; use uucore::error::{UResult, UUsageError}; use uucore::format_usage; @@ -205,6 +206,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { input_action: None, hold: StringSpace::default(), parsed_block_nesting: 0, + label_to_command_map: 
HashMap::new(), } } From a404366670c4d0e3a21cca38f14119c1a27d63ba Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 16:47:15 +0300 Subject: [PATCH 82/85] Create map from labels to commands --- src/uu/sed/src/compiler.rs | 166 +++++++++++++++++++++++++++++-------- 1 file changed, 131 insertions(+), 35 deletions(-) diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 777b378a..11adc67d 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -209,6 +209,8 @@ pub fn compile( } patch_block_endings(result.clone()); + populate_label_map(result.clone(), context); + // Comment-out the following to show the compiled script. #[cfg(any())] dbg!(&result); @@ -268,6 +270,30 @@ fn patch_block_endings(head: Option>>) { patch_block_endings_to_parent(head, None); } +/// Populate the context's label map with references to associated commands. +fn populate_label_map(mut cur: Option>>, context: &mut ProcessingContext) { + while let Some(rc_cmd) = cur { + // Borrow mutably just long enough to inspect/rewire this node + let cmd = rc_cmd.borrow_mut(); + + // Extract any label to insert after borrow ends + let maybe_label = match &cmd.data { + CommandData::Block(Some(sub_head)) => { + populate_label_map(Some(sub_head.clone()), context); + None + } + CommandData::Label(Some(label)) => Some(label.clone()), + _ => None, + }; + + if let Some(label) = maybe_label { + context.label_to_command_map.insert(label, rc_cmd.clone()); + } + + cur = cmd.next.clone(); + } +} + // Compile provided scripts into a thread of commands fn compile_sequence( lines: &mut ScriptLineProvider, @@ -1902,7 +1928,7 @@ mod tests { // patch_block_endings // Create a command with the specified code. - fn cmd(code: char) -> Rc> { + fn command_with_code(code: char) -> Rc> { Rc::new(RefCell::new(Command { code, ..Default::default() @@ -1910,7 +1936,7 @@ mod tests { } // Link the vector of passed commands into a list, returning head. 
- fn link(cmds: Vec>>) -> Option>> { + fn link_commands(cmds: Vec>>) -> Option>> { for i in 0..cmds.len().saturating_sub(1) { cmds[i].borrow_mut().next = Some(cmds[i + 1].clone()); } @@ -1930,9 +1956,9 @@ mod tests { #[test] fn test_flat_chain() { - let a = cmd('a'); - let b = cmd('b'); - let head = link(vec![a.clone(), b.clone()]); + let a = command_with_code('a'); + let b = command_with_code('b'); + let head = link_commands(vec![a.clone(), b.clone()]); patch_block_endings(head.clone()); @@ -1942,14 +1968,14 @@ mod tests { #[test] fn test_simple_block_relinks_tail() { // a ; { x ; y ; } b - let a = cmd('a'); - let block = cmd('{'); - let x = cmd('x'); - let y = cmd('y'); - let b = cmd('b'); - - let head = link(vec![a.clone(), block.clone(), b.clone()]); - let sub_head = link(vec![x.clone(), y.clone()]); + let a = command_with_code('a'); + let block = command_with_code('{'); + let x = command_with_code('x'); + let y = command_with_code('y'); + let b = command_with_code('b'); + + let head = link_commands(vec![a.clone(), block.clone(), b.clone()]); + let sub_head = link_commands(vec![x.clone(), y.clone()]); block.borrow_mut().data = CommandData::Block(sub_head.clone()); patch_block_endings(head.clone()); @@ -1962,7 +1988,7 @@ mod tests { #[test] fn test_empty_block_no_panic() { - let a = cmd('a'); + let a = command_with_code('a'); a.borrow_mut().data = CommandData::Block(None); patch_block_endings(Some(a.clone())); @@ -1982,18 +2008,18 @@ mod tests { // n // } // b - let a = cmd('a'); - let b = cmd('b'); - let x = cmd('x'); - let y = cmd('y'); - let m = cmd('m'); - let n = cmd('n'); - let outer_block = cmd('{'); - let inner_block = cmd('{'); - - let head = link(vec![a.clone(), outer_block.clone(), b.clone()]); - let outer = link(vec![m.clone(), inner_block.clone(), n.clone()]); - let inner = link(vec![x.clone(), y.clone()]); + let a = command_with_code('a'); + let b = command_with_code('b'); + let x = command_with_code('x'); + let y = command_with_code('y'); + let m 
= command_with_code('m'); + let n = command_with_code('n'); + let outer_block = command_with_code('{'); + let inner_block = command_with_code('{'); + + let head = link_commands(vec![a.clone(), outer_block.clone(), b.clone()]); + let outer = link_commands(vec![m.clone(), inner_block.clone(), n.clone()]); + let inner = link_commands(vec![x.clone(), y.clone()]); outer_block.borrow_mut().data = CommandData::Block(outer.clone()); inner_block.borrow_mut().data = CommandData::Block(inner.clone()); @@ -2013,15 +2039,15 @@ mod tests { // } // } // b - let a = cmd('a'); - let b = cmd('b'); - let x = cmd('x'); - let outer_block = cmd('{'); - let inner_block = cmd('{'); - - let head = link(vec![a.clone(), outer_block.clone(), b.clone()]); - let outer = link(vec![inner_block.clone()]); - let inner = link(vec![x.clone()]); + let a = command_with_code('a'); + let b = command_with_code('b'); + let x = command_with_code('x'); + let outer_block = command_with_code('{'); + let inner_block = command_with_code('{'); + + let head = link_commands(vec![a.clone(), outer_block.clone(), b.clone()]); + let outer = link_commands(vec![inner_block.clone()]); + let inner = link_commands(vec![x.clone()]); outer_block.borrow_mut().data = CommandData::Block(outer.clone()); inner_block.borrow_mut().data = CommandData::Block(inner.clone()); @@ -2072,4 +2098,74 @@ mod tests { _ => panic!("Expected CommandData::Label(None)"), } } + + // populate_label_map + fn command_with_data(data: CommandData) -> Rc> { + Rc::new(RefCell::new(Command { + data, + ..Default::default() + })) + } + + #[test] + fn test_single_label() { + let cmd = command_with_data(CommandData::Label(Some("start".to_string()))); + let mut context = ProcessingContext::default(); + + populate_label_map(Some(cmd.clone()), &mut context); + + assert_eq!(context.label_to_command_map.len(), 1); + assert!(context.label_to_command_map.contains_key("start")); + assert!(Rc::ptr_eq(&context.label_to_command_map["start"], &cmd)); + } + + #[test] + fn 
test_label_inside_block() { + let nested = command_with_data(CommandData::Label(Some("inside".to_string()))); + let block = command_with_data(CommandData::Block(Some(nested.clone()))); + let mut context = ProcessingContext::default(); + + populate_label_map(Some(block.clone()), &mut context); + + assert_eq!(context.label_to_command_map.len(), 1); + assert!(context.label_to_command_map.contains_key("inside")); + assert!(Rc::ptr_eq(&context.label_to_command_map["inside"], &nested)); + } + + #[test] + fn test_multiple_labels() { + let a = command_with_data(CommandData::Label(Some("a".to_string()))); + let b = command_with_data(CommandData::Label(Some("b".to_string()))); + let head = link_commands(vec![a.clone(), b.clone()]); + + let mut context = ProcessingContext::default(); + populate_label_map(head, &mut context); + + assert_eq!(context.label_to_command_map.len(), 2); + assert!(context.label_to_command_map.contains_key("a")); + assert!(context.label_to_command_map.contains_key("b")); + } + + #[test] + fn test_no_labels() { + let a = command_with_data(CommandData::None); + let b = command_with_data(CommandData::None); + let head = link_commands(vec![a.clone(), b.clone()]); + + let mut context = ProcessingContext::default(); + populate_label_map(head, &mut context); + + assert_eq!(context.label_to_command_map.len(), 0); + } + + #[test] + fn test_label_none_is_ignored() { + let cmd = command_with_data(CommandData::Label(None)); + let mut context = ProcessingContext::default(); + + populate_label_map(Some(cmd.clone()), &mut context); + + // The map should remain empty since the label is None + assert_eq!(context.label_to_command_map.len(), 0); + } } From 1e190f3f6317b99a68a58cc70a38ca9f54ccf620 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 17:15:37 +0300 Subject: [PATCH 83/85] Resolve branch targets to commands --- src/uu/sed/src/compiler.rs | 143 ++++++++++++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) diff 
--git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 11adc67d..a748bc80 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -22,6 +22,7 @@ use once_cell::sync::Lazy; use regex::Regex; use std::cell::RefCell; use std::collections::HashMap; +use std::mem; use std::path::PathBuf; use std::rc::Rc; use uucore::error::{UResult, USimpleError}; @@ -204,18 +205,21 @@ pub fn compile( let mut empty_line = ScriptCharProvider::new(""); let result = compile_sequence(&mut make_providers, &mut empty_line, context)?; + // Link the ends of if context.parsed_block_nesting > 0 { return Err(USimpleError::new(1, "unmatched `{'")); } patch_block_endings(result.clone()); + // Link branch commands to the target label commands. populate_label_map(result.clone(), context); + resolve_branch_targets(result.clone(), context)?; // Comment-out the following to show the compiled script. #[cfg(any())] dbg!(&result); - // TODO: fix-up labels, check used labels, setup append & match structures + // TODO: setup append & match structures Ok(result) } @@ -294,7 +298,53 @@ fn populate_label_map(mut cur: Option>>, context: &mut Proce } } -// Compile provided scripts into a thread of commands +/// Replace branch labels with references to the corresponding commands. +/// Raise an error on undefined labels. 
+fn resolve_branch_targets( + mut cur: Option>>, + context: &mut ProcessingContext, +) -> UResult<()> { + while let Some(rc_cmd) = cur { + // Borrow mutably just long enough to inspect/rewire this node + let mut cmd = rc_cmd.borrow_mut(); + + // Recurse into blocks + if let CommandData::Block(Some(sub_head)) = &cmd.data { + resolve_branch_targets(Some(sub_head.clone()), context)?; + } + + // Only for 't' or 'b' commands: + if matches!(cmd.code, 't' | 'b') { + // Take ownership of the current data + let old_data = mem::replace(&mut cmd.data, CommandData::None); + + // Build the replacement + let new_data = match old_data { + CommandData::Label(Some(label)) => { + let target = context + .label_to_command_map + .get(&label) + .cloned() + .ok_or_else(|| { + USimpleError::new(2, format!("undefined label `{}'", label)) + })?; + CommandData::BranchTarget(Some(target)) + } + CommandData::Label(None) => CommandData::BranchTarget(None), + other => other, // put back anything else unchanged + }; + + // Store it back + cmd.data = new_data; + } + + // Advance to the next sibling + cur = cmd.next.clone(); + } + Ok(()) +} + +/// Compile provided scripts into a sequence of commands. 
fn compile_sequence( lines: &mut ScriptLineProvider, line: &mut ScriptCharProvider, @@ -2168,4 +2218,93 @@ mod tests { // The map should remain empty since the label is None assert_eq!(context.label_to_command_map.len(), 0); } + + #[test] + fn test_branch_target_resolved() { + let target = command_with_data(CommandData::Label(Some("end".to_string()))); + target.borrow_mut().code = ':'; + + let branch = command_with_data(CommandData::Label(Some("end".to_string()))); + branch.borrow_mut().code = 'b'; + + let head = link_commands(vec![branch.clone(), target.clone()]); + let mut context = ProcessingContext::default(); + + populate_label_map(head.clone(), &mut context); + let result = resolve_branch_targets(head.clone(), &mut context); + assert!(result.is_ok()); + + match &branch.borrow().data { + CommandData::BranchTarget(Some(ptr)) => { + assert!(Rc::ptr_eq(ptr, &target)); + } + _ => panic!("Expected BranchTarget(Some(...))"), + } + } + + // resolve_branch_targets + #[test] + fn test_branch_target_missing_label_gives_error() { + let branch = command_with_data(CommandData::Label(Some("nope".to_string()))); + branch.borrow_mut().code = 't'; + + let mut context = ProcessingContext::default(); + let result = resolve_branch_targets(Some(branch.clone()), &mut context); + + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("undefined label `nope'")); + } + + #[test] + fn test_branch_with_no_label_resolves_to_none() { + let branch = command_with_data(CommandData::Label(None)); + branch.borrow_mut().code = 'b'; + + let mut context = ProcessingContext::default(); + let result = resolve_branch_targets(Some(branch.clone()), &mut context); + + assert!(result.is_ok()); + match &branch.borrow().data { + CommandData::BranchTarget(None) => {} // ok + _ => panic!("Expected BranchTarget(None)"), + } + } + + #[test] + fn test_non_branch_label_is_unchanged() { + let cmd = command_with_data(CommandData::Label(Some("unchanged".to_string()))); + 
cmd.borrow_mut().code = 'q'; // not a branch command + + let mut context = ProcessingContext::default(); + let result = resolve_branch_targets(Some(cmd.clone()), &mut context); + assert!(result.is_ok()); + + match &cmd.borrow().data { + CommandData::Label(Some(label)) => assert_eq!(label, "unchanged"), + _ => panic!("Expected Label(Some(...)) to remain unchanged"), + } + } + + #[test] + fn test_branch_in_nested_block() { + let label = command_with_data(CommandData::Label(Some("inner".to_string()))); + label.borrow_mut().code = ':'; + + let branch = command_with_data(CommandData::Label(Some("inner".to_string()))); + branch.borrow_mut().code = 't'; + + let block = command_with_data(CommandData::Block(Some(label.clone()))); + let head = link_commands(vec![branch.clone(), block]); + + let mut context = ProcessingContext::default(); + populate_label_map(Some(label.clone()), &mut context); + let result = resolve_branch_targets(head.clone(), &mut context); + + assert!(result.is_ok()); + match &branch.borrow().data { + CommandData::BranchTarget(Some(ptr)) => assert!(Rc::ptr_eq(ptr, &label)), + _ => panic!("Expected BranchTarget(Some(...))"), + } + } } From fa5910d4e258a1e1b6a29f89293851666e487662 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 17:29:53 +0300 Subject: [PATCH 84/85] Report error on duplicate labels While at it fix the label population to avoid defining labels for the branch command targets. --- README.md | 5 +++- src/uu/sed/src/compiler.rs | 55 +++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 90cadc66..d66a6b86 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,10 @@ cargo run --release handling of regular expressions. This _sed_ program can also handle arbitrary byte sequences if no part of the input is treated as string. 
-- The last line (`$`) address is interpreted as the last non-empty line of +* The command will report an error and fail if duplicate labels are found + in the script. + This matches the BSD behavior. The GNU version accepts duplicate labels. +* The last line (`$`) address is interpreted as the last non-empty line of the last file. If files specified in subsequent arguments until the last one are empty, then the last line condition will never be triggered. This behavior is consistent with the diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index a748bc80..601289ca 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -212,7 +212,7 @@ pub fn compile( patch_block_endings(result.clone()); // Link branch commands to the target label commands. - populate_label_map(result.clone(), context); + populate_label_map(result.clone(), context)?; resolve_branch_targets(result.clone(), context)?; // Comment-out the following to show the compiled script. @@ -275,7 +275,10 @@ fn patch_block_endings(head: Option>>) { } /// Populate the context's label map with references to associated commands. 
-fn populate_label_map(mut cur: Option>>, context: &mut ProcessingContext) { +fn populate_label_map( + mut cur: Option>>, + context: &mut ProcessingContext, +) -> UResult<()> { while let Some(rc_cmd) = cur { // Borrow mutably just long enough to inspect/rewire this node let cmd = rc_cmd.borrow_mut(); @@ -283,7 +286,7 @@ fn populate_label_map(mut cur: Option>>, context: &mut Proce // Extract any label to insert after borrow ends let maybe_label = match &cmd.data { CommandData::Block(Some(sub_head)) => { - populate_label_map(Some(sub_head.clone()), context); + populate_label_map(Some(sub_head.clone()), context)?; None } CommandData::Label(Some(label)) => Some(label.clone()), @@ -291,11 +294,17 @@ fn populate_label_map(mut cur: Option>>, context: &mut Proce }; if let Some(label) = maybe_label { - context.label_to_command_map.insert(label, rc_cmd.clone()); + if cmd.code == ':' { + if context.label_to_command_map.contains_key(&label) { + return Err(USimpleError::new(2, format!("duplicate label `{}'", label))); + } + context.label_to_command_map.insert(label, rc_cmd.clone()); + } } cur = cmd.next.clone(); } + Ok(()) } /// Replace branch labels with references to the corresponding commands. 
@@ -2160,9 +2169,10 @@ mod tests { #[test] fn test_single_label() { let cmd = command_with_data(CommandData::Label(Some("start".to_string()))); + cmd.borrow_mut().code = ':'; let mut context = ProcessingContext::default(); - populate_label_map(Some(cmd.clone()), &mut context); + populate_label_map(Some(cmd.clone()), &mut context).unwrap(); assert_eq!(context.label_to_command_map.len(), 1); assert!(context.label_to_command_map.contains_key("start")); @@ -2172,10 +2182,11 @@ mod tests { #[test] fn test_label_inside_block() { let nested = command_with_data(CommandData::Label(Some("inside".to_string()))); + nested.borrow_mut().code = ':'; let block = command_with_data(CommandData::Block(Some(nested.clone()))); let mut context = ProcessingContext::default(); - populate_label_map(Some(block.clone()), &mut context); + populate_label_map(Some(block.clone()), &mut context).unwrap(); assert_eq!(context.label_to_command_map.len(), 1); assert!(context.label_to_command_map.contains_key("inside")); @@ -2185,11 +2196,13 @@ mod tests { #[test] fn test_multiple_labels() { let a = command_with_data(CommandData::Label(Some("a".to_string()))); + a.borrow_mut().code = ':'; let b = command_with_data(CommandData::Label(Some("b".to_string()))); + b.borrow_mut().code = ':'; let head = link_commands(vec![a.clone(), b.clone()]); let mut context = ProcessingContext::default(); - populate_label_map(head, &mut context); + populate_label_map(head, &mut context).unwrap(); assert_eq!(context.label_to_command_map.len(), 2); assert!(context.label_to_command_map.contains_key("a")); @@ -2203,7 +2216,7 @@ mod tests { let head = link_commands(vec![a.clone(), b.clone()]); let mut context = ProcessingContext::default(); - populate_label_map(head, &mut context); + populate_label_map(head, &mut context).unwrap(); assert_eq!(context.label_to_command_map.len(), 0); } @@ -2213,12 +2226,31 @@ mod tests { let cmd = command_with_data(CommandData::Label(None)); let mut context = ProcessingContext::default(); - 
populate_label_map(Some(cmd.clone()), &mut context); + populate_label_map(Some(cmd.clone()), &mut context).unwrap(); // The map should remain empty since the label is None assert_eq!(context.label_to_command_map.len(), 0); } + #[test] + fn test_duplicate_label_gives_error() { + let a1 = command_with_data(CommandData::Label(Some("dup".to_string()))); + a1.borrow_mut().code = ':'; + + let a2 = command_with_data(CommandData::Label(Some("dup".to_string()))); + a2.borrow_mut().code = ':'; + + let head = link_commands(vec![a1.clone(), a2.clone()]); + let mut context = ProcessingContext::default(); + + let result = populate_label_map(head, &mut context); + + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("duplicate label `dup'")); + } + + // resolve_branch_targets #[test] fn test_branch_target_resolved() { let target = command_with_data(CommandData::Label(Some("end".to_string()))); @@ -2230,7 +2262,7 @@ mod tests { let head = link_commands(vec![branch.clone(), target.clone()]); let mut context = ProcessingContext::default(); - populate_label_map(head.clone(), &mut context); + populate_label_map(head.clone(), &mut context).unwrap(); let result = resolve_branch_targets(head.clone(), &mut context); assert!(result.is_ok()); @@ -2242,7 +2274,6 @@ mod tests { } } - // resolve_branch_targets #[test] fn test_branch_target_missing_label_gives_error() { let branch = command_with_data(CommandData::Label(Some("nope".to_string()))); @@ -2298,7 +2329,7 @@ mod tests { let head = link_commands(vec![branch.clone(), block]); let mut context = ProcessingContext::default(); - populate_label_map(Some(label.clone()), &mut context); + populate_label_map(Some(label.clone()), &mut context).unwrap(); let result = resolve_branch_targets(head.clone(), &mut context); assert!(result.is_ok()); From 039d629122ae2d21002844f41f62272f38995b61 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sat, 17 May 2025 19:38:20 +0300 Subject: [PATCH 85/85] 
Implement processing of branch (b and t) commands --- README.md | 2 + src/uu/sed/src/command.rs | 2 + src/uu/sed/src/compiler.rs | 5 +- src/uu/sed/src/processor.rs | 40 +++++-- src/uu/sed/src/sed.rs | 1 + tests/by-util/test_sed.rs | 112 ++++++++++++++++++ .../sed/output/branch_conditional_boundary | 14 +++ .../sed/output/branch_conditional_simple | 14 +++ tests/fixtures/sed/output/branch_cycle_clears | 5 + tests/fixtures/sed/output/branch_plain | 17 +++ tests/fixtures/sed/output/branch_test_clears | 14 +++ tests/fixtures/sed/output/branch_to_block | 12 ++ 12 files changed, 229 insertions(+), 9 deletions(-) create mode 100644 tests/fixtures/sed/output/branch_conditional_boundary create mode 100644 tests/fixtures/sed/output/branch_conditional_simple create mode 100644 tests/fixtures/sed/output/branch_cycle_clears create mode 100644 tests/fixtures/sed/output/branch_plain create mode 100644 tests/fixtures/sed/output/branch_test_clears create mode 100644 tests/fixtures/sed/output/branch_to_block diff --git a/README.md b/README.md index d66a6b86..b50e8292 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,8 @@ cargo run --release one are empty, then the last line condition will never be triggered. This behavior is consistent with the [original implementation](https://github.com/dspinellis/unix-history-repo/blob/Research-V7/usr/src/cmd/sed/sed1.c#L665). +* Labels are parsed for alphanumeric characters. The BSD version parses them + until the end of the line, preventing `;` from being used as a separator. 
## License diff --git a/src/uu/sed/src/command.rs b/src/uu/sed/src/command.rs index ac15dfe4..af673f19 100644 --- a/src/uu/sed/src/command.rs +++ b/src/uu/sed/src/command.rs @@ -63,6 +63,8 @@ pub struct ProcessingContext { pub parsed_block_nesting: usize, /// Command associated with each label pub label_to_command_map: HashMap>>, + /// True if a substitution was made as specified in the t command + pub substitution_made: bool, } #[derive(Clone, Debug, Default, PartialEq)] diff --git a/src/uu/sed/src/compiler.rs b/src/uu/sed/src/compiler.rs index 601289ca..084eec8f 100644 --- a/src/uu/sed/src/compiler.rs +++ b/src/uu/sed/src/compiler.rs @@ -196,6 +196,7 @@ fn build_command_map() -> HashMap { formats.into_iter().map(|f| (f.code, f)).collect() } +/// Compile the scripts into an executable data structure. pub fn compile( scripts: Vec, context: &mut ProcessingContext, @@ -205,7 +206,7 @@ pub fn compile( let mut empty_line = ScriptCharProvider::new(""); let result = compile_sequence(&mut make_providers, &mut empty_line, context)?; - // Link the ends of + // Link the ends of command blocks to their following commands. 
if context.parsed_block_nesting > 0 { return Err(USimpleError::new(1, "unmatched `{'")); } @@ -961,7 +962,7 @@ fn compile_label_command( line.eat_spaces(); // Skip any leading whitespace let mut label = String::new(); - while !line.eol() && line.current().is_ascii_alphabetic() { + while !line.eol() && line.current().is_ascii_alphanumeric() { label.push(line.current()); line.advance(); } diff --git a/src/uu/sed/src/processor.rs b/src/uu/sed/src/processor.rs index d11f4670..a799ceb9 100644 --- a/src/uu/sed/src/processor.rs +++ b/src/uu/sed/src/processor.rs @@ -165,7 +165,7 @@ fn write_chunk( fn substitute( pattern: &mut IOChunk, sub: &mut Substitution, - context: &ProcessingContext, + context: &mut ProcessingContext, output: &mut OutputBuffer, ) -> UResult<()> { let mut count = 0; @@ -208,6 +208,7 @@ fn substitute( if let Some(ref writer) = sub.write_file { writer.borrow_mut().write_line(pattern.as_str()?)?; } + context.substitution_made = true; } Ok(()) @@ -243,11 +244,11 @@ fn process_file( output: &mut OutputBuffer, context: &mut ProcessingContext, ) -> UResult<()> { - // Loop over the input lines + // Loop over the input lines as pattern space. 'lines: while let Some((mut pattern, last_line)) = reader.get_line()? { context.last_line = last_line; context.line_number += 1; - + context.substitution_made = false; // Set the script command from which to start. let mut current: Option>> = if let Some(action) = context.input_action.take() { @@ -265,7 +266,7 @@ fn process_file( }; // Loop over script commands. - while let Some(command_rc) = current { + while let Some(command_rc) = current.clone() { let mut command = command_rc.borrow_mut(); if !applies(&mut command, &mut pattern, context)? { @@ -290,7 +291,18 @@ fn process_file( // TODO } 'b' => { - // TODO + // Branch to the specified label or end if none is given. 
+ let CommandData::BranchTarget(target) = &command.data else { + panic!("Expected BranchTarget command data"); + }; + if target.is_some() { + // New command to execute + current = target.clone(); + continue; + } else { + // Branch to the end of the script. + break; + } } 'c' => { // TODO @@ -387,8 +399,22 @@ fn process_file( substitute(&mut pattern, &mut *subst, context, output)?; } + 't' if !context.substitution_made => { /* Do nothing. */ } 't' => { - // TODO + // Branch to the specified label or end if none is given + // if a substitution was made since last cycle or t. + let CommandData::BranchTarget(target) = &command.data else { + panic!("Expected BranchTarget command data"); + }; + context.substitution_made = false; + if target.is_some() { + // New command to execute + current = target.clone(); + continue; + } else { + // Branch to the end of the script. + break; + } } 'w' => { // TODO @@ -408,7 +434,7 @@ fn process_file( transliterate(&mut pattern, trans)?; } ':' => { - // TODO + // Branch target; do nothing. 
} '=' => { // TODO diff --git a/src/uu/sed/src/sed.rs b/src/uu/sed/src/sed.rs index 5ada44ed..776edede 100644 --- a/src/uu/sed/src/sed.rs +++ b/src/uu/sed/src/sed.rs @@ -207,6 +207,7 @@ fn build_context(matches: &ArgMatches) -> ProcessingContext { hold: StringSpace::default(), parsed_block_nesting: 0, label_to_command_map: HashMap::new(), + substitution_made: false, } } diff --git a/tests/by-util/test_sed.rs b/tests/by-util/test_sed.rs index 2f8034e5..74517d81 100644 --- a/tests/by-util/test_sed.rs +++ b/tests/by-util/test_sed.rs @@ -353,3 +353,115 @@ check_output!( LINES1 ] ); + +check_output!( + branch_plain, + [ + "-n", + "-e", + r#" +b label4 +:label3 +s/^/label3_/p +b end +:label4 +2,12b label1 +b label2 +:label1 +s/^/label1_/p +b +:label2 +s/^/label2_/p +b label3 +:end +"#, + LINES1 + ] +); + +check_output!( + branch_conditional_simple, + [ + "-n", + "-e", + r#" +s/l1_/l2_/ +t ok +b +:ok +s/^/tested /p +"#, + LINES1, + LINES2 + ] +); + +// SunOS and GNU sed behave as follows: lines 9-$ aren't printed at all +check_output!( + branch_to_block, + [ + "-n", + "-e", + r#" +5,8b inside +1,5 { + s/^/^/p + :inside + s/$/$/p +} +"#, + LINES1 + ] +); + +// Check that t clears the substitution done flag +check_output!( + branch_test_clears, + [ + "-n", + "-e", + r#" +1,8s/^/^/ +t l1 +:l1 +t l2 +s/$/$/p +b +:l2 +s/^/ERROR/ +"#, + LINES1 + ] +); + +// Check that reading a line clears the substitution done flag +check_output!( + branch_cycle_clears, + [ + "-n", + "-e", + r#" +t l2 +1,8s/^/^/p +2,7N +b +:l2 +s/^/ERROR/p +"#, + LINES1 + ] +); + +check_output!( + branch_conditional_boundary, + [ + "-e", + r#" +{ +:b +} +s/l/m/ +tb"#, + LINES1 + ] +); diff --git a/tests/fixtures/sed/output/branch_conditional_boundary b/tests/fixtures/sed/output/branch_conditional_boundary new file mode 100644 index 00000000..1557318a --- /dev/null +++ b/tests/fixtures/sed/output/branch_conditional_boundary @@ -0,0 +1,14 @@ +m1_1 +m1_2 +m1_3 +m1_4 +m1_5 +m1_6 +m1_7 +m1_8 +m1_9 +m1_10 +m1_11 
+m1_12 +m1_13 +m1_14 diff --git a/tests/fixtures/sed/output/branch_conditional_simple b/tests/fixtures/sed/output/branch_conditional_simple new file mode 100644 index 00000000..ec339f43 --- /dev/null +++ b/tests/fixtures/sed/output/branch_conditional_simple @@ -0,0 +1,14 @@ +tested l2_1 +tested l2_2 +tested l2_3 +tested l2_4 +tested l2_5 +tested l2_6 +tested l2_7 +tested l2_8 +tested l2_9 +tested l2_10 +tested l2_11 +tested l2_12 +tested l2_13 +tested l2_14 diff --git a/tests/fixtures/sed/output/branch_cycle_clears b/tests/fixtures/sed/output/branch_cycle_clears new file mode 100644 index 00000000..6766fea1 --- /dev/null +++ b/tests/fixtures/sed/output/branch_cycle_clears @@ -0,0 +1,5 @@ +^l1_1 +^l1_2 +^l1_4 +^l1_6 +^l1_8 diff --git a/tests/fixtures/sed/output/branch_plain b/tests/fixtures/sed/output/branch_plain new file mode 100644 index 00000000..ac9202e6 --- /dev/null +++ b/tests/fixtures/sed/output/branch_plain @@ -0,0 +1,17 @@ +label2_l1_1 +label3_label2_l1_1 +label1_l1_2 +label1_l1_3 +label1_l1_4 +label1_l1_5 +label1_l1_6 +label1_l1_7 +label1_l1_8 +label1_l1_9 +label1_l1_10 +label1_l1_11 +label1_l1_12 +label2_l1_13 +label3_label2_l1_13 +label2_l1_14 +label3_label2_l1_14 diff --git a/tests/fixtures/sed/output/branch_test_clears b/tests/fixtures/sed/output/branch_test_clears new file mode 100644 index 00000000..a9ec85bf --- /dev/null +++ b/tests/fixtures/sed/output/branch_test_clears @@ -0,0 +1,14 @@ +^l1_1$ +^l1_2$ +^l1_3$ +^l1_4$ +^l1_5$ +^l1_6$ +^l1_7$ +^l1_8$ +l1_9$ +l1_10$ +l1_11$ +l1_12$ +l1_13$ +l1_14$ diff --git a/tests/fixtures/sed/output/branch_to_block b/tests/fixtures/sed/output/branch_to_block new file mode 100644 index 00000000..e1ddb547 --- /dev/null +++ b/tests/fixtures/sed/output/branch_to_block @@ -0,0 +1,12 @@ +^l1_1 +^l1_1$ +^l1_2 +^l1_2$ +^l1_3 +^l1_3$ +^l1_4 +^l1_4$ +l1_5$ +l1_6$ +l1_7$ +l1_8$