From cf3d94912e0531eafbcb72ae6cd05ed3543cef50 Mon Sep 17 00:00:00 2001 From: CrazyRoka Date: Wed, 17 Dec 2025 00:14:27 +0000 Subject: [PATCH 1/6] ptx: implement -S/--sentence-regexp --- src/uu/ptx/src/ptx.rs | 50 +++++++++++++++++++++++++++++++-------- tests/by-util/test_ptx.rs | 38 +++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index d3b9d103ce1..953f2d1b4ad 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -19,7 +19,7 @@ use clap::{Arg, ArgAction, Command}; use regex::Regex; use thiserror::Error; use uucore::display::Quotable; -use uucore::error::{FromIo, UError, UResult, UUsageError}; +use uucore::error::{FromIo, UError, UResult, USimpleError, UUsageError}; use uucore::format_usage; use uucore::translate; @@ -43,6 +43,7 @@ struct Config { context_regex: String, line_width: usize, gap_size: usize, + sentence_regex: Option, } impl Default for Config { @@ -59,6 +60,7 @@ impl Default for Config { context_regex: "\\w+".to_owned(), line_width: 72, gap_size: 3, + sentence_regex: None, } } } @@ -197,9 +199,6 @@ struct WordRef { #[derive(Debug, Error)] enum PtxError { - #[error("{}", translate!("ptx-error-not-implemented", "feature" => (*.0)))] - NotImplemented(&'static str), - #[error("{0}")] ParseError(ParseIntError), } @@ -214,8 +213,18 @@ fn get_config(matches: &clap::ArgMatches) -> UResult { config.format = OutFormat::Roff; "[^ \t\n]+".clone_into(&mut config.context_regex); } - if matches.contains_id(options::SENTENCE_REGEXP) { - return Err(PtxError::NotImplemented("-S").into()); + if let Some(regex) = matches.get_one::(options::SENTENCE_REGEXP) { + config.sentence_regex = Some(regex.clone()); + + // Verify regex is valid and doesn't match empty string + if let Ok(re) = Regex::new(regex) { + if re.is_match("") { + return Err(USimpleError::new( + 1, + "A regular expression cannot match a length zero string", + )); + } + } } config.auto_ref = 
matches.get_flag(options::AUTO_REFERENCE); config.input_ref = matches.get_flag(options::REFERENCES); @@ -271,17 +280,38 @@ struct FileContent { type FileMap = HashMap; -fn read_input(input_files: &[OsString]) -> std::io::Result { +fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result { let mut file_map: FileMap = HashMap::new(); let mut offset: usize = 0; + + let sentence_splitter = + if let Some(re_str) = &config.sentence_regex { + Some(Regex::new(re_str).map_err(|_| { + std::io::Error::new(std::io::ErrorKind::InvalidInput, "Invalid regex") + })?) + } else { + None + }; + for filename in input_files { - let reader: BufReader> = BufReader::new(if filename == "-" { + let mut reader: BufReader> = BufReader::new(if filename == "-" { Box::new(stdin()) } else { let file = File::open(Path::new(filename))?; Box::new(file) }); - let lines: Vec = reader.lines().collect::>>()?; + + let lines = if let Some(re) = &sentence_splitter { + let mut buffer = String::new(); + reader.read_to_string(&mut buffer)?; + + re.split(&buffer) + .map(|s| s.replace("\n", " ")) // ptx behavior: newlines become spaces inside sentences + .filter(|s| !s.is_empty()) // remove empty sentences + .collect() + } else { + reader.lines().collect::>>()? + }; // Indexing UTF-8 string requires walking from the beginning, which can hurts performance badly when the line is long. // Since we will be jumping around the line a lot, we dump the content into a Vec, which can be indexed in constant time. 
@@ -877,7 +907,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } let word_filter = WordFilter::new(&matches, &config)?; - let file_map = read_input(&input_files).map_err_context(String::new)?; + let file_map = read_input(&input_files, &config).map_err_context(String::new)?; let word_set = create_word_set(&config, &word_filter, &file_map); write_traditional_output(&mut config, &file_map, &word_set, &output_file) } diff --git a/tests/by-util/test_ptx.rs b/tests/by-util/test_ptx.rs index 464dcf6aead..b8a87f2afd1 100644 --- a/tests/by-util/test_ptx.rs +++ b/tests/by-util/test_ptx.rs @@ -257,6 +257,44 @@ fn test_utf8() { .stdout_only("\\xx {}{it’s}{disabled}{}{}\n\\xx {}{}{it’s}{ disabled}{}\n"); } +#[test] +fn test_sentence_regexp_basic() { + new_ucmd!() + .args(&["-G", "-S", "\\."]) + .pipe_in("Hello. World.") + .succeeds() + .stdout_contains("Hello") + .stdout_contains("World"); +} + +#[test] +fn test_sentence_regexp_split_behavior() { + new_ucmd!() + .args(&["-G", "-w", "50", "-S", "[.!]"]) + .pipe_in("One sentence. 
Two sentence!") + .succeeds() + .stdout_contains("One sentence") + .stdout_contains("Two sentence"); +} + +#[test] +fn test_sentence_regexp_empty_match_failure() { + new_ucmd!() + .args(&["-G", "-S", "^"]) + .pipe_in("Input") + .fails() + .stderr_contains("A regular expression cannot match a length zero string"); +} + +#[test] +fn test_sentence_regexp_newlines_are_spaces() { + new_ucmd!() + .args(&["-G", "-S", "\\."]) + .pipe_in("Start of\nsentence.") + .succeeds() + .stdout_contains("Start of sentence"); +} + #[test] fn test_gnu_mode_dumb_format() { // Test GNU mode (dumb format) - the default mode without -G flag From 64929fb3298669440f2d04cd2f20d68627751d60 Mon Sep 17 00:00:00 2001 From: Rostyslav Toch Date: Wed, 24 Dec 2025 11:03:06 +0000 Subject: [PATCH 2/6] fix: resolve clippy single_char_pattern warning --- src/uu/ptx/src/ptx.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index 953f2d1b4ad..b8b47411fdc 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -306,7 +306,7 @@ fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result Date: Wed, 24 Dec 2025 11:52:59 +0000 Subject: [PATCH 3/6] ptx: add TODO comment about regex compatibility --- src/uu/ptx/src/ptx.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index b8b47411fdc..2a53158925c 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -216,6 +216,10 @@ fn get_config(matches: &clap::ArgMatches) -> UResult { if let Some(regex) = matches.get_one::(options::SENTENCE_REGEXP) { config.sentence_regex = Some(regex.clone()); + // TODO: The regex crate used here is not fully compatible with GNU's regex implementation. + // For example, it does not support backreferences. + // In the future, we might want to switch to the onig crate (like expr does) for better compatibility. 
+ // Verify regex is valid and doesn't match empty string if let Ok(re) = Regex::new(regex) { if re.is_match("") { From 4e7785afef2b73c9604cfeaf4704706fecbea8a8 Mon Sep 17 00:00:00 2001 From: Rostyslav Toch Date: Wed, 24 Dec 2025 13:47:42 +0000 Subject: [PATCH 4/6] test: fix broken pipe panic in test_sentence_regexp_empty_match_failure --- tests/by-util/test_ptx.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/by-util/test_ptx.rs b/tests/by-util/test_ptx.rs index b8a87f2afd1..0b1da0d0488 100644 --- a/tests/by-util/test_ptx.rs +++ b/tests/by-util/test_ptx.rs @@ -281,7 +281,6 @@ fn test_sentence_regexp_split_behavior() { fn test_sentence_regexp_empty_match_failure() { new_ucmd!() .args(&["-G", "-S", "^"]) - .pipe_in("Input") .fails() .stderr_contains("A regular expression cannot match a length zero string"); } From b5e643bd621ec29248256c0dbfc3d01780c61f4b Mon Sep 17 00:00:00 2001 From: Rostyslav Toch Date: Sat, 27 Dec 2025 22:00:17 +0000 Subject: [PATCH 5/6] ptx: add translation and avoid cloning --- src/uu/ptx/locales/en-US.ftl | 2 ++ src/uu/ptx/src/ptx.rs | 37 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/uu/ptx/locales/en-US.ftl b/src/uu/ptx/locales/en-US.ftl index 402b2702b47..9d62b4ae4b6 100644 --- a/src/uu/ptx/locales/en-US.ftl +++ b/src/uu/ptx/locales/en-US.ftl @@ -28,3 +28,5 @@ ptx-error-dumb-format = There is no dumb format with GNU extensions disabled ptx-error-not-implemented = { $feature } not implemented yet ptx-error-write-failed = write failed ptx-error-extra-operand = extra operand { $operand } +ptx-error-empty-regexp = A regular expression cannot match a length zero string +ptx-error-invalid-regexp = Invalid regexp: { $error } diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index 2a53158925c..2db894fe5ea 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -205,7 +205,7 @@ enum PtxError { impl UError for PtxError {} -fn get_config(matches: &clap::ArgMatches) 
-> UResult { +fn get_config(matches: &mut clap::ArgMatches) -> UResult { let mut config = Config::default(); let err_msg = "parsing options failed"; if matches.get_flag(options::TRADITIONAL) { @@ -213,22 +213,19 @@ fn get_config(matches: &clap::ArgMatches) -> UResult { config.format = OutFormat::Roff; "[^ \t\n]+".clone_into(&mut config.context_regex); } - if let Some(regex) = matches.get_one::(options::SENTENCE_REGEXP) { - config.sentence_regex = Some(regex.clone()); - + if let Some(regex) = matches.remove_one::(options::SENTENCE_REGEXP) { // TODO: The regex crate used here is not fully compatible with GNU's regex implementation. // For example, it does not support backreferences. // In the future, we might want to switch to the onig crate (like expr does) for better compatibility. // Verify regex is valid and doesn't match empty string - if let Ok(re) = Regex::new(regex) { + if let Ok(re) = Regex::new(&regex) { if re.is_match("") { - return Err(USimpleError::new( - 1, - "A regular expression cannot match a length zero string", - )); + return Err(USimpleError::new(1, translate!("ptx-error-empty-regexp"))); } } + + config.sentence_regex = Some(regex); } config.auto_ref = matches.get_flag(options::AUTO_REFERENCE); config.input_ref = matches.get_flag(options::REFERENCES); @@ -288,14 +285,16 @@ fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result e), ) })?) 
+ } else { + None + }; for filename in input_files { let mut reader: BufReader> = BufReader::new(if filename == "-" { @@ -878,8 +877,8 @@ mod options { #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; - let mut config = get_config(&matches)?; + let mut matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; + let mut config = get_config(&mut matches)?; let input_files; let output_file: OsString; From 4ef57f5d7e338057b92e2ccb51b975df38880a25 Mon Sep 17 00:00:00 2001 From: Rostyslav Toch Date: Sat, 27 Dec 2025 23:07:05 +0000 Subject: [PATCH 6/6] ptx: refactor line reading logic into read_lines helper function --- src/uu/ptx/src/ptx.rs | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index 2db894fe5ea..103e854c59b 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -304,17 +304,7 @@ fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result>>()? - }; + let lines = read_lines(sentence_splitter.as_ref(), &mut reader)?; // Indexing UTF-8 string requires walking from the beginning, which can hurts performance badly when the line is long. // Since we will be jumping around the line a lot, we dump the content into a Vec, which can be indexed in constant time. 
@@ -333,6 +323,24 @@ fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result, + reader: &mut dyn BufRead, +) -> std::io::Result> { + if let Some(re) = sentence_splitter { + let mut buffer = String::new(); + reader.read_to_string(&mut buffer)?; + + Ok(re + .split(&buffer) + .map(|s| s.replace('\n', " ")) // ptx behavior: newlines become spaces inside sentences + .filter(|s| !s.is_empty()) // remove empty sentences + .collect()) + } else { + reader.lines().collect() + } +} + /// Go through every lines in the input files and record each match occurrence as a `WordRef`. fn create_word_set(config: &Config, filter: &WordFilter, file_map: &FileMap) -> BTreeSet { let reg = Regex::new(&filter.word_regex).unwrap();