diff --git a/src/uu/ptx/locales/en-US.ftl b/src/uu/ptx/locales/en-US.ftl index 402b2702b47..9d62b4ae4b6 100644 --- a/src/uu/ptx/locales/en-US.ftl +++ b/src/uu/ptx/locales/en-US.ftl @@ -28,3 +28,5 @@ ptx-error-dumb-format = There is no dumb format with GNU extensions disabled ptx-error-not-implemented = { $feature } not implemented yet ptx-error-write-failed = write failed ptx-error-extra-operand = extra operand { $operand } +ptx-error-empty-regexp = A regular expression cannot match a length zero string +ptx-error-invalid-regexp = Invalid regexp: { $error } diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index d3b9d103ce1..103e854c59b 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -19,7 +19,7 @@ use clap::{Arg, ArgAction, Command}; use regex::Regex; use thiserror::Error; use uucore::display::Quotable; -use uucore::error::{FromIo, UError, UResult, UUsageError}; +use uucore::error::{FromIo, UError, UResult, USimpleError, UUsageError}; use uucore::format_usage; use uucore::translate; @@ -43,6 +43,7 @@ struct Config { context_regex: String, line_width: usize, gap_size: usize, + sentence_regex: Option<String>, } impl Default for Config { @@ -59,6 +60,7 @@ impl Default for Config { context_regex: "\\w+".to_owned(), line_width: 72, gap_size: 3, + sentence_regex: None, } } } @@ -197,16 +199,13 @@ struct WordRef { #[derive(Debug, Error)] enum PtxError { - #[error("{}", translate!("ptx-error-not-implemented", "feature" => (*.0)))] - NotImplemented(&'static str), - #[error("{0}")] ParseError(ParseIntError), } impl UError for PtxError {} -fn get_config(matches: &clap::ArgMatches) -> UResult<Config> { +fn get_config(matches: &mut clap::ArgMatches) -> UResult<Config> { let mut config = Config::default(); let err_msg = "parsing options failed"; if matches.get_flag(options::TRADITIONAL) { @@ -214,8 +213,19 @@ fn get_config(matches: &clap::ArgMatches) -> UResult<Config> { config.format = OutFormat::Roff; "[^ \t\n]+".clone_into(&mut config.context_regex); } - if 
matches.contains_id(options::SENTENCE_REGEXP) { - return Err(PtxError::NotImplemented("-S").into()); + if let Some(regex) = matches.remove_one::<String>(options::SENTENCE_REGEXP) { + // TODO: The regex crate used here is not fully compatible with GNU's regex implementation. + // For example, it does not support backreferences. + // In the future, we might want to switch to the onig crate (like expr does) for better compatibility. + + // Verify regex is valid and doesn't match empty string + if let Ok(re) = Regex::new(&regex) { + if re.is_match("") { + return Err(USimpleError::new(1, translate!("ptx-error-empty-regexp"))); + } + } + + config.sentence_regex = Some(regex); } config.auto_ref = matches.get_flag(options::AUTO_REFERENCE); config.input_ref = matches.get_flag(options::REFERENCES); @@ -271,17 +281,30 @@ struct FileContent { type FileMap = HashMap<OsString, FileContent>; -fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> { +fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result<FileMap> { let mut file_map: FileMap = HashMap::new(); let mut offset: usize = 0; + + let sentence_splitter = if let Some(re_str) = &config.sentence_regex { + Some(Regex::new(re_str).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + translate!("ptx-error-invalid-regexp", "error" => e), + ) + })?) + } else { + None + }; + for filename in input_files { - let reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" { + let mut reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" { Box::new(stdin()) } else { let file = File::open(Path::new(filename))?; Box::new(file) }); - let lines: Vec<String> = reader.lines().collect::<std::io::Result<Vec<String>>>()?; + + let lines = read_lines(sentence_splitter.as_ref(), &mut reader)?; // Indexing UTF-8 string requires walking from the beginning, which can hurts performance badly when the line is long. // Since we will be jumping around the line a lot, we dump the content into a Vec<char>, which can be indexed in constant time. 
@@ -300,6 +323,24 @@ fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> { Ok(file_map) } +fn read_lines( + sentence_splitter: Option<&Regex>, + reader: &mut dyn BufRead, +) -> std::io::Result<Vec<String>> { + if let Some(re) = sentence_splitter { + let mut buffer = String::new(); + reader.read_to_string(&mut buffer)?; + + Ok(re + .split(&buffer) + .map(|s| s.replace('\n', " ")) // ptx behavior: newlines become spaces inside sentences + .filter(|s| !s.is_empty()) // remove empty sentences + .collect()) + } else { + reader.lines().collect() + } +} + /// Go through every lines in the input files and record each match occurrence as a `WordRef`. fn create_word_set(config: &Config, filter: &WordFilter, file_map: &FileMap) -> BTreeSet<WordRef> { let reg = Regex::new(&filter.word_regex).unwrap(); @@ -844,8 +885,8 @@ mod options { #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; - let mut config = get_config(&matches)?; + let mut matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; + let mut config = get_config(&mut matches)?; let input_files; let output_file: OsString; @@ -877,7 +918,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } let word_filter = WordFilter::new(&matches, &config)?; - let file_map = read_input(&input_files).map_err_context(String::new)?; + let file_map = read_input(&input_files, &config).map_err_context(String::new)?; let word_set = create_word_set(&config, &word_filter, &file_map); write_traditional_output(&mut config, &file_map, &word_set, &output_file) } diff --git a/tests/by-util/test_ptx.rs b/tests/by-util/test_ptx.rs index 464dcf6aead..0b1da0d0488 100644 --- a/tests/by-util/test_ptx.rs +++ b/tests/by-util/test_ptx.rs @@ -257,6 +257,43 @@ fn test_utf8() { .stdout_only("\\xx {}{it’s}{disabled}{}{}\n\\xx {}{}{it’s}{ disabled}{}\n"); } +#[test] +fn test_sentence_regexp_basic() { + new_ucmd!() + .args(&["-G", "-S", "\\."]) + 
.pipe_in("Hello. World.") + .succeeds() + .stdout_contains("Hello") + .stdout_contains("World"); +} + +#[test] +fn test_sentence_regexp_split_behavior() { + new_ucmd!() + .args(&["-G", "-w", "50", "-S", "[.!]"]) + .pipe_in("One sentence. Two sentence!") + .succeeds() + .stdout_contains("One sentence") + .stdout_contains("Two sentence"); +} + +#[test] +fn test_sentence_regexp_empty_match_failure() { + new_ucmd!() + .args(&["-G", "-S", "^"]) + .fails() + .stderr_contains("A regular expression cannot match a length zero string"); +} + +#[test] +fn test_sentence_regexp_newlines_are_spaces() { + new_ucmd!() + .args(&["-G", "-S", "\\."]) + .pipe_in("Start of\nsentence.") + .succeeds() + .stdout_contains("Start of sentence"); +} + #[test] fn test_gnu_mode_dumb_format() { // Test GNU mode (dumb format) - the default mode without -G flag