Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/uu/ptx/locales/en-US.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ ptx-error-dumb-format = There is no dumb format with GNU extensions disabled
ptx-error-not-implemented = { $feature } not implemented yet
ptx-error-write-failed = write failed
ptx-error-extra-operand = extra operand { $operand }
ptx-error-empty-regexp = A regular expression cannot match a length zero string
ptx-error-invalid-regexp = Invalid regexp: { $error }
67 changes: 54 additions & 13 deletions src/uu/ptx/src/ptx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use clap::{Arg, ArgAction, Command};
use regex::Regex;
use thiserror::Error;
use uucore::display::Quotable;
use uucore::error::{FromIo, UError, UResult, UUsageError};
use uucore::error::{FromIo, UError, UResult, USimpleError, UUsageError};
use uucore::format_usage;
use uucore::translate;

Expand All @@ -43,6 +43,7 @@ struct Config {
context_regex: String,
line_width: usize,
gap_size: usize,
sentence_regex: Option<String>,
}

impl Default for Config {
Expand All @@ -59,6 +60,7 @@ impl Default for Config {
context_regex: "\\w+".to_owned(),
line_width: 72,
gap_size: 3,
sentence_regex: None,
}
}
}
Expand Down Expand Up @@ -197,25 +199,33 @@ struct WordRef {

#[derive(Debug, Error)]
enum PtxError {
#[error("{}", translate!("ptx-error-not-implemented", "feature" => (*.0)))]
NotImplemented(&'static str),

#[error("{0}")]
ParseError(ParseIntError),
}

impl UError for PtxError {}

fn get_config(matches: &clap::ArgMatches) -> UResult<Config> {
fn get_config(matches: &mut clap::ArgMatches) -> UResult<Config> {
let mut config = Config::default();
let err_msg = "parsing options failed";
if matches.get_flag(options::TRADITIONAL) {
config.gnu_ext = false;
config.format = OutFormat::Roff;
"[^ \t\n]+".clone_into(&mut config.context_regex);
}
if matches.contains_id(options::SENTENCE_REGEXP) {
return Err(PtxError::NotImplemented("-S").into());
if let Some(regex) = matches.remove_one::<String>(options::SENTENCE_REGEXP) {
// TODO: The regex crate used here is not fully compatible with GNU's regex implementation.
// For example, it does not support backreferences.
// In the future, we might want to switch to the onig crate (like expr does) for better compatibility.

// Verify regex is valid and doesn't match empty string
if let Ok(re) = Regex::new(&regex) {
if re.is_match("") {
return Err(USimpleError::new(1, translate!("ptx-error-empty-regexp")));
}
}

config.sentence_regex = Some(regex);
}
config.auto_ref = matches.get_flag(options::AUTO_REFERENCE);
config.input_ref = matches.get_flag(options::REFERENCES);
Expand Down Expand Up @@ -271,17 +281,30 @@ struct FileContent {

type FileMap = HashMap<OsString, FileContent>;

fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> {
fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result<FileMap> {
let mut file_map: FileMap = HashMap::new();
let mut offset: usize = 0;

let sentence_splitter = if let Some(re_str) = &config.sentence_regex {
Some(Regex::new(re_str).map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::InvalidInput,
translate!("ptx-error-invalid-regexp", "error" => e),
)
})?)
} else {
None
};

for filename in input_files {
let reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" {
let mut reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" {
Box::new(stdin())
} else {
let file = File::open(Path::new(filename))?;
Box::new(file)
});
let lines: Vec<String> = reader.lines().collect::<std::io::Result<Vec<String>>>()?;

let lines = read_lines(sentence_splitter.as_ref(), &mut reader)?;

// Indexing a UTF-8 string requires walking from the beginning, which can hurt performance badly when the line is long.
// Since we will be jumping around the line a lot, we dump the content into a Vec<char>, which can be indexed in constant time.
Expand All @@ -300,6 +323,24 @@ fn read_input(input_files: &[OsString]) -> std::io::Result<FileMap> {
Ok(file_map)
}

/// Reads everything from `reader` and returns it as a list of text units.
///
/// * With no `sentence_splitter`, the input is returned one entry per line,
///   exactly as produced by [`BufRead::lines`].
/// * With a `sentence_splitter`, the whole input is read into memory and cut
///   at every match of the regex. Inside each resulting piece every `'\n'` is
///   replaced by a single space, and pieces that end up empty are dropped.
///
/// # Errors
///
/// Returns any error reported by the reader while reading its contents.
fn read_lines(
    sentence_splitter: Option<&Regex>,
    reader: &mut dyn BufRead,
) -> std::io::Result<Vec<String>> {
    // No sentence regex: plain line-oriented input.
    let splitter = match sentence_splitter {
        Some(re) => re,
        None => return reader.lines().collect(),
    };

    // Sentences may span several lines, so the full input has to be available
    // before it can be split.
    let mut contents = String::new();
    reader.read_to_string(&mut contents)?;

    let mut sentences = Vec::new();
    for piece in splitter.split(&contents) {
        // ptx behavior: newlines become spaces inside sentences
        let sentence = piece.replace('\n', " ");
        // Skip sentences with no content at all.
        if !sentence.is_empty() {
            sentences.push(sentence);
        }
    }
    Ok(sentences)
}

/// Go through every line in the input files and record each match occurrence as a `WordRef`.
fn create_word_set(config: &Config, filter: &WordFilter, file_map: &FileMap) -> BTreeSet<WordRef> {
let reg = Regex::new(&filter.word_regex).unwrap();
Expand Down Expand Up @@ -844,8 +885,8 @@ mod options {

#[uucore::main]
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
let mut config = get_config(&matches)?;
let mut matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
let mut config = get_config(&mut matches)?;

let input_files;
let output_file: OsString;
Expand Down Expand Up @@ -877,7 +918,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
}

let word_filter = WordFilter::new(&matches, &config)?;
let file_map = read_input(&input_files).map_err_context(String::new)?;
let file_map = read_input(&input_files, &config).map_err_context(String::new)?;
let word_set = create_word_set(&config, &word_filter, &file_map);
write_traditional_output(&mut config, &file_map, &word_set, &output_file)
}
Expand Down
37 changes: 37 additions & 0 deletions tests/by-util/test_ptx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,43 @@ fn test_utf8() {
.stdout_only("\\xx {}{it’s}{disabled}{}{}\n\\xx {}{}{it’s}{ disabled}{}\n");
}

/// With `-S '\.'` the run must succeed and the output must contain the text
/// found on both sides of the sentence separator.
#[test]
fn test_sentence_regexp_basic() {
    let options = ["-G", "-S", "\\."];
    let input = "Hello. World.";

    new_ucmd!()
        .args(&options)
        .pipe_in(input)
        .succeeds()
        .stdout_contains("Hello")
        .stdout_contains("World");
}

/// A character-class sentence regex (`[.!]`) must split on either terminator:
/// both sentences are expected to appear in the output.
#[test]
fn test_sentence_regexp_split_behavior() {
    let options = ["-G", "-w", "50", "-S", "[.!]"];
    let input = "One sentence. Two sentence!";

    new_ucmd!()
        .args(&options)
        .pipe_in(input)
        .succeeds()
        .stdout_contains("One sentence")
        .stdout_contains("Two sentence");
}

/// A sentence regex that matches the empty string (here `^`) must make the
/// command fail with the dedicated error message.
#[test]
fn test_sentence_regexp_empty_match_failure() {
    let options = ["-G", "-S", "^"];
    let expected_error = "A regular expression cannot match a length zero string";

    new_ucmd!()
        .args(&options)
        .fails()
        .stderr_contains(expected_error);
}

/// A newline inside a sentence must show up as a space in the output when a
/// sentence regex is in effect.
#[test]
fn test_sentence_regexp_newlines_are_spaces() {
    let options = ["-G", "-S", "\\."];
    let input = "Start of\nsentence.";

    new_ucmd!()
        .args(&options)
        .pipe_in(input)
        .succeeds()
        .stdout_contains("Start of sentence");
}

#[test]
fn test_gnu_mode_dumb_format() {
// Test GNU mode (dumb format) - the default mode without -G flag
Expand Down
Loading