From 024c47522c16c4f01b0e8404594ed08cc0aa98ac Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 27 Feb 2024 10:13:05 -0500 Subject: [PATCH 1/5] cut: first take on delimiters refactor --- src/uu/cut/src/cut.rs | 124 ++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index e8956871692..5d205f9b33a 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -6,9 +6,12 @@ // spell-checker:ignore (ToDO) delim sourcefiles use bstr::io::BufReadExt; +use clap::builder::ValueParser; use clap::{crate_version, Arg, ArgAction, Command}; +use std::ffi::OsString; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, IsTerminal, Read, Write}; +use std::os::unix::ffi::OsStrExt; use std::path::Path; use uucore::display::Quotable; use uucore::error::{set_exit_code, FromIo, UResult, USimpleError}; @@ -27,26 +30,25 @@ const ABOUT: &str = help_about!("cut.md"); const AFTER_HELP: &str = help_section!("after help", "cut.md"); struct Options { - out_delim: Option, + out_delimiter: Option, // use OsString without trying to parse into UTF8/char or &[u8] line_ending: LineEnding, + field_opts: Option, } enum Delimiter { Whitespace, - String(String), // FIXME: use char? 
+ String(OsString), // use OsString without trying to parse into UTF8/char or &[u8] } struct FieldOptions { delimiter: Delimiter, - out_delimiter: Option, only_delimited: bool, - line_ending: LineEnding, } enum Mode { Bytes(Vec, Options), Characters(Vec, Options), - Fields(Vec, FieldOptions), + Fields(Vec, Options), } fn stdout_writer() -> Box { @@ -69,10 +71,11 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> UResult<() let newline_char = opts.line_ending.into(); let mut buf_in = BufReader::new(reader); let mut out = stdout_writer(); - let delim = opts - .out_delim + let default_out_delim = &OsString::from("\t"); + let out_delim = opts + .out_delimiter .as_ref() - .map_or("", String::as_str) + .unwrap_or(default_out_delim) .as_bytes(); let result = buf_in.for_byte_record(newline_char, |line| { @@ -82,8 +85,8 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> UResult<() break; } if print_delim { - out.write_all(delim)?; - } else if opts.out_delim.is_some() { + out.write_all(out_delim)?; + } else if opts.out_delimiter.is_some() { print_delim = true; } // change `low` from 1-indexed value to 0-index value @@ -109,7 +112,7 @@ fn cut_fields_explicit_out_delim( ranges: &[Range], only_delimited: bool, newline_char: u8, - out_delim: &str, + out_delim: &OsString, ) -> UResult<()> { let mut buf_in = BufReader::new(reader); let mut out = stdout_writer(); @@ -256,9 +259,10 @@ fn cut_fields_implicit_out_delim( Ok(()) } -fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> { +fn cut_fields(reader: R, ranges: &[Range], opts: &Options) -> UResult<()> { let newline_char = opts.line_ending.into(); - match opts.delimiter { + let field_opts = opts.field_opts.as_ref().unwrap(); // it is safe to unwrap() here - field_opts will always be Some() for cut_fields() call + match field_opts.delimiter { Delimiter::String(ref delim) => { let matcher = ExactMatcher::new(delim.as_bytes()); match opts.out_delimiter { @@ -266,7 +270,7 @@ fn 
cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes reader, &matcher, ranges, - opts.only_delimited, + field_opts.only_delimited, newline_char, out_delim, ), @@ -274,19 +278,20 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes reader, &matcher, ranges, - opts.only_delimited, + field_opts.only_delimited, newline_char, ), } } Delimiter::Whitespace => { let matcher = WhitespaceMatcher {}; - let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t"); + let default_out_delim = &OsString::from("\t"); + let out_delim = opts.out_delimiter.as_ref().unwrap_or(default_out_delim); cut_fields_explicit_out_delim( reader, &matcher, ranges, - opts.only_delimited, + field_opts.only_delimited, newline_char, out_delim, ) @@ -352,9 +357,9 @@ mod options { #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let args = args.collect_ignore(); + let args = args.collect::>(); - let delimiter_is_equal = args.contains(&"-d=".to_string()); // special case + let delimiter_is_equal = args.contains(&OsString::from("-d=")); // special case let matches = uu_app().try_get_matches_from(args)?; let complement = matches.get_flag(options::COMPLEMENT); @@ -381,14 +386,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Mode::Bytes( ranges, Options { - out_delim: Some( - matches - .get_one::(options::OUTPUT_DELIMITER) - .map(|s| s.as_str()) - .unwrap_or_default() - .to_owned(), - ), + out_delimiter: matches + .get_one::(options::OUTPUT_DELIMITER) + .cloned(), line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)), + field_opts: None, }, ) }), @@ -396,25 +398,22 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Mode::Characters( ranges, Options { - out_delim: Some( - matches - .get_one::(options::OUTPUT_DELIMITER) - .map(|s| s.as_str()) - .unwrap_or_default() - .to_owned(), - ), + out_delimiter: matches + .get_one::(options::OUTPUT_DELIMITER) + .cloned(), line_ending: 
LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)), + field_opts: None, }, ) }), (1, None, None, Some(field_ranges)) => { list_to_ranges(field_ranges, complement).and_then(|ranges| { - let out_delim = match matches.get_one::(options::OUTPUT_DELIMITER) { + let out_delim = match matches.get_one::(options::OUTPUT_DELIMITER).cloned() { Some(s) => { - if s.is_empty() { - Some("\0".to_owned()) + if s.is_empty() || s == "''" { + Some(OsString::from("\0")) } else { - Some(s.clone()) + Some(s) } } None => None, @@ -425,7 +424,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let zero_terminated = matches.get_flag(options::ZERO_TERMINATED); let line_ending = LineEnding::from_zero_flag(zero_terminated); - match matches.get_one::(options::DELIMITER).map(|s| s.as_str()) { + match matches.get_one::(options::DELIMITER).cloned() { Some(_) if whitespace_delimited => { Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into()) } @@ -434,43 +433,50 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { // Clap parsing is limited in this situation, see: // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 if delimiter_is_equal { - delim = "="; + delim = OsString::from("="); } else if delim == "''" { // treat `''` as empty delimiter - delim = ""; + delim = OsString::from(""); } - if delim.chars().count() > 1 { + // For delimiter option value - allow both UTF8 (possibly multi-byte) characters + // and Non UTF8 (and not ASCII) single byte "characters", like b"\xff" to align with GNU behavior + if delim.to_str().is_some_and(|d| d.chars().count() > 1) || delim.to_str().is_none() && delim.as_bytes().len() > 1 { Err("the delimiter must be a single character".into()) } else { let delim = if delim.is_empty() { - "\0".to_owned() + OsString::from("\0") } else { - delim.to_owned() + delim }; Ok(Mode::Fields( ranges, - FieldOptions { - delimiter: Delimiter::String(delim), + Options { out_delimiter: out_delim, - 
only_delimited, line_ending, - }, + field_opts: Some(FieldOptions { + delimiter: Delimiter::String(delim.clone()), + only_delimited, + })}, )) } } - None => Ok(Mode::Fields( - ranges, - FieldOptions { - delimiter: match whitespace_delimited { - true => Delimiter::Whitespace, - false => Delimiter::String("\t".to_owned()), + None => { + let delim = &OsString::from("\t"); + Ok(Mode::Fields( + ranges, + Options { + out_delimiter: out_delim, + line_ending, + field_opts: Some (FieldOptions { + delimiter: match whitespace_delimited { + true => Delimiter::Whitespace, + false => Delimiter::String(delim.clone()), + }, + only_delimited + }) }, - out_delimiter: out_delim, - only_delimited, - line_ending, - }, - )), + ))}, } }) } @@ -554,6 +560,7 @@ pub fn uu_app() -> Command { Arg::new(options::DELIMITER) .short('d') .long(options::DELIMITER) + .value_parser(ValueParser::os_string()) .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.") .value_name("DELIM"), ) @@ -596,6 +603,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(options::OUTPUT_DELIMITER) .long(options::OUTPUT_DELIMITER) + .value_parser(ValueParser::os_string()) .help("in field mode, replace the delimiter in output lines with this option's argument") .value_name("NEW_DELIM"), ) From f83da074f93f15261c32bbb6b084bb7d330dae38 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 27 Feb 2024 14:50:58 -0500 Subject: [PATCH 2/5] cut: clippy and windows errors --- src/uu/cut/src/cut.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 5d205f9b33a..a5d30b5a0f7 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -11,7 +11,6 @@ use clap::{crate_version, Arg, ArgAction, Command}; use std::ffi::OsString; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, IsTerminal, Read, Write}; -use std::os::unix::ffi::OsStrExt; use std::path::Path; use uucore::display::Quotable; use 
uucore::error::{set_exit_code, FromIo, UResult, USimpleError}; @@ -76,7 +75,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> UResult<() .out_delimiter .as_ref() .unwrap_or(default_out_delim) - .as_bytes(); + .as_encoded_bytes(); let result = buf_in.for_byte_record(newline_char, |line| { let mut print_delim = false; @@ -148,7 +147,7 @@ fn cut_fields_explicit_out_delim( for _ in 0..=high - low { // skip printing delimiter if this is the first matching field for this line if print_delim { - out.write_all(out_delim.as_bytes())?; + out.write_all(out_delim.as_encoded_bytes())?; } else { print_delim = true; } @@ -264,7 +263,7 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &Options) -> UResult<( let field_opts = opts.field_opts.as_ref().unwrap(); // it is safe to unwrap() here - field_opts will always be Some() for cut_fields() call match field_opts.delimiter { Delimiter::String(ref delim) => { - let matcher = ExactMatcher::new(delim.as_bytes()); + let matcher = ExactMatcher::new(delim.as_encoded_bytes()); match opts.out_delimiter { Some(ref out_delim) => cut_fields_explicit_out_delim( reader, @@ -440,7 +439,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } // For delimiter option value - allow both UTF8 (possibly multi-byte) characters // and Non UTF8 (and not ASCII) single byte "characters", like b"\xff" to align with GNU behavior - if delim.to_str().is_some_and(|d| d.chars().count() > 1) || delim.to_str().is_none() && delim.as_bytes().len() > 1 { + if delim.to_str().is_some_and(|d| d.chars().count() > 1) || delim.to_str().is_none() && delim.as_encoded_bytes().len() > 1 { Err("the delimiter must be a single character".into()) } else { let delim = if delim.is_empty() { @@ -455,8 +454,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { out_delimiter: out_delim, line_ending, field_opts: Some(FieldOptions { - delimiter: Delimiter::String(delim.clone()), only_delimited, + delimiter: Delimiter::String(delim), })}, )) } From 
ddc35a976d2db2982af8926dcbaac6046f6fff3e Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Thu, 29 Feb 2024 12:54:45 -0500 Subject: [PATCH 3/5] cut: refactor delimiters + tests --- src/uu/cut/src/cut.rs | 254 +++++++++++++++++------------- tests/by-util/test_cut.rs | 14 ++ tests/fixtures/cut/8bit-delim.txt | 1 + 3 files changed, 160 insertions(+), 109 deletions(-) create mode 100644 tests/fixtures/cut/8bit-delim.txt diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index a5d30b5a0f7..3d041160a18 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -6,11 +6,12 @@ // spell-checker:ignore (ToDO) delim sourcefiles use bstr::io::BufReadExt; -use clap::builder::ValueParser; -use clap::{crate_version, Arg, ArgAction, Command}; +use clap::{builder::ValueParser, crate_version, Arg, ArgAction, ArgMatches, Command}; use std::ffi::OsString; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, IsTerminal, Read, Write}; +#[cfg(unix)] +use std::os::unix::ffi::OsStrExt; use std::path::Path; use uucore::display::Quotable; use uucore::error::{set_exit_code, FromIo, UResult, USimpleError}; @@ -28,26 +29,38 @@ const USAGE: &str = help_usage!("cut.md"); const ABOUT: &str = help_about!("cut.md"); const AFTER_HELP: &str = help_section!("after help", "cut.md"); -struct Options { - out_delimiter: Option, // use OsString without trying to parse into UTF8/char or &[u8] +struct Options<'a> { + out_delimiter: Option<&'a [u8]>, line_ending: LineEnding, - field_opts: Option, + field_opts: Option>, } -enum Delimiter { +enum Delimiter<'a> { Whitespace, - String(OsString), // use OsString without trying to parse into UTF8/char or &[u8] + Slice(&'a [u8]), } -struct FieldOptions { - delimiter: Delimiter, +struct FieldOptions<'a> { + delimiter: Delimiter<'a>, only_delimited: bool, } -enum Mode { - Bytes(Vec, Options), - Characters(Vec, Options), - Fields(Vec, Options), +enum Mode<'a> { + Bytes(Vec, Options<'a>), + Characters(Vec, Options<'a>), + Fields(Vec, 
Options<'a>), +} + +impl Default for Delimiter<'_> { + fn default() -> Self { + Self::Slice(b"\t") + } +} + +impl<'a> From<&'a OsString> for Delimiter<'a> { + fn from(s: &'a OsString) -> Self { + Self::Slice(os_string_as_bytes(s).unwrap()) + } } fn stdout_writer() -> Box { @@ -70,12 +83,7 @@ fn cut_bytes(reader: R, ranges: &[Range], opts: &Options) -> UResult<() let newline_char = opts.line_ending.into(); let mut buf_in = BufReader::new(reader); let mut out = stdout_writer(); - let default_out_delim = &OsString::from("\t"); - let out_delim = opts - .out_delimiter - .as_ref() - .unwrap_or(default_out_delim) - .as_encoded_bytes(); + let out_delim = opts.out_delimiter.unwrap_or(b"\t"); let result = buf_in.for_byte_record(newline_char, |line| { let mut print_delim = false; @@ -111,7 +119,7 @@ fn cut_fields_explicit_out_delim( ranges: &[Range], only_delimited: bool, newline_char: u8, - out_delim: &OsString, + out_delim: &[u8], ) -> UResult<()> { let mut buf_in = BufReader::new(reader); let mut out = stdout_writer(); @@ -147,7 +155,7 @@ fn cut_fields_explicit_out_delim( for _ in 0..=high - low { // skip printing delimiter if this is the first matching field for this line if print_delim { - out.write_all(out_delim.as_encoded_bytes())?; + out.write_all(out_delim)?; } else { print_delim = true; } @@ -262,10 +270,10 @@ fn cut_fields(reader: R, ranges: &[Range], opts: &Options) -> UResult<( let newline_char = opts.line_ending.into(); let field_opts = opts.field_opts.as_ref().unwrap(); // it is safe to unwrap() here - field_opts will always be Some() for cut_fields() call match field_opts.delimiter { - Delimiter::String(ref delim) => { - let matcher = ExactMatcher::new(delim.as_encoded_bytes()); + Delimiter::Slice(delim) => { + let matcher = ExactMatcher::new(delim); match opts.out_delimiter { - Some(ref out_delim) => cut_fields_explicit_out_delim( + Some(out_delim) => cut_fields_explicit_out_delim( reader, &matcher, ranges, @@ -284,15 +292,13 @@ fn cut_fields(reader: R, 
ranges: &[Range], opts: &Options) -> UResult<( } Delimiter::Whitespace => { let matcher = WhitespaceMatcher {}; - let default_out_delim = &OsString::from("\t"); - let out_delim = opts.out_delimiter.as_ref().unwrap_or(default_out_delim); cut_fields_explicit_out_delim( reader, &matcher, ranges, field_opts.only_delimited, newline_char, - out_delim, + opts.out_delimiter.unwrap_or(b"\t"), ) } } @@ -341,6 +347,88 @@ fn cut_files(mut filenames: Vec, mode: &Mode) { } } +// This is temporary helper function to convert OSString to &[u8] for unix targets only +// TODO Remove this function and re-implement the functionality in each place that calls it +// for all targets using https://doc.rust-lang.org/nightly/std/ffi/struct.OsStr.html#method.as_encoded_bytes +// once project's MSRV is bumped up to 1.74.0+ so that function becomes available +// For now - support unix targets only and on non-unix (i.e. Windows) will just return an error if delimiter value is not UTF-8 +fn os_string_as_bytes(os_string: &OsString) -> UResult<&[u8]> { + #[cfg(unix)] + let bytes = os_string.as_bytes(); + + #[cfg(not(unix))] + let bytes = os_string + .to_str() + .ok_or_else(|| { + uucore::error::UUsageError::new( + 1, + "invalid UTF-8 was detected in one or more arguments", + ) + })? 
+ .as_bytes(); + + Ok(bytes) +} + +// Get delimiter and output delimiter from `-d`/`--delimiter` and `--output-delimiter` options respectively +// Allow either delimiter to have a value that is neither UTF-8 nor ASCII to align with GNU behavior +fn get_delimiters<'a>( + matches: &'a ArgMatches, + delimiter_is_equal: bool, + os_string_equals: &'a OsString, + os_string_nul: &'a OsString, +) -> UResult<(Delimiter<'a>, Option<&'a [u8]>)> { + let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED); + let delim_opt = matches.get_one::(options::DELIMITER); + let delim = match delim_opt { + Some(_) if whitespace_delimited => { + return Err(USimpleError::new( + 1, + "invalid input: Only one of --delimiter (-d) or -w option can be specified", + )); + } + Some(mut os_string) => { + // GNU's `cut` supports `-d=` to set the delimiter to `=`. + // Clap parsing is limited in this situation, see: + // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 + // rewrite the delimiter value os_string before further processing + if delimiter_is_equal { + os_string = os_string_equals; + } else if os_string == "''" || os_string.is_empty() { + // treat `''` as empty delimiter + os_string = os_string_nul; + } + // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters + // and Non UTF-8 (and not ASCII) single byte "characters", like b"\xff" to align with GNU behavior + let bytes = os_string_as_bytes(os_string)?; + if os_string.to_str().is_some_and(|s| s.chars().count() > 1) + || os_string.to_str().is_none() && bytes.len() > 1 + { + return Err(USimpleError::new( + 1, + "the delimiter must be a single character", + )); + } else { + Delimiter::from(os_string) + } + } + None => match whitespace_delimited { + true => Delimiter::Whitespace, + false => Delimiter::default(), + }, + }; + let out_delim = matches + .get_one::(options::OUTPUT_DELIMITER) + .map(|os_string| { + if os_string.is_empty() || os_string == "''" { + 
"\0".as_bytes() + } else { + os_string_as_bytes(os_string).unwrap() + } + }); + Ok((delim, out_delim)) +} + mod options { pub const BYTES: &str = "bytes"; pub const CHARACTERS: &str = "characters"; @@ -362,6 +450,20 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().try_get_matches_from(args)?; let complement = matches.get_flag(options::COMPLEMENT); + let only_delimited = matches.get_flag(options::ONLY_DELIMITED); + + // since OsString::from creates a new value and it does not by default have 'static lifetime like &str + // we need to create these values here and pass them down to satisfy borrow checker + let os_string_equals = OsString::from("="); + let os_string_nul = OsString::from("\0"); + + let (delimiter, out_delimiter) = get_delimiters( + &matches, + delimiter_is_equal, + &os_string_equals, + &os_string_nul, + )?; + let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)); // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`, // is expected. 
The number of those arguments is used for parsing a cutting @@ -385,10 +487,8 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Mode::Bytes( ranges, Options { - out_delimiter: matches - .get_one::(options::OUTPUT_DELIMITER) - .cloned(), - line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)), + out_delimiter, + line_ending, field_opts: None, }, ) @@ -397,88 +497,24 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { Mode::Characters( ranges, Options { - out_delimiter: matches - .get_one::(options::OUTPUT_DELIMITER) - .cloned(), - line_ending: LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED)), + out_delimiter, + line_ending, field_opts: None, }, ) }), - (1, None, None, Some(field_ranges)) => { - list_to_ranges(field_ranges, complement).and_then(|ranges| { - let out_delim = match matches.get_one::(options::OUTPUT_DELIMITER).cloned() { - Some(s) => { - if s.is_empty() || s == "''" { - Some(OsString::from("\0")) - } else { - Some(s) - } - } - None => None, - }; - - let only_delimited = matches.get_flag(options::ONLY_DELIMITED); - let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED); - let zero_terminated = matches.get_flag(options::ZERO_TERMINATED); - let line_ending = LineEnding::from_zero_flag(zero_terminated); - - match matches.get_one::(options::DELIMITER).cloned() { - Some(_) if whitespace_delimited => { - Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into()) - } - Some(mut delim) => { - // GNU's `cut` supports `-d=` to set the delimiter to `=`. 
- // Clap parsing is limited in this situation, see: - // https://github.com/uutils/coreutils/issues/2424#issuecomment-863825242 - if delimiter_is_equal { - delim = OsString::from("="); - } else if delim == "''" { - // treat `''` as empty delimiter - delim = OsString::from(""); - } - // For delimiter option value - allow both UTF8 (possibly multi-byte) characters - // and Non UTF8 (and not ASCII) single byte "characters", like b"\xff" to align with GNU behavior - if delim.to_str().is_some_and(|d| d.chars().count() > 1) || delim.to_str().is_none() && delim.as_encoded_bytes().len() > 1 { - Err("the delimiter must be a single character".into()) - } else { - let delim = if delim.is_empty() { - OsString::from("\0") - } else { - delim - }; - - Ok(Mode::Fields( - ranges, - Options { - out_delimiter: out_delim, - line_ending, - field_opts: Some(FieldOptions { - only_delimited, - delimiter: Delimiter::String(delim), - })}, - )) - } - } - None => { - let delim = &OsString::from("\t"); - Ok(Mode::Fields( - ranges, - Options { - out_delimiter: out_delim, - line_ending, - field_opts: Some (FieldOptions { - delimiter: match whitespace_delimited { - true => Delimiter::Whitespace, - false => Delimiter::String(delim.clone()), - }, - only_delimited - }) - }, - ))}, - } - }) - } + (1, None, None, Some(field_ranges)) => list_to_ranges(field_ranges, complement).map(|ranges| { + Mode::Fields( + ranges, + Options { + out_delimiter, + line_ending, + field_opts: Some(FieldOptions { + only_delimited, + delimiter, + })}, + ) + }), (2.., _, _, _) => Err( "invalid usage: expects no more than one of --fields (-f), --chars (-c) or --bytes (-b)".into() ), diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs index 2473ead1992..50d158f966b 100644 --- a/tests/by-util/test_cut.rs +++ b/tests/by-util/test_cut.rs @@ -288,3 +288,17 @@ fn test_multiple_mode_args() { .stderr_is("cut: invalid usage: expects no more than one of --fields (-f), --chars (-c) or --bytes (-b)\n"); } } + +#[test] 
+#[cfg(unix)] +fn test_8bit_non_utf8_delimiter() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + let delim = OsStr::from_bytes(b"\xAD".as_slice()); + new_ucmd!() + .arg("-d") + .arg(delim) + .args(&["--out=_", "-f2,3", "8bit-delim.txt"]) + .succeeds() + .stdout_check(|out| out == "b_c\n".as_bytes()); +} diff --git a/tests/fixtures/cut/8bit-delim.txt b/tests/fixtures/cut/8bit-delim.txt new file mode 100644 index 00000000000..2312c916aef --- /dev/null +++ b/tests/fixtures/cut/8bit-delim.txt @@ -0,0 +1 @@ +a­b­c From 6945ef43d2e3fec2a68bc58860f62a0e7d1b4fb3 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Mar 2024 10:46:49 -0500 Subject: [PATCH 4/5] cut: comments --- src/uu/cut/src/cut.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 3d041160a18..46e532219b7 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -399,7 +399,7 @@ fn get_delimiters<'a>( os_string = os_string_nul; } // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters - // and Non UTF-8 (and not ASCII) single byte "characters", like b"\xff" to align with GNU behavior + // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior let bytes = os_string_as_bytes(os_string)?; if os_string.to_str().is_some_and(|s| s.chars().count() > 1) || os_string.to_str().is_none() && bytes.len() > 1 @@ -453,7 +453,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let only_delimited = matches.get_flag(options::ONLY_DELIMITED); // since OsString::from creates a new value and it does not by default have 'static lifetime like &str - // we need to create these values here and pass them down to satisfy borrow checker + // we need to create these values here and pass them down avoid issues with borrow checker and temporary values let os_string_equals = OsString::from("="); let os_string_nul = OsString::from("\0"); From 
7edfac4ad7ed7648d378723c5d895b59a725fd50 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 1 Mar 2024 12:23:22 -0500 Subject: [PATCH 5/5] cut: comments --- src/uu/cut/src/cut.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs index 46e532219b7..14aa2df9657 100644 --- a/src/uu/cut/src/cut.rs +++ b/src/uu/cut/src/cut.rs @@ -347,7 +347,7 @@ fn cut_files(mut filenames: Vec, mode: &Mode) { } } -// This is temporary helper function to convert OSString to &[u8] for unix targets only +// This is a temporary helper function to convert OsString to &[u8] for unix targets only // TODO Remove this function and re-implement the functionality in each place that calls it // for all targets using https://doc.rust-lang.org/nightly/std/ffi/struct.OsStr.html#method.as_encoded_bytes // once project's MSRV is bumped up to 1.74.0+ so that function becomes available @@ -453,7 +453,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let only_delimited = matches.get_flag(options::ONLY_DELIMITED); // since OsString::from creates a new value and it does not by default have 'static lifetime like &str - // we need to create these values here and pass them down avoid issues with borrow checker and temporary values + // we need to create these values here and pass them down to avoid issues with borrow checker and temporary values let os_string_equals = OsString::from("="); let os_string_nul = OsString::from("\0");