From c3f5205e775231ab62191d9d4c79f4996292bd5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20M=2E=20Bezerra?= Date: Tue, 20 Oct 2020 17:08:37 -0300 Subject: [PATCH] Uniq: fix -s and implement -f --- uniq/src/cli.rs | 16 ++-- uniq/src/main.rs | 193 ++++++++++++++++++++++++++++------------------- 2 files changed, 119 insertions(+), 90 deletions(-) diff --git a/uniq/src/cli.rs b/uniq/src/cli.rs index ffa9466b..aadb0855 100644 --- a/uniq/src/cli.rs +++ b/uniq/src/cli.rs @@ -11,17 +11,9 @@ pub(crate) fn create_app<'a, 'b>() -> App<'a, 'b> { .version_message("Display version information.") .help_short("?") .settings(&[ColoredHelp]) + .arg(Arg::with_name("INPUT").help("Input file path, or '-' for stdin (default).").index(1)) .arg( - Arg::with_name("INPUT") - .help("Input file path, or '-' for stdin (default).") - .index(1) - // .default_value("-"), - ) - .arg( - Arg::with_name("OUTPUT") - .help("Output file path, or '-' for stdin (default).") - .index(2) - // .default_value("-"), + Arg::with_name("OUTPUT").help("Output file path, or '-' for stdin (default).").index(2), ) .arg( Arg::with_name("count") @@ -43,7 +35,9 @@ pub(crate) fn create_app<'a, 'b>() -> App<'a, 'b> { .value_name("N"), ) .arg( - Arg::with_name("skip-chars") + // We chose "skip-bytes" instead of "skip-chars" in the util internal implementation to + // avoid confusion. 
+ Arg::with_name("skip-bytes") .help("Avoid comparing the first N characters.") .short("s") .long("skip-chars") diff --git a/uniq/src/main.rs b/uniq/src/main.rs index 3245cce8..8df5b61f 100644 --- a/uniq/src/main.rs +++ b/uniq/src/main.rs @@ -1,12 +1,12 @@ use std::{ + cmp, fs::File, - io::{self, BufRead, BufReader, ErrorKind, Read, Write}, + io::{self, BufRead, BufReader, Read, Write}, process, }; mod cli; - fn main() { let matches = cli::create_app().get_matches(); let input_filename = matches.value_of("INPUT").unwrap_or("-"); @@ -32,28 +32,29 @@ fn main() { })) }; - if uniq(&mut reader, &mut writer, flags).is_err() { + uniq(&mut reader, &mut writer, flags).unwrap_or_else(|err| { + eprintln!("uniq: {}.", err); process::exit(1); - }; + }); } - +// -f is ignored before -s #[derive(Default)] struct Flags { - show_count: bool, // -c | --show_count - supress_unique: bool, // -d | --repeated - supress_repeated: bool, // -u | --unique - skip_chars: Option<u64>, // -s | --skip-chars=N - #[allow(dead_code)] - skip_fields: Option<u64>, // -f | --skip-fields=N + show_count: bool, // -c | --show_count + supress_unique: bool, // -d | --repeated + supress_repeated: bool, // -u | --unique + skip_bytes: Option<usize>, // -s | --skip-chars=N + skip_fields: Option<usize>, // -f | --skip-fields=N } +// skip_utf8_chars: Option, // --skip-utf8=N impl Flags { fn from_matches(matches: &clap::ArgMatches) -> Self { - // Used to capture skip_chars and skip_fields - let try_parse_arg_to_u64 = |arg: Option<&str>, error_msg| { + // Used to capture skip_bytes and skip_fields + let try_parse_arg_to_usize = |arg: Option<&str>, error_msg| { if let Some(arg) = arg { - let number = arg.parse::<u64>().unwrap_or_else(|_| { + let number = arg.parse::<usize>().unwrap_or_else(|_| { eprintln!("uniq: {} '{}'.", error_msg, arg); process::exit(1); }); @@ -67,11 +68,11 @@ impl Flags { show_count: matches.is_present("show_count"), supress_unique: matches.is_present("repeated"), supress_repeated: matches.is_present("unique"), - skip_chars: 
try_parse_arg_to_u64( + skip_bytes: try_parse_arg_to_usize( matches.value_of("skip-chars"), "--skip-chars: Invalid number of bytes to skip", ), - skip_fields: try_parse_arg_to_u64( + skip_fields: try_parse_arg_to_usize( matches.value_of("skip-fields"), "--skip-fields: Invalid number of fields to skip", ), @@ -79,84 +80,105 @@ impl Flags { } } +// Return the total of bytes skipped +fn skip_fields_and_bytes(string: &str, fields: usize, bytes: usize) -> usize { + let mut iter = string.char_indices().peekable(); + let mut skipped = 0; + + // Skip fields, regex is "\s*\S*" + for _ in 0..fields { + while let Some((char_bytes, character)) = iter.peek() { + if character.is_whitespace() { + break; + } + skipped += char_bytes; + iter.next(); + } + + while let Some((char_bytes, character)) = iter.peek() { + if !character.is_whitespace() { + break; + } + skipped += char_bytes; + iter.next(); + } + } + + // Skip bytes + // Try to skip them, but don't allow to overflow string.as_bytes().len() + let skipped = cmp::min(string.as_bytes().len(), skipped + bytes); + + skipped +} fn uniq( reader: &mut BufReader, writer: &mut W, flags: Flags, ) -> Result<(), io::Error> { - // Always compared against current_line + // If -s and -f are unset, last_line is guaranteed to be equals to the previous line, + // else, last_line is the first line in the last set of lines that with each other, + // considering the ignored bytes or fields. 
let mut last_line = String::new(); - let mut last_line_count: u64 = 0; + let mut last_skipped_bytes = 0; + + // Number of times that `last_line` matched + let mut last_count = 0; - // If --skip-chars - let bytes_to_skip = flags.skip_chars.unwrap_or(0) as usize; - let mut bytes_to_skip: Vec<u8> = vec![0u8; bytes_to_skip]; + // After reaching EOF, don't exit immediately, still process the `last_line` + let mut reached_eof = false; // Loop for each line read - loop { - let mut current_line = String::new(); - - if flags.skip_chars.is_some() { - let skip_result = reader.read_exact(&mut bytes_to_skip[..]); - // Check error or EOF - match skip_result { - Ok(_) => {}, - Err(err) => { - if let ErrorKind::UnexpectedEof = err.kind() { - // Ignore - } else { - return Err(err); - } - }, - } + while !reached_eof { + let mut new_line = String::new(); + + // Using `reader.read_line()` to capture the line break characters + let bytes_read = reader.read_line(&mut new_line)?; + reached_eof = bytes_read == 0; + + // Only happens on the last line of the input, when reaching EOF + // Add \n at the end if necessary + if bytes_read > 0 && new_line.bytes().next_back() != Some(b'\n') { + new_line.reserve_exact(1); + new_line.push('\n'); } - let size = reader.read_line(&mut current_line); - - let mut should_exit = false; - // Check error or EOF - match size { - Err(err) => { - eprintln!("uniq: Input error: {}.", err); - return Err(err); - }, - // EOF, exit after this loop - Ok(0) => should_exit = true, - // Keep looping - Ok(_) => {}, - } + let skipped_bytes = skip_fields_and_bytes( + new_line.as_str(), + flags.skip_fields.unwrap_or(0), + flags.skip_bytes.unwrap_or(0), + ); + + let new_slice = &new_line.as_bytes()[skipped_bytes..]; + let last_slice = &last_line.as_bytes()[last_skipped_bytes..]; - let line_changed = current_line != last_line; - let current_line_count = if line_changed { 1 } else { last_line_count + 1 }; + let line_changed = new_slice != last_slice; + let current_line_match_count 
= if line_changed { 1 } else { last_count + 1 }; // The combination of these two flags supress all output if flags.supress_repeated && flags.supress_unique { - if should_exit { - break; - } continue; } - // The following block decides if current_line or last_line should be shown + // The following block decides if new_line or last_line should be shown // The lines are always shown as early as possible - // Output formatting is different depending on flags.show_count + // Formatting changes based on flags.show_count if flags.show_count { if line_changed { let mut should_show_last_line = false; if flags.supress_unique { - if last_line_count > 1 { + if last_count > 1 { should_show_last_line = true; } } else if flags.supress_repeated { - if last_line_count == 1 { + if last_count == 1 { should_show_last_line = true; } } else { should_show_last_line = true; } - if should_show_last_line && last_line_count > 0 { - write!(writer, "{:7} ", last_line_count)?; + if should_show_last_line && last_count > 0 { + write!(writer, "{:7} ", last_count)?; writer.write_all(last_line.as_bytes())?; } } @@ -164,15 +186,15 @@ fn uniq( let mut line_to_show: Option<&str> = None; if flags.supress_unique { - if current_line_count == 2 { - line_to_show = Some(&current_line); + if current_line_match_count == 2 { + line_to_show = Some(&new_line); } } else if flags.supress_repeated { - if line_changed && last_line_count == 1 { + if line_changed && last_count == 1 { line_to_show = Some(&last_line); } } else if line_changed { - line_to_show = Some(&current_line); + line_to_show = Some(&new_line); } if let Some(line) = line_to_show { @@ -180,11 +202,9 @@ fn uniq( } } - last_line = current_line; - last_line_count = current_line_count; - if should_exit { - break; - } + last_line = new_line; + last_count = current_line_match_count; + last_skipped_bytes = skipped_bytes; } Ok(()) @@ -211,7 +231,7 @@ mod tests { #[test] fn test_uniq_basic_usage() { let input = "A\nA\nB\nC\nC\nD"; - let expected = "A\nB\nC\nD"; + let 
expected = "A\nB\nC\nD\n"; assert_eq!(expected, test_uniq(input, flags_none())); } @@ -224,7 +244,7 @@ mod tests { #[test] fn test_uniq_without_line_break() { - assert_eq!("ABC", test_uniq("ABC", flags_none())); + assert_eq!("ABC\n", test_uniq("ABC", flags_none())); } #[test] @@ -232,10 +252,17 @@ mod tests { assert_eq!("", test_uniq("", flags_none())); } + // #[test] + // fn test_uniq_line_endings() { + // let input = "A\r\nA\n\n\r\n\n"; + // let expected = "A\n\n"; + // assert_eq!(expected, test_uniq(input, flags_none())); + // } + #[test] fn test_uniq_flag_count() { let input = "A\nA\nB\nC\nC\nD"; - let expected = " 2 A\n 1 B\n 2 C\n 1 D"; + let expected = " 2 A\n 1 B\n 2 C\n 1 D\n"; let flags = Flags { show_count: true, ..flags_none() }; assert_eq!(expected, test_uniq(input, flags)); } @@ -243,7 +270,7 @@ mod tests { #[test] fn test_uniq_flag_unique() { let input = "A\nA\nB\nC\nC\nD"; - let expected = "B\nD"; + let expected = "B\nD\n"; let flags = Flags { supress_repeated: true, ..flags_none() }; assert_eq!(expected, test_uniq(input, flags)); } @@ -258,15 +285,23 @@ mod tests { #[test] fn test_uniq_flag_skip_chars() { - let input = "_A\n_A\n_B\n_C\n_C\n_D"; - let expected = "A\nB\nC\nD"; - let flags = Flags { skip_chars: Some(1), ..flags_none() }; + let input = "qwe\neee\neeee\n\n0x11\n0b11"; + let expected = "qwe\neeee\n\n0x11\n"; + let flags = Flags { skip_bytes: Some(2), ..flags_none() }; + assert_eq!(expected, test_uniq(input, flags)); + } + + #[test] + fn test_uniq_flag_skip_fields() { + let input = "a a\na b\nb b\nc b"; + let expected = "a a\na b\n"; + let flags = Flags { skip_fields: Some(1), ..flags_none() }; assert_eq!(expected, test_uniq(input, flags)); } #[test] fn test_uniq_combined_flags_count_and_unique() { - let expected = " 1 B\n 1 D"; + let expected = " 1 B\n 1 D\n"; let input = "A\nA\nB\nC\nC\nD"; let flags = Flags { show_count: true, supress_repeated: true, ..flags_none() }; assert_eq!(expected, test_uniq(input, flags));