diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index 70cbd937933..a3b51bfedb6 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -120,6 +120,7 @@ pseudoprimes quantiles readonly reparse +rposition seedable semver semiprime diff --git a/Cargo.lock b/Cargo.lock index 12f788c8338..d1d7c537a0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3495,6 +3495,7 @@ dependencies = [ "codspeed-divan-compat", "fluent", "tempfile", + "unicode-width 0.2.2", "uucore", ] diff --git a/src/uu/fold/Cargo.toml b/src/uu/fold/Cargo.toml index 644d78b41bb..845ce2c9698 100644 --- a/src/uu/fold/Cargo.toml +++ b/src/uu/fold/Cargo.toml @@ -21,6 +21,7 @@ path = "src/fold.rs" clap = { workspace = true } uucore = { workspace = true } fluent = { workspace = true } +unicode-width = { workspace = true } [dev-dependencies] divan = { workspace = true } diff --git a/src/uu/fold/locales/en-US.ftl b/src/uu/fold/locales/en-US.ftl index 9f8c6f3b984..d4241666715 100644 --- a/src/uu/fold/locales/en-US.ftl +++ b/src/uu/fold/locales/en-US.ftl @@ -2,6 +2,7 @@ fold-about = Writes each file (or standard input if no files are given) to standard output whilst breaking long lines fold-usage = fold [OPTION]... [FILE]... fold-bytes-help = count using bytes rather than columns (meaning control characters such as newline are not treated specially) +fold-characters-help = count using character positions rather than display columns fold-spaces-help = break lines at word boundaries rather than a hard cut-off fold-width-help = set WIDTH as the maximum line width rather than 80 fold-error-illegal-width = illegal width value diff --git a/src/uu/fold/locales/fr-FR.ftl b/src/uu/fold/locales/fr-FR.ftl index 1a723594052..ce313160cf4 100644 --- a/src/uu/fold/locales/fr-FR.ftl +++ b/src/uu/fold/locales/fr-FR.ftl @@ -1,6 +1,7 @@ fold-about = Écrit chaque fichier (ou l'entrée standard si aucun fichier n'est donné) sur la sortie standard en coupant les lignes trop longues fold-usage = fold [OPTION]... [FICHIER]... fold-bytes-help = compter en octets plutôt qu'en colonnes (les caractères de contrôle comme retour chariot ne sont pas traités spécialement) +fold-characters-help = compter en caractères plutôt qu'en colonnes d'affichage fold-spaces-help = couper les lignes aux limites de mots plutôt qu'à une largeur fixe fold-width-help = définir WIDTH comme largeur de ligne maximale au lieu de 80 fold-error-illegal-width = valeur de largeur illégale diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index bbaac56bee8..f14ed3cf071 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command}; use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout}; use std::path::Path; +use unicode_width::UnicodeWidthChar; use uucore::display::Quotable; use uucore::error::{FromIo, UResult, USimpleError}; use uucore::format_usage; @@ -21,11 +22,28 @@ const TAB: u8 = b'\t'; mod options { pub const BYTES: &str = "bytes"; + pub const CHARACTERS: &str = "characters"; pub const SPACES: &str = "spaces"; pub const WIDTH: &str = "width"; pub const FILE: &str = "file"; } +#[derive(Clone, Copy, PartialEq, Eq)] +enum WidthMode { + Columns, + Characters, +} + +struct FoldContext<'a, W: Write> { + spaces: bool, + width: usize, + mode: WidthMode, + writer: &'a mut W, + output: &'a mut Vec, + col_count: &'a mut usize, + last_space: &'a mut Option, +} + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let args = args.collect_lossy(); @@ -34,6 +52,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; let bytes = matches.get_flag(options::BYTES); + let characters = matches.get_flag(options::CHARACTERS); let spaces = matches.get_flag(options::SPACES); let poss_width = match matches.get_one::(options::WIDTH) { Some(v) => Some(v.clone()), @@ -55,7 +74,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { None => vec!["-".to_owned()], }; - fold(&files, bytes, spaces, width) + fold(&files, bytes, characters, spaces, width) } pub fn uu_app() -> Command { @@ -72,6 +91,13 @@ pub fn uu_app() -> Command { .help(translate!("fold-bytes-help")) .action(ArgAction::SetTrue), ) + .arg( + Arg::new(options::CHARACTERS) + .long(options::CHARACTERS) + .help(translate!("fold-characters-help")) + .conflicts_with(options::BYTES) + .action(ArgAction::SetTrue), + ) .arg( Arg::new(options::SPACES) .long(options::SPACES) @@ -107,7 +133,13 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { (args.to_vec(), None) } -fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResult<()> { +fn fold( + filenames: &[String], + bytes: bool, + characters: bool, + spaces: bool, + width: usize, +) -> UResult<()> { let mut output = BufWriter::new(stdout()); for filename in filenames { @@ -125,7 +157,12 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul if bytes { fold_file_bytewise(buffer, spaces, width, &mut output)?; } else { - fold_file(buffer, spaces, width, &mut output)?; + let mode = if characters { + WidthMode::Characters + } else { + WidthMode::Columns + }; + fold_file(buffer, spaces, width, mode, &mut output)?; } } @@ -213,6 +250,303 @@ fn fold_file_bytewise( Ok(()) } +fn next_tab_stop(col_count: usize) -> usize { + col_count + TAB_WIDTH - col_count % TAB_WIDTH +} + +fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { + match mode { + WidthMode::Characters => std::str::from_utf8(buffer) + .map(|s| s.chars().count()) + .unwrap_or(buffer.len()), + WidthMode::Columns => { + if let Ok(s) = std::str::from_utf8(buffer) { + let mut width = 0; + for ch in s.chars() { + match ch { + '\r' => width = 0, + '\t' => width = next_tab_stop(width), + '\x08' => width = width.saturating_sub(1), + _ => width += UnicodeWidthChar::width(ch).unwrap_or(0), + } + } + width + } else { + let mut width = 0; + for &byte in buffer { + match byte { + CR => width = 0, + TAB => width = next_tab_stop(width), + 0x08 => width = width.saturating_sub(1), + _ => width += 1, + } + } + width + } + } + } +} + +fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { + let consume = match *ctx.last_space { + Some(index) => index + 1, + None => ctx.output.len(), + }; + + if consume > 0 { + ctx.writer.write_all(&ctx.output[..consume])?; + } + ctx.writer.write_all(&[NL])?; + + let last_space = *ctx.last_space; + + if consume < ctx.output.len() { + ctx.output.drain(..consume); + } else { + ctx.output.clear(); + } + + *ctx.col_count = compute_col_count(ctx.output, ctx.mode); + + if ctx.spaces { + *ctx.last_space = last_space.and_then(|idx| { + if idx < consume { + None + } else { + Some(idx - consume) + } + }); + } else { + *ctx.last_space = None; + } + Ok(()) +} + +fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + let mut idx = 0; + let len = line.len(); + + while idx < len { + match line[idx] { + NL => { + *ctx.last_space = None; + emit_output(ctx)?; + break; + } + CR => { + ctx.output.push(CR); + *ctx.col_count = 0; + idx += 1; + } + 0x08 => { + ctx.output.push(0x08); + *ctx.col_count = ctx.col_count.saturating_sub(1); + idx += 1; + } + TAB if ctx.mode == WidthMode::Columns => { + loop { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + continue; + } + *ctx.col_count = next_stop; + break; + } + if ctx.spaces { + *ctx.last_space = Some(ctx.output.len()); + } else { + *ctx.last_space = None; + } + ctx.output.push(TAB); + idx += 1; + } + 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => { + ctx.output.push(line[idx]); + if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR { + *ctx.last_space = Some(ctx.output.len() - 1); + } else if !ctx.spaces { + *ctx.last_space = None; + } + idx += 1; + } + _ => { + let start = idx; + while idx < len + && !matches!( + line[idx], + NL | CR | TAB | 0x08 | 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F + ) + { + idx += 1; + } + push_ascii_segment(&line[start..idx], ctx)?; + } + } + } + + Ok(()) +} + +fn push_ascii_segment(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + if segment.is_empty() { + return Ok(()); + } + + let mut remaining = segment; + + while !remaining.is_empty() { + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; + continue; + } + + let available = ctx.width - *ctx.col_count; + let take = remaining.len().min(available); + let base_len = ctx.output.len(); + + ctx.output.extend_from_slice(&remaining[..take]); + *ctx.col_count += take; + + if ctx.spaces { + if let Some(pos) = remaining[..take] + .iter() + .rposition(|b| b.is_ascii_whitespace() && *b != CR) + { + *ctx.last_space = Some(base_len + pos); + } + } else { + *ctx.last_space = None; + } + + remaining = &remaining[take..]; + } + + Ok(()) +} + +fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { + if line.is_ascii() { + return process_ascii_line(line.as_bytes(), ctx); + } + + let line_bytes = line.as_bytes(); + let mut iter = line.char_indices().peekable(); + + while let Some((byte_idx, ch)) = iter.next() { + let next_idx = iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len()); + + if ch == '\n' { + *ctx.last_space = None; + emit_output(ctx)?; + break; + } + + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; + } + + if ch == '\r' { + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = 0; + continue; + } + + if ch == '\x08' { + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = ctx.col_count.saturating_sub(1); + continue; + } + + if ctx.mode == WidthMode::Columns && ch == '\t' { + loop { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + continue; + } + *ctx.col_count = next_stop; + break; + } + if ctx.spaces { + *ctx.last_space = Some(ctx.output.len()); + } else { + *ctx.last_space = None; + } + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + continue; + } + + let added = match ctx.mode { + WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0), + WidthMode::Characters => 1, + }; + + if ctx.mode == WidthMode::Columns + && added > 0 + && *ctx.col_count + added > ctx.width + && !ctx.output.is_empty() + { + emit_output(ctx)?; + } + + if ctx.spaces && ch.is_ascii_whitespace() { + *ctx.last_space = Some(ctx.output.len()); + } + + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = ctx.col_count.saturating_add(added); + } + + Ok(()) +} + +fn process_non_utf8_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + for &byte in line { + if byte == NL { + *ctx.last_space = None; + emit_output(ctx)?; + break; + } + + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; + } + + match byte { + CR => *ctx.col_count = 0, + TAB => { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + } + *ctx.col_count = next_stop; + *ctx.last_space = if ctx.spaces { + Some(ctx.output.len()) + } else { + None + }; + ctx.output.push(byte); + continue; + } + 0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1), + _ if ctx.spaces && byte.is_ascii_whitespace() => { + *ctx.last_space = Some(ctx.output.len()); + *ctx.col_count = ctx.col_count.saturating_add(1); + } + _ => *ctx.col_count = ctx.col_count.saturating_add(1), + } + + ctx.output.push(byte); + } + + Ok(()) +} + /// Fold `file` to fit `width` (number of columns). /// /// By default `fold` treats tab, backspace, and carriage return specially: @@ -226,6 +560,7 @@ fn fold_file( mut file: BufReader, spaces: bool, width: usize, + mode: WidthMode, writer: &mut W, ) -> UResult<()> { let mut line = Vec::new(); @@ -233,30 +568,6 @@ fn fold_file( let mut col_count = 0; let mut last_space = None; - /// Print the output line, resetting the column and character counts. - /// - /// If `spaces` is `true`, print the output line up to the last - /// encountered whitespace character (inclusive) and set the remaining - /// characters as the start of the next line. - macro_rules! emit_output { - () => { - let consume = match last_space { - Some(i) => i + 1, - None => output.len(), - }; - - writer.write_all(&output[..consume])?; - writer.write_all(&[NL])?; - output.drain(..consume); - - // we know there are no tabs left in output, so each char counts - // as 1 column - col_count = output.len(); - - last_space = None; - }; - } - loop { if file .read_until(NL, &mut line) @@ -266,50 +577,27 @@ fn fold_file( break; } - for ch in &line { - if *ch == NL { - // make sure to _not_ split output at whitespace, since we - // know the entire output will fit - last_space = None; - emit_output!(); - break; - } - - if col_count >= width { - emit_output!(); - } - - match *ch { - CR => col_count = 0, - TAB => { - let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH; - - if next_tab_stop > width && !output.is_empty() { - emit_output!(); - } - - col_count = next_tab_stop; - last_space = if spaces { Some(output.len()) } else { None }; - } - 0x08 => { - col_count = col_count.saturating_sub(1); - } - _ if spaces && ch.is_ascii_whitespace() => { - last_space = Some(output.len()); - col_count += 1; - } - _ => col_count += 1, - } + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; - output.push(*ch); + match std::str::from_utf8(&line) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&line, &mut ctx)?, } - if !output.is_empty() { - writer.write_all(&output)?; - output.truncate(0); - } + line.clear(); + } - line.truncate(0); + if !output.is_empty() { + writer.write_all(&output)?; + output.clear(); } Ok(()) diff --git a/tests/by-util/test_fold.rs b/tests/by-util/test_fold.rs index 4a2d381fafb..04072ab157f 100644 --- a/tests/by-util/test_fold.rs +++ b/tests/by-util/test_fold.rs @@ -41,6 +41,24 @@ fn test_default_wrap_with_newlines() { .stdout_is_fixture("lorem_ipsum_new_line_80_column.expected"); } +#[test] +fn test_wide_characters_in_column_mode() { + new_ucmd!() + .args(&["-w", "5"]) + .pipe_in("\u{B250}\u{B250}\u{B250}\n") + .succeeds() + .stdout_is("\u{B250}\u{B250}\n\u{B250}\n"); +} + +#[test] +fn test_wide_characters_with_characters_option() { + new_ucmd!() + .args(&["--characters", "-w", "5"]) + .pipe_in("\u{B250}\u{B250}\u{B250}\n") + .succeeds() + .stdout_is("\u{B250}\u{B250}\u{B250}\n"); +} + #[test] fn test_should_preserve_empty_line_without_final_newline() { new_ucmd!()