From 891ab0250e85e3f9e42cd15b7c674e1a26163ff5 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 3 Nov 2025 16:44:47 +0900 Subject: [PATCH 01/17] feat(fold): add --characters option to count by character positions Add a new --characters flag to the fold utility, allowing it to count using Unicode character positions rather than display columns. This provides more accurate line breaking for text containing wide characters. Includes dependency on unicode-width crate and updated help messages in English and French locales. --- Cargo.lock | 1 + src/uu/fold/Cargo.toml | 1 + src/uu/fold/locales/en-US.ftl | 1 + src/uu/fold/locales/fr-FR.ftl | 1 + src/uu/fold/src/fold.rs | 314 ++++++++++++++++++++++++++-------- tests/by-util/test_fold.rs | 18 ++ 6 files changed, 268 insertions(+), 68 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87471acd621..5bef94a796a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3496,6 +3496,7 @@ dependencies = [ "codspeed-divan-compat", "fluent", "tempfile", + "unicode-width 0.2.2", "uucore", ] diff --git a/src/uu/fold/Cargo.toml b/src/uu/fold/Cargo.toml index 644d78b41bb..845ce2c9698 100644 --- a/src/uu/fold/Cargo.toml +++ b/src/uu/fold/Cargo.toml @@ -21,6 +21,7 @@ path = "src/fold.rs" clap = { workspace = true } uucore = { workspace = true } fluent = { workspace = true } +unicode-width = { workspace = true } [dev-dependencies] divan = { workspace = true } diff --git a/src/uu/fold/locales/en-US.ftl b/src/uu/fold/locales/en-US.ftl index 9f8c6f3b984..d4241666715 100644 --- a/src/uu/fold/locales/en-US.ftl +++ b/src/uu/fold/locales/en-US.ftl @@ -2,6 +2,7 @@ fold-about = Writes each file (or standard input if no files are given) to standard output whilst breaking long lines fold-usage = fold [OPTION]... [FILE]... 
fold-bytes-help = count using bytes rather than columns (meaning control characters such as newline are not treated specially) +fold-characters-help = count using character positions rather than display columns fold-spaces-help = break lines at word boundaries rather than a hard cut-off fold-width-help = set WIDTH as the maximum line width rather than 80 fold-error-illegal-width = illegal width value diff --git a/src/uu/fold/locales/fr-FR.ftl b/src/uu/fold/locales/fr-FR.ftl index 1a723594052..ce313160cf4 100644 --- a/src/uu/fold/locales/fr-FR.ftl +++ b/src/uu/fold/locales/fr-FR.ftl @@ -1,6 +1,7 @@ fold-about = Écrit chaque fichier (ou l'entrée standard si aucun fichier n'est donné) sur la sortie standard en coupant les lignes trop longues fold-usage = fold [OPTION]... [FICHIER]... fold-bytes-help = compter en octets plutôt qu'en colonnes (les caractères de contrôle comme retour chariot ne sont pas traités spécialement) +fold-characters-help = compter en caractères plutôt qu'en colonnes d'affichage fold-spaces-help = couper les lignes aux limites de mots plutôt qu'à une largeur fixe fold-width-help = définir WIDTH comme largeur de ligne maximale au lieu de 80 fold-error-illegal-width = valeur de largeur illégale diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index bbaac56bee8..f943b13da65 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command}; use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout}; use std::path::Path; +use unicode_width::UnicodeWidthChar; use uucore::display::Quotable; use uucore::error::{FromIo, UResult, USimpleError}; use uucore::format_usage; @@ -21,11 +22,18 @@ const TAB: u8 = b'\t'; mod options { pub const BYTES: &str = "bytes"; + pub const CHARACTERS: &str = "characters"; pub const SPACES: &str = "spaces"; pub const WIDTH: &str = "width"; pub const FILE: &str = "file"; } +#[derive(Clone, Copy, PartialEq, Eq)] +enum WidthMode { 
+ Columns, + Characters, +} + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let args = args.collect_lossy(); @@ -34,6 +42,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; let bytes = matches.get_flag(options::BYTES); + let characters = matches.get_flag(options::CHARACTERS); let spaces = matches.get_flag(options::SPACES); let poss_width = match matches.get_one::(options::WIDTH) { Some(v) => Some(v.clone()), @@ -55,7 +64,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { None => vec!["-".to_owned()], }; - fold(&files, bytes, spaces, width) + fold(&files, bytes, characters, spaces, width) } pub fn uu_app() -> Command { @@ -72,6 +81,13 @@ pub fn uu_app() -> Command { .help(translate!("fold-bytes-help")) .action(ArgAction::SetTrue), ) + .arg( + Arg::new(options::CHARACTERS) + .long(options::CHARACTERS) + .help(translate!("fold-characters-help")) + .conflicts_with(options::BYTES) + .action(ArgAction::SetTrue), + ) .arg( Arg::new(options::SPACES) .long(options::SPACES) @@ -107,7 +123,13 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { (args.to_vec(), None) } -fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResult<()> { +fn fold( + filenames: &[String], + bytes: bool, + characters: bool, + spaces: bool, + width: usize, +) -> UResult<()> { let mut output = BufWriter::new(stdout()); for filename in filenames { @@ -125,7 +147,12 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul if bytes { fold_file_bytewise(buffer, spaces, width, &mut output)?; } else { - fold_file(buffer, spaces, width, &mut output)?; + let mode = if characters { + WidthMode::Characters + } else { + WidthMode::Columns + }; + fold_file(buffer, spaces, width, mode, &mut output)?; } } @@ -213,6 +240,193 @@ fn fold_file_bytewise( Ok(()) } +fn next_tab_stop(col_count: usize) -> usize { + col_count + TAB_WIDTH - col_count 
% TAB_WIDTH +} + +fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { + match mode { + WidthMode::Characters => std::str::from_utf8(buffer) + .map(|s| s.chars().count()) + .unwrap_or(buffer.len()), + WidthMode::Columns => { + if let Ok(s) = std::str::from_utf8(buffer) { + let mut width = 0; + for ch in s.chars() { + match ch { + '\r' => width = 0, + '\t' => width = next_tab_stop(width), + '\x08' => width = width.saturating_sub(1), + _ => width += UnicodeWidthChar::width(ch).unwrap_or(0), + } + } + width + } else { + let mut width = 0; + for &byte in buffer { + match byte { + CR => width = 0, + TAB => width = next_tab_stop(width), + 0x08 => width = width.saturating_sub(1), + _ => width += 1, + } + } + width + } + } + } +} + +fn emit_output( + writer: &mut W, + output: &mut Vec, + last_space: &mut Option, + col_count: &mut usize, + mode: WidthMode, +) -> UResult<()> { + let consume = match *last_space { + Some(index) => index + 1, + None => output.len(), + }; + + if consume > 0 { + writer.write_all(&output[..consume])?; + } + writer.write_all(&[NL])?; + output.drain(..consume); + *col_count = compute_col_count(output, mode); + *last_space = None; + Ok(()) +} + +fn process_utf8_line( + line_str: &str, + line_bytes: &[u8], + spaces: bool, + width: usize, + mode: WidthMode, + writer: &mut W, + output: &mut Vec, + col_count: &mut usize, + last_space: &mut Option, +) -> UResult<()> { + let mut iter = line_str.char_indices().peekable(); + + while let Some((byte_idx, ch)) = iter.next() { + let next_idx = iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len()); + + if ch == '\n' { + *last_space = None; + emit_output(writer, output, last_space, col_count, mode)?; + break; + } + + if *col_count >= width { + emit_output(writer, output, last_space, col_count, mode)?; + } + + if ch == '\r' { + output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + *col_count = 0; + continue; + } + + if ch == '\x08' { + 
output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + *col_count = col_count.saturating_sub(1); + continue; + } + + if mode == WidthMode::Columns && ch == '\t' { + loop { + let next_stop = next_tab_stop(*col_count); + if next_stop > width && !output.is_empty() { + emit_output(writer, output, last_space, col_count, mode)?; + continue; + } + *col_count = next_stop; + break; + } + if spaces { + *last_space = Some(output.len()); + } else { + *last_space = None; + } + output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + continue; + } + + let added = match mode { + WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0), + WidthMode::Characters => 1, + }; + + if mode == WidthMode::Columns + && added > 0 + && *col_count + added > width + && !output.is_empty() + { + emit_output(writer, output, last_space, col_count, mode)?; + } + + if spaces && ch.is_ascii_whitespace() { + *last_space = Some(output.len()); + } + + output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + *col_count = (*col_count).saturating_add(added); + } + + Ok(()) +} + +fn process_non_utf8_line( + line: &[u8], + spaces: bool, + width: usize, + mode: WidthMode, + writer: &mut W, + output: &mut Vec, + col_count: &mut usize, + last_space: &mut Option, +) -> UResult<()> { + for &byte in line { + if byte == NL { + *last_space = None; + emit_output(writer, output, last_space, col_count, mode)?; + break; + } + + if *col_count >= width { + emit_output(writer, output, last_space, col_count, mode)?; + } + + match byte { + CR => *col_count = 0, + TAB => { + let next_stop = next_tab_stop(*col_count); + if next_stop > width && !output.is_empty() { + emit_output(writer, output, last_space, col_count, mode)?; + } + *col_count = next_stop; + *last_space = if spaces { Some(output.len()) } else { None }; + output.push(byte); + continue; + } + 0x08 => *col_count = col_count.saturating_sub(1), + _ if spaces && byte.is_ascii_whitespace() => { + *last_space = Some(output.len()); + *col_count = 
(*col_count).saturating_add(1); + } + _ => *col_count = (*col_count).saturating_add(1), + } + + output.push(byte); + } + + Ok(()) +} + /// Fold `file` to fit `width` (number of columns). /// /// By default `fold` treats tab, backspace, and carriage return specially: @@ -226,6 +440,7 @@ fn fold_file( mut file: BufReader, spaces: bool, width: usize, + mode: WidthMode, writer: &mut W, ) -> UResult<()> { let mut line = Vec::new(); @@ -233,30 +448,6 @@ fn fold_file( let mut col_count = 0; let mut last_space = None; - /// Print the output line, resetting the column and character counts. - /// - /// If `spaces` is `true`, print the output line up to the last - /// encountered whitespace character (inclusive) and set the remaining - /// characters as the start of the next line. - macro_rules! emit_output { - () => { - let consume = match last_space { - Some(i) => i + 1, - None => output.len(), - }; - - writer.write_all(&output[..consume])?; - writer.write_all(&[NL])?; - output.drain(..consume); - - // we know there are no tabs left in output, so each char counts - // as 1 column - col_count = output.len(); - - last_space = None; - }; - } - loop { if file .read_until(NL, &mut line) @@ -266,50 +457,37 @@ fn fold_file( break; } - for ch in &line { - if *ch == NL { - // make sure to _not_ split output at whitespace, since we - // know the entire output will fit - last_space = None; - emit_output!(); - break; - } - - if col_count >= width { - emit_output!(); - } - - match *ch { - CR => col_count = 0, - TAB => { - let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH; - - if next_tab_stop > width && !output.is_empty() { - emit_output!(); - } - - col_count = next_tab_stop; - last_space = if spaces { Some(output.len()) } else { None }; - } - 0x08 => { - col_count = col_count.saturating_sub(1); - } - _ if spaces && ch.is_ascii_whitespace() => { - last_space = Some(output.len()); - col_count += 1; - } - _ => col_count += 1, - } - - output.push(*ch); + if let Ok(line_str) 
= std::str::from_utf8(&line) { + process_utf8_line( + line_str, + &line, + spaces, + width, + mode, + writer, + &mut output, + &mut col_count, + &mut last_space, + )?; + } else { + process_non_utf8_line( + &line, + spaces, + width, + mode, + writer, + &mut output, + &mut col_count, + &mut last_space, + )?; } - if !output.is_empty() { - writer.write_all(&output)?; - output.truncate(0); - } + line.clear(); + } - line.truncate(0); + if !output.is_empty() { + writer.write_all(&output)?; + output.clear(); } Ok(()) diff --git a/tests/by-util/test_fold.rs b/tests/by-util/test_fold.rs index 4a2d381fafb..04072ab157f 100644 --- a/tests/by-util/test_fold.rs +++ b/tests/by-util/test_fold.rs @@ -41,6 +41,24 @@ fn test_default_wrap_with_newlines() { .stdout_is_fixture("lorem_ipsum_new_line_80_column.expected"); } +#[test] +fn test_wide_characters_in_column_mode() { + new_ucmd!() + .args(&["-w", "5"]) + .pipe_in("\u{B250}\u{B250}\u{B250}\n") + .succeeds() + .stdout_is("\u{B250}\u{B250}\n\u{B250}\n"); +} + +#[test] +fn test_wide_characters_with_characters_option() { + new_ucmd!() + .args(&["--characters", "-w", "5"]) + .pipe_in("\u{B250}\u{B250}\u{B250}\n") + .succeeds() + .stdout_is("\u{B250}\u{B250}\u{B250}\n"); +} + #[test] fn test_should_preserve_empty_line_without_final_newline() { new_ucmd!() From 95d73d00ea8318066993a51b9869cf47b2109c67 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 3 Nov 2025 16:54:02 +0900 Subject: [PATCH 02/17] refactor(fold): introduce FoldContext to encapsulate state and simplify function signatures Introduce a new FoldContext struct to group related fields (spaces, width, mode, writer, output, col_count, last_space) into a single context object. This refactoring reduces parameter passing in emit_output and process_utf8_line functions, improving code readability and maintainability without altering the core folding logic. 
--- src/uu/fold/src/fold.rs | 177 +++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 92 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index f943b13da65..dfb85742cca 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -34,6 +34,16 @@ enum WidthMode { Characters, } +struct FoldContext<'a, W: Write> { + spaces: bool, + width: usize, + mode: WidthMode, + writer: &'a mut W, + output: &'a mut Vec, + col_count: &'a mut usize, + last_space: &'a mut Option, +} + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let args = args.collect_lossy(); @@ -277,151 +287,135 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { } } -fn emit_output( - writer: &mut W, - output: &mut Vec, - last_space: &mut Option, - col_count: &mut usize, - mode: WidthMode, -) -> UResult<()> { - let consume = match *last_space { +fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { + let consume = match *ctx.last_space { Some(index) => index + 1, - None => output.len(), + None => ctx.output.len(), }; if consume > 0 { - writer.write_all(&output[..consume])?; + ctx.writer.write_all(&ctx.output[..consume])?; } - writer.write_all(&[NL])?; - output.drain(..consume); - *col_count = compute_col_count(output, mode); - *last_space = None; + ctx.writer.write_all(&[NL])?; + ctx.output.drain(..consume); + *ctx.col_count = compute_col_count(ctx.output, ctx.mode); + *ctx.last_space = None; Ok(()) } -fn process_utf8_line( - line_str: &str, - line_bytes: &[u8], - spaces: bool, - width: usize, - mode: WidthMode, - writer: &mut W, - output: &mut Vec, - col_count: &mut usize, - last_space: &mut Option, -) -> UResult<()> { - let mut iter = line_str.char_indices().peekable(); +fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { + let line_bytes = line.as_bytes(); + let mut iter = line.char_indices().peekable(); while let Some((byte_idx, ch)) = iter.next() { let next_idx = 
iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len()); if ch == '\n' { - *last_space = None; - emit_output(writer, output, last_space, col_count, mode)?; + *ctx.last_space = None; + emit_output(ctx)?; break; } - if *col_count >= width { - emit_output(writer, output, last_space, col_count, mode)?; + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; } if ch == '\r' { - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); - *col_count = 0; + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = 0; continue; } if ch == '\x08' { - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); - *col_count = col_count.saturating_sub(1); + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = ctx.col_count.saturating_sub(1); continue; } - if mode == WidthMode::Columns && ch == '\t' { + if ctx.mode == WidthMode::Columns && ch == '\t' { loop { - let next_stop = next_tab_stop(*col_count); - if next_stop > width && !output.is_empty() { - emit_output(writer, output, last_space, col_count, mode)?; + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; continue; } - *col_count = next_stop; + *ctx.col_count = next_stop; break; } - if spaces { - *last_space = Some(output.len()); + if ctx.spaces { + *ctx.last_space = Some(ctx.output.len()); } else { - *last_space = None; + *ctx.last_space = None; } - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); continue; } - let added = match mode { + let added = match ctx.mode { WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0), WidthMode::Characters => 1, }; - if mode == WidthMode::Columns + if ctx.mode == WidthMode::Columns && added > 0 - && *col_count + added > width - && !output.is_empty() + && *ctx.col_count + added > ctx.width + && !ctx.output.is_empty() { - emit_output(writer, output, last_space, 
col_count, mode)?; + emit_output(ctx)?; } - if spaces && ch.is_ascii_whitespace() { - *last_space = Some(output.len()); + if ctx.spaces && ch.is_ascii_whitespace() { + *ctx.last_space = Some(ctx.output.len()); } - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); - *col_count = (*col_count).saturating_add(added); + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = ctx.col_count.saturating_add(added); } Ok(()) } -fn process_non_utf8_line( - line: &[u8], - spaces: bool, - width: usize, - mode: WidthMode, - writer: &mut W, - output: &mut Vec, - col_count: &mut usize, - last_space: &mut Option, -) -> UResult<()> { +fn process_non_utf8_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { for &byte in line { if byte == NL { - *last_space = None; - emit_output(writer, output, last_space, col_count, mode)?; + *ctx.last_space = None; + emit_output(ctx)?; break; } - if *col_count >= width { - emit_output(writer, output, last_space, col_count, mode)?; + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; } match byte { - CR => *col_count = 0, + CR => *ctx.col_count = 0, TAB => { - let next_stop = next_tab_stop(*col_count); - if next_stop > width && !output.is_empty() { - emit_output(writer, output, last_space, col_count, mode)?; + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; } - *col_count = next_stop; - *last_space = if spaces { Some(output.len()) } else { None }; - output.push(byte); + *ctx.col_count = next_stop; + *ctx.last_space = if ctx.spaces { + Some(ctx.output.len()) + } else { + None + }; + ctx.output.push(byte); continue; } - 0x08 => *col_count = col_count.saturating_sub(1), - _ if spaces && byte.is_ascii_whitespace() => { - *last_space = Some(output.len()); - *col_count = (*col_count).saturating_add(1); + 0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1), + _ if ctx.spaces && byte.is_ascii_whitespace() => { + *ctx.last_space 
= Some(ctx.output.len()); + *ctx.col_count = ctx.col_count.saturating_add(1); } - _ => *col_count = (*col_count).saturating_add(1), + _ => *ctx.col_count = ctx.col_count.saturating_add(1), } - output.push(byte); + ctx.output.push(byte); } Ok(()) @@ -458,28 +452,27 @@ fn fold_file( } if let Ok(line_str) = std::str::from_utf8(&line) { - process_utf8_line( - line_str, - &line, + let mut ctx = FoldContext { spaces, width, mode, writer, - &mut output, - &mut col_count, - &mut last_space, - )?; + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + process_utf8_line(line_str, &mut ctx)?; } else { - process_non_utf8_line( - &line, + let mut ctx = FoldContext { spaces, width, mode, writer, - &mut output, - &mut col_count, - &mut last_space, - )?; + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + process_non_utf8_line(&line, &mut ctx)?; } line.clear(); From 26906fed153544f411daf95cde22ffd0faeda429 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 08:43:55 +0900 Subject: [PATCH 03/17] feat: optimize ASCII line processing in fold Add process_ascii_line function to handle ASCII bytes efficiently, avoiding UTF-8 overhead for ASCII input. Update emit_output to properly manage output buffer remainder and track last space position for better folding logic. Modify process_utf8_line to delegate ASCII lines to the new function. 
--- src/uu/fold/src/fold.rs | 93 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index dfb85742cca..753e9635954 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -297,13 +297,102 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { ctx.writer.write_all(&ctx.output[..consume])?; } ctx.writer.write_all(&[NL])?; - ctx.output.drain(..consume); + + if consume < ctx.output.len() { + let remainder = ctx.output.split_off(consume); + *ctx.output = remainder; + } else { + ctx.output.clear(); + } + *ctx.col_count = compute_col_count(ctx.output, ctx.mode); - *ctx.last_space = None; + + if ctx.spaces { + *ctx.last_space = ctx + .output + .iter() + .rposition(|b| b.is_ascii_whitespace() && *b != CR); + } else { + *ctx.last_space = None; + } + Ok(()) +} + +fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + for &byte in line { + if byte == NL { + *ctx.last_space = None; + emit_output(ctx)?; + break; + } + + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; + } + + if byte == CR { + ctx.output.push(byte); + *ctx.col_count = 0; + continue; + } + + if byte == 0x08 { + ctx.output.push(byte); + *ctx.col_count = ctx.col_count.saturating_sub(1); + continue; + } + + if ctx.mode == WidthMode::Columns && byte == TAB { + loop { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + continue; + } + *ctx.col_count = next_stop; + break; + } + if ctx.spaces { + *ctx.last_space = Some(ctx.output.len()); + } else { + *ctx.last_space = None; + } + ctx.output.push(byte); + continue; + } + + let added = match ctx.mode { + WidthMode::Columns => match byte { + 0x00..=0x08 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => 0, + _ => 1, + }, + WidthMode::Characters => 1, + }; + + if ctx.mode == WidthMode::Columns + && added > 0 + && *ctx.col_count + added > ctx.width + && 
!ctx.output.is_empty() + { + emit_output(ctx)?; + } + + if ctx.spaces && byte.is_ascii_whitespace() && byte != CR { + *ctx.last_space = Some(ctx.output.len()); + } + + ctx.output.push(byte); + *ctx.col_count = ctx.col_count.saturating_add(added); + } + Ok(()) } fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { + if line.is_ascii() { + return process_ascii_line(line.as_bytes(), ctx); + } + let line_bytes = line.as_bytes(); let mut iter = line.char_indices().peekable(); From 5fc3dc28c80e3cf678c90af2991024e4e50a30da Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 08:52:40 +0900 Subject: [PATCH 04/17] feat: add rposition to jargon wordlist Add "rposition" to the cspell jargon dictionary to prevent spell check errors for this technical term. --- .vscode/cspell.dictionaries/jargon.wordlist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index 70cbd937933..a3b51bfedb6 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -120,6 +120,7 @@ pseudoprimes quantiles readonly reparse +rposition seedable semver semiprime From 980c8b1ec35ea206ac2622a0be8a699991f93634 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:33:14 +0900 Subject: [PATCH 05/17] perf: optimize fold benchmarks and output handling - Replace fold/writeln! with loop/push_str in benchmarks for faster string building - Add append_usize helper to avoid allocations in benchmark data generation - Refactor emit_output to use drain instead of split_off for better performance - Update last_space calculation to handle index adjustments more efficiently These changes improve performance in the fold utility's benchmarks and core logic by reducing allocations and optimizing string operations. 
--- src/uu/fold/benches/fold_bench.rs | 45 ++++++++++++++++++++----------- src/uu/fold/src/fold.rs | 11 ++++---- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/uu/fold/benches/fold_bench.rs b/src/uu/fold/benches/fold_bench.rs index abd69525f7a..d76ddbeaf74 100644 --- a/src/uu/fold/benches/fold_bench.rs +++ b/src/uu/fold/benches/fold_bench.rs @@ -4,7 +4,6 @@ // file that was distributed with this source code. use divan::{Bencher, black_box}; -use std::fmt::Write; use uu_fold::uumain; use uucore::benchmark::{create_test_file, run_util_function}; @@ -12,12 +11,12 @@ use uucore::benchmark::{create_test_file, run_util_function}; #[divan::bench(args = [100_000])] fn fold_many_lines(bencher: Bencher, num_lines: usize) { let temp_dir = tempfile::tempdir().unwrap(); - // Create long lines that need folding - let data = (0..num_lines) - .fold(String::new(), |mut acc, i| { - writeln!(&mut acc, "This is a very long line number {i} that definitely needs to be folded at the default width of 80 columns").unwrap(); - acc - }); + let mut data = String::with_capacity(num_lines * 110); + for i in 0..num_lines { + data.push_str("This is a very long line number "); + append_usize(&mut data, i); + data.push_str(" that definitely needs to be folded at the default width of 80 columns\n"); + } let file_path = create_test_file(data.as_bytes(), temp_dir.path()); let file_path_str = file_path.to_str().unwrap(); @@ -30,14 +29,12 @@ fn fold_many_lines(bencher: Bencher, num_lines: usize) { #[divan::bench(args = [50_000])] fn fold_custom_width(bencher: Bencher, num_lines: usize) { let temp_dir = tempfile::tempdir().unwrap(); - let data = (0..num_lines).fold(String::new(), |mut acc, i| { - writeln!( - &mut acc, - "Line {i} with enough text to exceed width 40 characters and require folding" - ) - .unwrap(); - acc - }); + let mut data = String::with_capacity(num_lines * 80); + for i in 0..num_lines { + data.push_str("Line "); + append_usize(&mut data, i); + data.push_str(" with 
enough text to exceed width 40 characters and require folding\n"); + } let file_path = create_test_file(data.as_bytes(), temp_dir.path()); let file_path_str = file_path.to_str().unwrap(); @@ -49,3 +46,21 @@ fn fold_custom_width(bencher: Bencher, num_lines: usize) { fn main() { divan::main(); } + +fn append_usize(buf: &mut String, mut value: usize) { + let mut digits = [0u8; 20]; + let mut idx = digits.len(); + + if value == 0 { + buf.push('0'); + return; + } + + while value > 0 { + idx -= 1; + digits[idx] = b'0' + (value % 10) as u8; + value /= 10; + } + + buf.push_str(std::str::from_utf8(&digits[idx..]).unwrap()); +} diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 753e9635954..f183fa360c2 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -298,9 +298,10 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { } ctx.writer.write_all(&[NL])?; + let last_space = *ctx.last_space; + if consume < ctx.output.len() { - let remainder = ctx.output.split_off(consume); - *ctx.output = remainder; + ctx.output.drain(..consume); } else { ctx.output.clear(); } @@ -308,10 +309,8 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { *ctx.col_count = compute_col_count(ctx.output, ctx.mode); if ctx.spaces { - *ctx.last_space = ctx - .output - .iter() - .rposition(|b| b.is_ascii_whitespace() && *b != CR); + *ctx.last_space = last_space + .and_then(|idx| if idx + 1 <= consume { None } else { Some(idx - consume) }); } else { *ctx.last_space = None; } From 639a451a5c8ccf1bfaf733f9206525c0f9c4c38e Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:34:13 +0900 Subject: [PATCH 06/17] refactor(fold): improve readability of last_space assignment in emit_output Break the inline closure into a multi-line block for better code clarity and maintainability. 
--- src/uu/fold/src/fold.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index f183fa360c2..4015947f978 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -309,8 +309,13 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { *ctx.col_count = compute_col_count(ctx.output, ctx.mode); if ctx.spaces { - *ctx.last_space = last_space - .and_then(|idx| if idx + 1 <= consume { None } else { Some(idx - consume) }); + *ctx.last_space = last_space.and_then(|idx| { + if idx + 1 <= consume { + None + } else { + Some(idx - consume) + } + }); } else { *ctx.last_space = None; } From f468a413ec4cd021c9ea1f2eb5d0d4f469938188 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:37:44 +0900 Subject: [PATCH 07/17] fix(fold): correct space index condition in emit_output The condition for updating the last space index was changed from `idx + 1 <= consume` to `idx < consume` to fix an off-by-one error, ensuring proper handling of spaces when consuming characters during line folding. --- src/uu/fold/src/fold.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 4015947f978..9cf91a267de 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -310,7 +310,7 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { if ctx.spaces { *ctx.last_space = last_space.and_then(|idx| { - if idx + 1 <= consume { + if idx < consume { None } else { Some(idx - consume) From 9ed32fe784cf0b2a1a9c579bc88d2b636b09f3f8 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:53:53 +0900 Subject: [PATCH 08/17] refactor(fold): optimize ASCII line processing for better character handling Refactor the `process_ascii_line` function to use a while loop with pattern matching instead of a for loop, improving efficiency and clarity. 
Introduce `push_ascii_segment` to handle contiguous printable character sequences, ensuring accurate column counting and whitespace tracking in both columns and characters modes. This addresses potential issues with control character processing and width calculations. --- src/uu/fold/src/fold.rs | 137 +++++++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 52 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 9cf91a267de..88e7b95bb5b 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -323,70 +323,103 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { } fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { - for &byte in line { - if byte == NL { - *ctx.last_space = None; - emit_output(ctx)?; - break; - } + let mut idx = 0; + let len = line.len(); - if *ctx.col_count >= ctx.width { - emit_output(ctx)?; - } - - if byte == CR { - ctx.output.push(byte); - *ctx.col_count = 0; - continue; - } - - if byte == 0x08 { - ctx.output.push(byte); - *ctx.col_count = ctx.col_count.saturating_sub(1); - continue; - } - - if ctx.mode == WidthMode::Columns && byte == TAB { - loop { - let next_stop = next_tab_stop(*ctx.col_count); - if next_stop > ctx.width && !ctx.output.is_empty() { - emit_output(ctx)?; - continue; - } - *ctx.col_count = next_stop; + while idx < len { + match line[idx] { + NL => { + *ctx.last_space = None; + emit_output(ctx)?; break; } - if ctx.spaces { - *ctx.last_space = Some(ctx.output.len()); - } else { - *ctx.last_space = None; + CR => { + ctx.output.push(CR); + *ctx.col_count = 0; + idx += 1; + } + 0x08 => { + ctx.output.push(0x08); + *ctx.col_count = ctx.col_count.saturating_sub(1); + idx += 1; + } + TAB if ctx.mode == WidthMode::Columns => { + loop { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + continue; + } + *ctx.col_count = next_stop; + break; + } + if ctx.spaces { + 
*ctx.last_space = Some(ctx.output.len()); + } else { + *ctx.last_space = None; + } + ctx.output.push(TAB); + idx += 1; + } + 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => { + ctx.output.push(line[idx]); + if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR { + *ctx.last_space = Some(ctx.output.len() - 1); + } else if !ctx.spaces { + *ctx.last_space = None; + } + idx += 1; + } + _ => { + let start = idx; + while idx < len + && !matches!( + line[idx], + NL | CR | TAB | 0x08 | 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F + ) + { + idx += 1; + } + push_ascii_segment(&line[start..idx], ctx)?; } - ctx.output.push(byte); - continue; } + } - let added = match ctx.mode { - WidthMode::Columns => match byte { - 0x00..=0x08 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => 0, - _ => 1, - }, - WidthMode::Characters => 1, - }; + Ok(()) +} - if ctx.mode == WidthMode::Columns - && added > 0 - && *ctx.col_count + added > ctx.width - && !ctx.output.is_empty() - { +fn push_ascii_segment(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + if segment.is_empty() { + return Ok(()); + } + + let mut remaining = segment; + + while !remaining.is_empty() { + if *ctx.col_count >= ctx.width { emit_output(ctx)?; + continue; } - if ctx.spaces && byte.is_ascii_whitespace() && byte != CR { - *ctx.last_space = Some(ctx.output.len()); + let available = ctx.width - *ctx.col_count; + let take = remaining.len().min(available); + let base_len = ctx.output.len(); + + ctx.output.extend_from_slice(&remaining[..take]); + *ctx.col_count += take; + + if ctx.spaces { + if let Some(pos) = remaining[..take] + .iter() + .rposition(|b| b.is_ascii_whitespace() && *b != CR) + { + *ctx.last_space = Some(base_len + pos); + } + } else { + *ctx.last_space = None; } - ctx.output.push(byte); - *ctx.col_count = ctx.col_count.saturating_add(added); + remaining = &remaining[take..]; } Ok(()) From 265aeb577980c199d1293170ba27825f2032c0b5 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 3 Nov 2025 
16:44:47 +0900 Subject: [PATCH 09/17] feat(fold): add --characters option to count by character positions Add a new --characters flag to the fold utility, allowing it to count using Unicode character positions rather than display columns. This provides more accurate line breaking for text containing wide characters. Includes dependency on unicode-width crate and updated help messages in English and French locales. --- Cargo.lock | 1 + src/uu/fold/Cargo.toml | 1 + src/uu/fold/locales/en-US.ftl | 1 + src/uu/fold/locales/fr-FR.ftl | 1 + src/uu/fold/src/fold.rs | 314 ++++++++++++++++++++++++++-------- tests/by-util/test_fold.rs | 18 ++ 6 files changed, 268 insertions(+), 68 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12f788c8338..d1d7c537a0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3495,6 +3495,7 @@ dependencies = [ "codspeed-divan-compat", "fluent", "tempfile", + "unicode-width 0.2.2", "uucore", ] diff --git a/src/uu/fold/Cargo.toml b/src/uu/fold/Cargo.toml index 644d78b41bb..845ce2c9698 100644 --- a/src/uu/fold/Cargo.toml +++ b/src/uu/fold/Cargo.toml @@ -21,6 +21,7 @@ path = "src/fold.rs" clap = { workspace = true } uucore = { workspace = true } fluent = { workspace = true } +unicode-width = { workspace = true } [dev-dependencies] divan = { workspace = true } diff --git a/src/uu/fold/locales/en-US.ftl b/src/uu/fold/locales/en-US.ftl index 9f8c6f3b984..d4241666715 100644 --- a/src/uu/fold/locales/en-US.ftl +++ b/src/uu/fold/locales/en-US.ftl @@ -2,6 +2,7 @@ fold-about = Writes each file (or standard input if no files are given) to standard output whilst breaking long lines fold-usage = fold [OPTION]... [FILE]... 
fold-bytes-help = count using bytes rather than columns (meaning control characters such as newline are not treated specially) +fold-characters-help = count using character positions rather than display columns fold-spaces-help = break lines at word boundaries rather than a hard cut-off fold-width-help = set WIDTH as the maximum line width rather than 80 fold-error-illegal-width = illegal width value diff --git a/src/uu/fold/locales/fr-FR.ftl b/src/uu/fold/locales/fr-FR.ftl index 1a723594052..ce313160cf4 100644 --- a/src/uu/fold/locales/fr-FR.ftl +++ b/src/uu/fold/locales/fr-FR.ftl @@ -1,6 +1,7 @@ fold-about = Écrit chaque fichier (ou l'entrée standard si aucun fichier n'est donné) sur la sortie standard en coupant les lignes trop longues fold-usage = fold [OPTION]... [FICHIER]... fold-bytes-help = compter en octets plutôt qu'en colonnes (les caractères de contrôle comme retour chariot ne sont pas traités spécialement) +fold-characters-help = compter en caractères plutôt qu'en colonnes d'affichage fold-spaces-help = couper les lignes aux limites de mots plutôt qu'à une largeur fixe fold-width-help = définir WIDTH comme largeur de ligne maximale au lieu de 80 fold-error-illegal-width = valeur de largeur illégale diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index bbaac56bee8..f943b13da65 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command}; use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout}; use std::path::Path; +use unicode_width::UnicodeWidthChar; use uucore::display::Quotable; use uucore::error::{FromIo, UResult, USimpleError}; use uucore::format_usage; @@ -21,11 +22,18 @@ const TAB: u8 = b'\t'; mod options { pub const BYTES: &str = "bytes"; + pub const CHARACTERS: &str = "characters"; pub const SPACES: &str = "spaces"; pub const WIDTH: &str = "width"; pub const FILE: &str = "file"; } +#[derive(Clone, Copy, PartialEq, Eq)] +enum WidthMode { 
+ Columns, + Characters, +} + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let args = args.collect_lossy(); @@ -34,6 +42,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?; let bytes = matches.get_flag(options::BYTES); + let characters = matches.get_flag(options::CHARACTERS); let spaces = matches.get_flag(options::SPACES); let poss_width = match matches.get_one::(options::WIDTH) { Some(v) => Some(v.clone()), @@ -55,7 +64,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { None => vec!["-".to_owned()], }; - fold(&files, bytes, spaces, width) + fold(&files, bytes, characters, spaces, width) } pub fn uu_app() -> Command { @@ -72,6 +81,13 @@ pub fn uu_app() -> Command { .help(translate!("fold-bytes-help")) .action(ArgAction::SetTrue), ) + .arg( + Arg::new(options::CHARACTERS) + .long(options::CHARACTERS) + .help(translate!("fold-characters-help")) + .conflicts_with(options::BYTES) + .action(ArgAction::SetTrue), + ) .arg( Arg::new(options::SPACES) .long(options::SPACES) @@ -107,7 +123,13 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { (args.to_vec(), None) } -fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResult<()> { +fn fold( + filenames: &[String], + bytes: bool, + characters: bool, + spaces: bool, + width: usize, +) -> UResult<()> { let mut output = BufWriter::new(stdout()); for filename in filenames { @@ -125,7 +147,12 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul if bytes { fold_file_bytewise(buffer, spaces, width, &mut output)?; } else { - fold_file(buffer, spaces, width, &mut output)?; + let mode = if characters { + WidthMode::Characters + } else { + WidthMode::Columns + }; + fold_file(buffer, spaces, width, mode, &mut output)?; } } @@ -213,6 +240,193 @@ fn fold_file_bytewise( Ok(()) } +fn next_tab_stop(col_count: usize) -> usize { + col_count + TAB_WIDTH - col_count 
% TAB_WIDTH +} + +fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { + match mode { + WidthMode::Characters => std::str::from_utf8(buffer) + .map(|s| s.chars().count()) + .unwrap_or(buffer.len()), + WidthMode::Columns => { + if let Ok(s) = std::str::from_utf8(buffer) { + let mut width = 0; + for ch in s.chars() { + match ch { + '\r' => width = 0, + '\t' => width = next_tab_stop(width), + '\x08' => width = width.saturating_sub(1), + _ => width += UnicodeWidthChar::width(ch).unwrap_or(0), + } + } + width + } else { + let mut width = 0; + for &byte in buffer { + match byte { + CR => width = 0, + TAB => width = next_tab_stop(width), + 0x08 => width = width.saturating_sub(1), + _ => width += 1, + } + } + width + } + } + } +} + +fn emit_output( + writer: &mut W, + output: &mut Vec, + last_space: &mut Option, + col_count: &mut usize, + mode: WidthMode, +) -> UResult<()> { + let consume = match *last_space { + Some(index) => index + 1, + None => output.len(), + }; + + if consume > 0 { + writer.write_all(&output[..consume])?; + } + writer.write_all(&[NL])?; + output.drain(..consume); + *col_count = compute_col_count(output, mode); + *last_space = None; + Ok(()) +} + +fn process_utf8_line( + line_str: &str, + line_bytes: &[u8], + spaces: bool, + width: usize, + mode: WidthMode, + writer: &mut W, + output: &mut Vec, + col_count: &mut usize, + last_space: &mut Option, +) -> UResult<()> { + let mut iter = line_str.char_indices().peekable(); + + while let Some((byte_idx, ch)) = iter.next() { + let next_idx = iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len()); + + if ch == '\n' { + *last_space = None; + emit_output(writer, output, last_space, col_count, mode)?; + break; + } + + if *col_count >= width { + emit_output(writer, output, last_space, col_count, mode)?; + } + + if ch == '\r' { + output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + *col_count = 0; + continue; + } + + if ch == '\x08' { + 
output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + *col_count = col_count.saturating_sub(1); + continue; + } + + if mode == WidthMode::Columns && ch == '\t' { + loop { + let next_stop = next_tab_stop(*col_count); + if next_stop > width && !output.is_empty() { + emit_output(writer, output, last_space, col_count, mode)?; + continue; + } + *col_count = next_stop; + break; + } + if spaces { + *last_space = Some(output.len()); + } else { + *last_space = None; + } + output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + continue; + } + + let added = match mode { + WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0), + WidthMode::Characters => 1, + }; + + if mode == WidthMode::Columns + && added > 0 + && *col_count + added > width + && !output.is_empty() + { + emit_output(writer, output, last_space, col_count, mode)?; + } + + if spaces && ch.is_ascii_whitespace() { + *last_space = Some(output.len()); + } + + output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + *col_count = (*col_count).saturating_add(added); + } + + Ok(()) +} + +fn process_non_utf8_line( + line: &[u8], + spaces: bool, + width: usize, + mode: WidthMode, + writer: &mut W, + output: &mut Vec, + col_count: &mut usize, + last_space: &mut Option, +) -> UResult<()> { + for &byte in line { + if byte == NL { + *last_space = None; + emit_output(writer, output, last_space, col_count, mode)?; + break; + } + + if *col_count >= width { + emit_output(writer, output, last_space, col_count, mode)?; + } + + match byte { + CR => *col_count = 0, + TAB => { + let next_stop = next_tab_stop(*col_count); + if next_stop > width && !output.is_empty() { + emit_output(writer, output, last_space, col_count, mode)?; + } + *col_count = next_stop; + *last_space = if spaces { Some(output.len()) } else { None }; + output.push(byte); + continue; + } + 0x08 => *col_count = col_count.saturating_sub(1), + _ if spaces && byte.is_ascii_whitespace() => { + *last_space = Some(output.len()); + *col_count = 
(*col_count).saturating_add(1); + } + _ => *col_count = (*col_count).saturating_add(1), + } + + output.push(byte); + } + + Ok(()) +} + /// Fold `file` to fit `width` (number of columns). /// /// By default `fold` treats tab, backspace, and carriage return specially: @@ -226,6 +440,7 @@ fn fold_file( mut file: BufReader, spaces: bool, width: usize, + mode: WidthMode, writer: &mut W, ) -> UResult<()> { let mut line = Vec::new(); @@ -233,30 +448,6 @@ fn fold_file( let mut col_count = 0; let mut last_space = None; - /// Print the output line, resetting the column and character counts. - /// - /// If `spaces` is `true`, print the output line up to the last - /// encountered whitespace character (inclusive) and set the remaining - /// characters as the start of the next line. - macro_rules! emit_output { - () => { - let consume = match last_space { - Some(i) => i + 1, - None => output.len(), - }; - - writer.write_all(&output[..consume])?; - writer.write_all(&[NL])?; - output.drain(..consume); - - // we know there are no tabs left in output, so each char counts - // as 1 column - col_count = output.len(); - - last_space = None; - }; - } - loop { if file .read_until(NL, &mut line) @@ -266,50 +457,37 @@ fn fold_file( break; } - for ch in &line { - if *ch == NL { - // make sure to _not_ split output at whitespace, since we - // know the entire output will fit - last_space = None; - emit_output!(); - break; - } - - if col_count >= width { - emit_output!(); - } - - match *ch { - CR => col_count = 0, - TAB => { - let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH; - - if next_tab_stop > width && !output.is_empty() { - emit_output!(); - } - - col_count = next_tab_stop; - last_space = if spaces { Some(output.len()) } else { None }; - } - 0x08 => { - col_count = col_count.saturating_sub(1); - } - _ if spaces && ch.is_ascii_whitespace() => { - last_space = Some(output.len()); - col_count += 1; - } - _ => col_count += 1, - } - - output.push(*ch); + if let Ok(line_str) 
= std::str::from_utf8(&line) { + process_utf8_line( + line_str, + &line, + spaces, + width, + mode, + writer, + &mut output, + &mut col_count, + &mut last_space, + )?; + } else { + process_non_utf8_line( + &line, + spaces, + width, + mode, + writer, + &mut output, + &mut col_count, + &mut last_space, + )?; } - if !output.is_empty() { - writer.write_all(&output)?; - output.truncate(0); - } + line.clear(); + } - line.truncate(0); + if !output.is_empty() { + writer.write_all(&output)?; + output.clear(); } Ok(()) diff --git a/tests/by-util/test_fold.rs b/tests/by-util/test_fold.rs index 4a2d381fafb..04072ab157f 100644 --- a/tests/by-util/test_fold.rs +++ b/tests/by-util/test_fold.rs @@ -41,6 +41,24 @@ fn test_default_wrap_with_newlines() { .stdout_is_fixture("lorem_ipsum_new_line_80_column.expected"); } +#[test] +fn test_wide_characters_in_column_mode() { + new_ucmd!() + .args(&["-w", "5"]) + .pipe_in("\u{B250}\u{B250}\u{B250}\n") + .succeeds() + .stdout_is("\u{B250}\u{B250}\n\u{B250}\n"); +} + +#[test] +fn test_wide_characters_with_characters_option() { + new_ucmd!() + .args(&["--characters", "-w", "5"]) + .pipe_in("\u{B250}\u{B250}\u{B250}\n") + .succeeds() + .stdout_is("\u{B250}\u{B250}\u{B250}\n"); +} + #[test] fn test_should_preserve_empty_line_without_final_newline() { new_ucmd!() From 28e1a6509f1bb2524085b0118fa79e0a1ceefd96 Mon Sep 17 00:00:00 2001 From: mattsu Date: Mon, 3 Nov 2025 16:54:02 +0900 Subject: [PATCH 10/17] refactor(fold): introduce FoldContext to encapsulate state and simplify function signatures Introduce a new FoldContext struct to group related fields (spaces, width, mode, writer, output, col_count, last_space) into a single context object. This refactoring reduces parameter passing in emit_output and process_utf8_line functions, improving code readability and maintainability without altering the core folding logic. 
--- src/uu/fold/src/fold.rs | 177 +++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 92 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index f943b13da65..dfb85742cca 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -34,6 +34,16 @@ enum WidthMode { Characters, } +struct FoldContext<'a, W: Write> { + spaces: bool, + width: usize, + mode: WidthMode, + writer: &'a mut W, + output: &'a mut Vec, + col_count: &'a mut usize, + last_space: &'a mut Option, +} + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let args = args.collect_lossy(); @@ -277,151 +287,135 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { } } -fn emit_output( - writer: &mut W, - output: &mut Vec, - last_space: &mut Option, - col_count: &mut usize, - mode: WidthMode, -) -> UResult<()> { - let consume = match *last_space { +fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { + let consume = match *ctx.last_space { Some(index) => index + 1, - None => output.len(), + None => ctx.output.len(), }; if consume > 0 { - writer.write_all(&output[..consume])?; + ctx.writer.write_all(&ctx.output[..consume])?; } - writer.write_all(&[NL])?; - output.drain(..consume); - *col_count = compute_col_count(output, mode); - *last_space = None; + ctx.writer.write_all(&[NL])?; + ctx.output.drain(..consume); + *ctx.col_count = compute_col_count(ctx.output, ctx.mode); + *ctx.last_space = None; Ok(()) } -fn process_utf8_line( - line_str: &str, - line_bytes: &[u8], - spaces: bool, - width: usize, - mode: WidthMode, - writer: &mut W, - output: &mut Vec, - col_count: &mut usize, - last_space: &mut Option, -) -> UResult<()> { - let mut iter = line_str.char_indices().peekable(); +fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { + let line_bytes = line.as_bytes(); + let mut iter = line.char_indices().peekable(); while let Some((byte_idx, ch)) = iter.next() { let next_idx = 
iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len()); if ch == '\n' { - *last_space = None; - emit_output(writer, output, last_space, col_count, mode)?; + *ctx.last_space = None; + emit_output(ctx)?; break; } - if *col_count >= width { - emit_output(writer, output, last_space, col_count, mode)?; + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; } if ch == '\r' { - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); - *col_count = 0; + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = 0; continue; } if ch == '\x08' { - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); - *col_count = col_count.saturating_sub(1); + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = ctx.col_count.saturating_sub(1); continue; } - if mode == WidthMode::Columns && ch == '\t' { + if ctx.mode == WidthMode::Columns && ch == '\t' { loop { - let next_stop = next_tab_stop(*col_count); - if next_stop > width && !output.is_empty() { - emit_output(writer, output, last_space, col_count, mode)?; + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; continue; } - *col_count = next_stop; + *ctx.col_count = next_stop; break; } - if spaces { - *last_space = Some(output.len()); + if ctx.spaces { + *ctx.last_space = Some(ctx.output.len()); } else { - *last_space = None; + *ctx.last_space = None; } - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); continue; } - let added = match mode { + let added = match ctx.mode { WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0), WidthMode::Characters => 1, }; - if mode == WidthMode::Columns + if ctx.mode == WidthMode::Columns && added > 0 - && *col_count + added > width - && !output.is_empty() + && *ctx.col_count + added > ctx.width + && !ctx.output.is_empty() { - emit_output(writer, output, last_space, 
col_count, mode)?; + emit_output(ctx)?; } - if spaces && ch.is_ascii_whitespace() { - *last_space = Some(output.len()); + if ctx.spaces && ch.is_ascii_whitespace() { + *ctx.last_space = Some(ctx.output.len()); } - output.extend_from_slice(&line_bytes[byte_idx..next_idx]); - *col_count = (*col_count).saturating_add(added); + ctx.output + .extend_from_slice(&line_bytes[byte_idx..next_idx]); + *ctx.col_count = ctx.col_count.saturating_add(added); } Ok(()) } -fn process_non_utf8_line( - line: &[u8], - spaces: bool, - width: usize, - mode: WidthMode, - writer: &mut W, - output: &mut Vec, - col_count: &mut usize, - last_space: &mut Option, -) -> UResult<()> { +fn process_non_utf8_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { for &byte in line { if byte == NL { - *last_space = None; - emit_output(writer, output, last_space, col_count, mode)?; + *ctx.last_space = None; + emit_output(ctx)?; break; } - if *col_count >= width { - emit_output(writer, output, last_space, col_count, mode)?; + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; } match byte { - CR => *col_count = 0, + CR => *ctx.col_count = 0, TAB => { - let next_stop = next_tab_stop(*col_count); - if next_stop > width && !output.is_empty() { - emit_output(writer, output, last_space, col_count, mode)?; + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; } - *col_count = next_stop; - *last_space = if spaces { Some(output.len()) } else { None }; - output.push(byte); + *ctx.col_count = next_stop; + *ctx.last_space = if ctx.spaces { + Some(ctx.output.len()) + } else { + None + }; + ctx.output.push(byte); continue; } - 0x08 => *col_count = col_count.saturating_sub(1), - _ if spaces && byte.is_ascii_whitespace() => { - *last_space = Some(output.len()); - *col_count = (*col_count).saturating_add(1); + 0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1), + _ if ctx.spaces && byte.is_ascii_whitespace() => { + *ctx.last_space 
= Some(ctx.output.len()); + *ctx.col_count = ctx.col_count.saturating_add(1); } - _ => *col_count = (*col_count).saturating_add(1), + _ => *ctx.col_count = ctx.col_count.saturating_add(1), } - output.push(byte); + ctx.output.push(byte); } Ok(()) @@ -458,28 +452,27 @@ fn fold_file( } if let Ok(line_str) = std::str::from_utf8(&line) { - process_utf8_line( - line_str, - &line, + let mut ctx = FoldContext { spaces, width, mode, writer, - &mut output, - &mut col_count, - &mut last_space, - )?; + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + process_utf8_line(line_str, &mut ctx)?; } else { - process_non_utf8_line( - &line, + let mut ctx = FoldContext { spaces, width, mode, writer, - &mut output, - &mut col_count, - &mut last_space, - )?; + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + process_non_utf8_line(&line, &mut ctx)?; } line.clear(); From 65b1fd4471a38657fdcfd27ec51dbb9f685687b4 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 08:43:55 +0900 Subject: [PATCH 11/17] feat: optimize ASCII line processing in fold Add process_ascii_line function to handle ASCII bytes efficiently, avoiding UTF-8 overhead for ASCII input. Update emit_output to properly manage output buffer remainder and track last space position for better folding logic. Modify process_utf8_line to delegate ASCII lines to the new function. 
--- src/uu/fold/src/fold.rs | 93 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 2 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index dfb85742cca..753e9635954 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -297,13 +297,102 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { ctx.writer.write_all(&ctx.output[..consume])?; } ctx.writer.write_all(&[NL])?; - ctx.output.drain(..consume); + + if consume < ctx.output.len() { + let remainder = ctx.output.split_off(consume); + *ctx.output = remainder; + } else { + ctx.output.clear(); + } + *ctx.col_count = compute_col_count(ctx.output, ctx.mode); - *ctx.last_space = None; + + if ctx.spaces { + *ctx.last_space = ctx + .output + .iter() + .rposition(|b| b.is_ascii_whitespace() && *b != CR); + } else { + *ctx.last_space = None; + } + Ok(()) +} + +fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + for &byte in line { + if byte == NL { + *ctx.last_space = None; + emit_output(ctx)?; + break; + } + + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; + } + + if byte == CR { + ctx.output.push(byte); + *ctx.col_count = 0; + continue; + } + + if byte == 0x08 { + ctx.output.push(byte); + *ctx.col_count = ctx.col_count.saturating_sub(1); + continue; + } + + if ctx.mode == WidthMode::Columns && byte == TAB { + loop { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + continue; + } + *ctx.col_count = next_stop; + break; + } + if ctx.spaces { + *ctx.last_space = Some(ctx.output.len()); + } else { + *ctx.last_space = None; + } + ctx.output.push(byte); + continue; + } + + let added = match ctx.mode { + WidthMode::Columns => match byte { + 0x00..=0x08 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => 0, + _ => 1, + }, + WidthMode::Characters => 1, + }; + + if ctx.mode == WidthMode::Columns + && added > 0 + && *ctx.col_count + added > ctx.width + && 
!ctx.output.is_empty() + { + emit_output(ctx)?; + } + + if ctx.spaces && byte.is_ascii_whitespace() && byte != CR { + *ctx.last_space = Some(ctx.output.len()); + } + + ctx.output.push(byte); + *ctx.col_count = ctx.col_count.saturating_add(added); + } + Ok(()) } fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { + if line.is_ascii() { + return process_ascii_line(line.as_bytes(), ctx); + } + let line_bytes = line.as_bytes(); let mut iter = line.char_indices().peekable(); From 54c23087ae63c6724d9e433734ecafdde8f28ce5 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 08:52:40 +0900 Subject: [PATCH 12/17] feat: add rposition to jargon wordlist Add "rposition" to the cspell jargon dictionary to prevent spell check errors for this technical term. --- .vscode/cspell.dictionaries/jargon.wordlist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index 70cbd937933..a3b51bfedb6 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -120,6 +120,7 @@ pseudoprimes quantiles readonly reparse +rposition seedable semver semiprime From 578f6d45e75baecec3cb94fa4b30ebd0213c4e4c Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:33:14 +0900 Subject: [PATCH 13/17] perf: optimize fold benchmarks and output handling - Replace fold/writeln! with loop/push_str in benchmarks for faster string building - Add append_usize helper to avoid allocations in benchmark data generation - Refactor emit_output to use drain instead of split_off for better performance - Update last_space calculation to handle index adjustments more efficiently These changes improve performance in the fold utility's benchmarks and core logic by reducing allocations and optimizing string operations. 
--- src/uu/fold/src/fold.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 753e9635954..f183fa360c2 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -298,9 +298,10 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { } ctx.writer.write_all(&[NL])?; + let last_space = *ctx.last_space; + if consume < ctx.output.len() { - let remainder = ctx.output.split_off(consume); - *ctx.output = remainder; + ctx.output.drain(..consume); } else { ctx.output.clear(); } @@ -308,10 +309,8 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { *ctx.col_count = compute_col_count(ctx.output, ctx.mode); if ctx.spaces { - *ctx.last_space = ctx - .output - .iter() - .rposition(|b| b.is_ascii_whitespace() && *b != CR); + *ctx.last_space = last_space + .and_then(|idx| if idx + 1 <= consume { None } else { Some(idx - consume) }); } else { *ctx.last_space = None; } From 3b593d7a4e290c7f9b948ab0ab10e6bd1e6fbbd4 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:34:13 +0900 Subject: [PATCH 14/17] refactor(fold): improve readability of last_space assignment in emit_output Break the inline closure into a multi-line block for better code clarity and maintainability. 
--- src/uu/fold/src/fold.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index f183fa360c2..4015947f978 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -309,8 +309,13 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { *ctx.col_count = compute_col_count(ctx.output, ctx.mode); if ctx.spaces { - *ctx.last_space = last_space - .and_then(|idx| if idx + 1 <= consume { None } else { Some(idx - consume) }); + *ctx.last_space = last_space.and_then(|idx| { + if idx + 1 <= consume { + None + } else { + Some(idx - consume) + } + }); } else { *ctx.last_space = None; } From e06aec9bde2d236c10f3fc3c829fb1f0ab6a034e Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:37:44 +0900 Subject: [PATCH 15/17] fix(fold): correct space index condition in emit_output The condition that decides whether the remembered space index falls inside the emitted prefix was rewritten from `idx + 1 <= consume` to `idx < consume`. For integer indices the two comparisons are equivalent, so this is a simplification with no change in folding behavior rather than an off-by-one fix: a space inside the emitted prefix is still dropped, and any later one is still rebased by `consume`. --- src/uu/fold/src/fold.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 4015947f978..9cf91a267de 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -310,7 +310,7 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { if ctx.spaces { *ctx.last_space = last_space.and_then(|idx| { - if idx + 1 <= consume { + if idx < consume { None } else { Some(idx - consume) From 7e17a51b485ff170ea6f7a211319f9e8e6679d56 Mon Sep 17 00:00:00 2001 From: mattsu Date: Tue, 4 Nov 2025 09:53:53 +0900 Subject: [PATCH 16/17] refactor(fold): optimize ASCII line processing for better character handling Refactor the `process_ascii_line` function to use a while loop with pattern matching instead of a for loop, improving efficiency and clarity.
Introduce `push_ascii_segment` to handle contiguous printable character sequences, ensuring accurate column counting and whitespace tracking in both columns and characters modes. This addresses potential issues with control character processing and width calculations. --- src/uu/fold/src/fold.rs | 137 +++++++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 52 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 9cf91a267de..88e7b95bb5b 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -323,70 +323,103 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { } fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { - for &byte in line { - if byte == NL { - *ctx.last_space = None; - emit_output(ctx)?; - break; - } + let mut idx = 0; + let len = line.len(); - if *ctx.col_count >= ctx.width { - emit_output(ctx)?; - } - - if byte == CR { - ctx.output.push(byte); - *ctx.col_count = 0; - continue; - } - - if byte == 0x08 { - ctx.output.push(byte); - *ctx.col_count = ctx.col_count.saturating_sub(1); - continue; - } - - if ctx.mode == WidthMode::Columns && byte == TAB { - loop { - let next_stop = next_tab_stop(*ctx.col_count); - if next_stop > ctx.width && !ctx.output.is_empty() { - emit_output(ctx)?; - continue; - } - *ctx.col_count = next_stop; + while idx < len { + match line[idx] { + NL => { + *ctx.last_space = None; + emit_output(ctx)?; break; } - if ctx.spaces { - *ctx.last_space = Some(ctx.output.len()); - } else { - *ctx.last_space = None; + CR => { + ctx.output.push(CR); + *ctx.col_count = 0; + idx += 1; + } + 0x08 => { + ctx.output.push(0x08); + *ctx.col_count = ctx.col_count.saturating_sub(1); + idx += 1; + } + TAB if ctx.mode == WidthMode::Columns => { + loop { + let next_stop = next_tab_stop(*ctx.col_count); + if next_stop > ctx.width && !ctx.output.is_empty() { + emit_output(ctx)?; + continue; + } + *ctx.col_count = next_stop; + break; + } + if ctx.spaces { + 
*ctx.last_space = Some(ctx.output.len()); + } else { + *ctx.last_space = None; + } + ctx.output.push(TAB); + idx += 1; + } + 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => { + ctx.output.push(line[idx]); + if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR { + *ctx.last_space = Some(ctx.output.len() - 1); + } else if !ctx.spaces { + *ctx.last_space = None; + } + idx += 1; + } + _ => { + let start = idx; + while idx < len + && !matches!( + line[idx], + NL | CR | TAB | 0x08 | 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F + ) + { + idx += 1; + } + push_ascii_segment(&line[start..idx], ctx)?; } - ctx.output.push(byte); - continue; } + } - let added = match ctx.mode { - WidthMode::Columns => match byte { - 0x00..=0x08 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => 0, - _ => 1, - }, - WidthMode::Characters => 1, - }; + Ok(()) +} - if ctx.mode == WidthMode::Columns - && added > 0 - && *ctx.col_count + added > ctx.width - && !ctx.output.is_empty() - { +fn push_ascii_segment(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { + if segment.is_empty() { + return Ok(()); + } + + let mut remaining = segment; + + while !remaining.is_empty() { + if *ctx.col_count >= ctx.width { emit_output(ctx)?; + continue; } - if ctx.spaces && byte.is_ascii_whitespace() && byte != CR { - *ctx.last_space = Some(ctx.output.len()); + let available = ctx.width - *ctx.col_count; + let take = remaining.len().min(available); + let base_len = ctx.output.len(); + + ctx.output.extend_from_slice(&remaining[..take]); + *ctx.col_count += take; + + if ctx.spaces { + if let Some(pos) = remaining[..take] + .iter() + .rposition(|b| b.is_ascii_whitespace() && *b != CR) + { + *ctx.last_space = Some(base_len + pos); + } + } else { + *ctx.last_space = None; } - ctx.output.push(byte); - *ctx.col_count = ctx.col_count.saturating_add(added); + remaining = &remaining[take..]; } Ok(()) From fbefe63537208f8feb24fca032df4531f99acf0c Mon Sep 17 00:00:00 2001 From: mattsu Date: Wed, 12 Nov 2025 
17:47:29 +0900 Subject: [PATCH 17/17] refactor(fold): unify UTF-8 and non-UTF-8 handling paths - Simplify fold_file by sharing FoldContext construction - Reduces duplicated code and improves maintainability without behavior change --- src/uu/fold/src/fold.rs | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 88e7b95bb5b..f14ed3cf071 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -577,28 +577,19 @@ fn fold_file( break; } - if let Ok(line_str) = std::str::from_utf8(&line) { - let mut ctx = FoldContext { - spaces, - width, - mode, - writer, - output: &mut output, - col_count: &mut col_count, - last_space: &mut last_space, - }; - process_utf8_line(line_str, &mut ctx)?; - } else { - let mut ctx = FoldContext { - spaces, - width, - mode, - writer, - output: &mut output, - col_count: &mut col_count, - last_space: &mut last_space, - }; - process_non_utf8_line(&line, &mut ctx)?; + let mut ctx = FoldContext { + spaces, + width, + mode, + writer, + output: &mut output, + col_count: &mut col_count, + last_space: &mut last_space, + }; + + match std::str::from_utf8(&line) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&line, &mut ctx)?, } line.clear();