-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
fold:fix gnu test fold-zero-width.sh #9274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6d721e5
b21cf35
77a1c31
b0e0033
ac09d10
dba5b9b
556804c
5424072
a1bbd40
d176e65
dbad8db
cd6f536
75c542f
662a011
3b07c8e
1d35753
eb7d8c4
b9b46be
5e1d7f5
e54a74b
5de4b28
0605e24
25b5c11
654c128
8aa2751
71f845c
895cdaa
7331d23
afbd393
3c86fbf
6cd6f50
30b53a8
2616981
96da2d4
9ab1271
fb6e73e
47fb94e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,10 @@ const TAB_WIDTH: usize = 8; | |
| const NL: u8 = b'\n'; | ||
| const CR: u8 = b'\r'; | ||
| const TAB: u8 = b'\t'; | ||
| // Implementation threshold (8 KiB) to prevent unbounded buffer growth during streaming. | ||
| // Chosen as a small, fixed cap: large enough to avoid excessive flushes, but | ||
| // small enough to keep memory bounded when the input has no fold points. | ||
| const STREAMING_FLUSH_THRESHOLD: usize = 8 * 1024; | ||
|
|
||
| mod options { | ||
| pub const BYTES: &str = "bytes"; | ||
|
|
@@ -288,6 +292,10 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { | |
| } | ||
|
|
||
| fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
| // Emit one folded line: | ||
| // - with `-s`, cut at the last remembered whitespace when possible | ||
| // - otherwise, cut at the current buffer end | ||
| // The remainder (if any) stays in the buffer for the next line. | ||
| let consume = match *ctx.last_space { | ||
| Some(index) => index + 1, | ||
| None => ctx.output.len(), | ||
|
|
@@ -309,6 +317,7 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | |
| *ctx.col_count = compute_col_count(ctx.output, ctx.mode); | ||
|
|
||
| if ctx.spaces { | ||
| // Rebase the remembered whitespace position into the remaining buffer. | ||
| *ctx.last_space = last_space.and_then(|idx| { | ||
| if idx < consume { | ||
| None | ||
|
|
@@ -322,6 +331,36 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | |
| Ok(()) | ||
| } | ||
|
|
||
| fn maybe_flush_unbroken_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please add some comments to explain what these functions are doing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add |
||
| // In streaming mode without `-s`, avoid unbounded buffering by periodically | ||
| // flushing long unbroken segments. With `-s` we must keep the buffer so we | ||
| // can still break at the last whitespace boundary. | ||
| if ctx.spaces || ctx.output.len() < STREAMING_FLUSH_THRESHOLD { | ||
| return Ok(()); | ||
| } | ||
|
|
||
| // Write raw bytes without inserting a newline; folding will continue | ||
| // based on updated column tracking in the caller. | ||
| ctx.writer.write_all(ctx.output)?; | ||
| ctx.output.clear(); | ||
| Ok(()) | ||
| } | ||
|
|
||
| fn push_byte<W: Write>(ctx: &mut FoldContext<'_, W>, byte: u8) -> UResult<()> { | ||
| // Append a single byte to the buffer. | ||
| ctx.output.push(byte); | ||
| maybe_flush_unbroken_output(ctx) | ||
| } | ||
|
|
||
| fn push_bytes<W: Write>(ctx: &mut FoldContext<'_, W>, bytes: &[u8]) -> UResult<()> { | ||
| // Append a byte slice to the buffer and flush if it grows too large. | ||
| if bytes.is_empty() { | ||
| return Ok(()); | ||
| } | ||
| ctx.output.extend_from_slice(bytes); | ||
| maybe_flush_unbroken_output(ctx) | ||
| } | ||
|
|
||
| fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
| let mut idx = 0; | ||
| let len = line.len(); | ||
|
|
@@ -331,15 +370,15 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR | |
| NL => { | ||
| *ctx.last_space = None; | ||
| emit_output(ctx)?; | ||
| break; | ||
| idx += 1; | ||
| } | ||
| CR => { | ||
| ctx.output.push(CR); | ||
| push_byte(ctx, CR)?; | ||
| *ctx.col_count = 0; | ||
| idx += 1; | ||
| } | ||
| 0x08 => { | ||
| ctx.output.push(0x08); | ||
| push_byte(ctx, 0x08)?; | ||
| *ctx.col_count = ctx.col_count.saturating_sub(1); | ||
| idx += 1; | ||
| } | ||
|
|
@@ -358,16 +397,23 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR | |
| } else { | ||
| *ctx.last_space = None; | ||
| } | ||
| ctx.output.push(TAB); | ||
| push_byte(ctx, TAB)?; | ||
| idx += 1; | ||
| } | ||
| 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => { | ||
| ctx.output.push(line[idx]); | ||
| push_byte(ctx, line[idx])?; | ||
| if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR { | ||
| *ctx.last_space = Some(ctx.output.len() - 1); | ||
| } else if !ctx.spaces { | ||
| *ctx.last_space = None; | ||
| } | ||
|
|
||
| if ctx.mode == WidthMode::Characters { | ||
| *ctx.col_count = ctx.col_count.saturating_add(1); | ||
| if *ctx.col_count >= ctx.width { | ||
| emit_output(ctx)?; | ||
| } | ||
| } | ||
| idx += 1; | ||
| } | ||
| _ => { | ||
|
|
@@ -405,7 +451,7 @@ fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| let take = remaining.len().min(available); | ||
| let base_len = ctx.output.len(); | ||
|
|
||
| ctx.output.extend_from_slice(&remaining[..take]); | ||
| push_bytes(ctx, &remaining[..take])?; | ||
| *ctx.col_count += take; | ||
|
|
||
| if ctx.spaces { | ||
|
|
@@ -430,16 +476,26 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| return process_ascii_line(line.as_bytes(), ctx); | ||
| } | ||
|
|
||
| process_utf8_chars(line, ctx) | ||
| } | ||
|
|
||
| fn process_utf8_chars<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
| let line_bytes = line.as_bytes(); | ||
| let mut iter = line.char_indices().peekable(); | ||
|
|
||
| while let Some((byte_idx, ch)) = iter.next() { | ||
mattsu2020 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // Include combining characters with the base character | ||
| while let Some(&(_, next_ch)) = iter.peek() { | ||
| if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 { | ||
| iter.next(); | ||
| } else { | ||
| break; | ||
| // Include combining characters with the base character when we are | ||
| // measuring by display columns. In character-counting mode every | ||
| // scalar value must advance the counter to match `chars().count()` | ||
| // semantics (see `fold_characters_reference` in the tests), so we do | ||
| // not coalesce zero-width scalars there. | ||
| if ctx.mode == WidthMode::Columns { | ||
| while let Some(&(_, next_ch)) = iter.peek() { | ||
| if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 { | ||
| iter.next(); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -448,23 +504,21 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| if ch == '\n' { | ||
| *ctx.last_space = None; | ||
| emit_output(ctx)?; | ||
| break; | ||
| continue; | ||
| } | ||
|
|
||
| if *ctx.col_count >= ctx.width { | ||
| emit_output(ctx)?; | ||
| } | ||
|
|
||
| if ch == '\r' { | ||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| *ctx.col_count = 0; | ||
| continue; | ||
| } | ||
|
|
||
| if ch == '\x08' { | ||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| *ctx.col_count = ctx.col_count.saturating_sub(1); | ||
| continue; | ||
| } | ||
|
|
@@ -484,8 +538,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| } else { | ||
| *ctx.last_space = None; | ||
| } | ||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| continue; | ||
| } | ||
|
|
||
|
|
@@ -506,8 +559,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| *ctx.last_space = Some(ctx.output.len()); | ||
| } | ||
|
|
||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| *ctx.col_count = ctx.col_count.saturating_add(added); | ||
| } | ||
|
|
||
|
|
@@ -519,7 +571,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| if byte == NL { | ||
| *ctx.last_space = None; | ||
| emit_output(ctx)?; | ||
| break; | ||
| continue; | ||
| } | ||
|
|
||
| if *ctx.col_count >= ctx.width { | ||
|
|
@@ -539,7 +591,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| } else { | ||
| None | ||
| }; | ||
| ctx.output.push(byte); | ||
| push_byte(ctx, byte)?; | ||
| continue; | ||
| } | ||
| 0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1), | ||
|
|
@@ -550,7 +602,46 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| _ => *ctx.col_count = ctx.col_count.saturating_add(1), | ||
| } | ||
|
|
||
| ctx.output.push(byte); | ||
| push_byte(ctx, byte)?; | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| /// Process buffered bytes, emitting output for valid UTF-8 prefixes and | ||
| /// deferring incomplete sequences until more input arrives. | ||
| /// | ||
| /// If the buffer contains invalid UTF-8, it is handled in non-UTF-8 mode and | ||
| /// the buffer is fully consumed. | ||
| fn process_pending_chunk<W: Write>( | ||
mattsu2020 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| pending: &mut Vec<u8>, | ||
| ctx: &mut FoldContext<'_, W>, | ||
| ) -> UResult<()> { | ||
| while !pending.is_empty() { | ||
| match std::str::from_utf8(pending) { | ||
| Ok(valid) => { | ||
| process_utf8_line(valid, ctx)?; | ||
| pending.clear(); | ||
| break; | ||
| } | ||
| Err(err) => { | ||
| if err.error_len().is_some() { | ||
| let res = process_non_utf8_line(pending, ctx); | ||
| pending.clear(); | ||
mattsu2020 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| res?; | ||
| break; | ||
| } | ||
|
|
||
| let valid_up_to = err.valid_up_to(); | ||
| if valid_up_to == 0 { | ||
| break; | ||
| } | ||
|
|
||
| let valid = std::str::from_utf8(&pending[..valid_up_to]).expect("valid prefix"); | ||
| process_utf8_line(valid, ctx)?; | ||
| pending.drain(..valid_up_to); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Ok(()) | ||
|
|
@@ -572,20 +663,12 @@ fn fold_file<T: Read, W: Write>( | |
| mode: WidthMode, | ||
| writer: &mut W, | ||
| ) -> UResult<()> { | ||
| let mut line = Vec::new(); | ||
| let mut output = Vec::new(); | ||
| let mut col_count = 0; | ||
| let mut last_space = None; | ||
| let mut pending = Vec::with_capacity(8 * 1024); | ||
|
|
||
| loop { | ||
| if file | ||
| .read_until(NL, &mut line) | ||
| .map_err_context(|| translate!("fold-error-readline"))? | ||
| == 0 | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| { | ||
| let mut ctx = FoldContext { | ||
| spaces, | ||
| width, | ||
|
|
@@ -596,17 +679,32 @@ fn fold_file<T: Read, W: Write>( | |
| last_space: &mut last_space, | ||
| }; | ||
|
|
||
| match std::str::from_utf8(&line) { | ||
| Ok(s) => process_utf8_line(s, &mut ctx)?, | ||
| Err(_) => process_non_utf8_line(&line, &mut ctx)?, | ||
| loop { | ||
| let buffer = file | ||
| .fill_buf() | ||
| .map_err_context(|| translate!("fold-error-readline"))?; | ||
| if buffer.is_empty() { | ||
| break; | ||
| } | ||
| pending.extend_from_slice(buffer); | ||
| let consumed = buffer.len(); | ||
| file.consume(consumed); | ||
|
|
||
| process_pending_chunk(&mut pending, &mut ctx)?; | ||
| } | ||
|
|
||
| line.clear(); | ||
| } | ||
| if !pending.is_empty() { | ||
| match std::str::from_utf8(&pending) { | ||
| Ok(s) => process_utf8_line(s, &mut ctx)?, | ||
| Err(_) => process_non_utf8_line(&pending, &mut ctx)?, | ||
| } | ||
| pending.clear(); | ||
| } | ||
|
|
||
| if !output.is_empty() { | ||
| writer.write_all(&output)?; | ||
| output.clear(); | ||
| if !ctx.output.is_empty() { | ||
| ctx.writer.write_all(ctx.output)?; | ||
| ctx.output.clear(); | ||
| } | ||
| } | ||
|
|
||
| Ok(()) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please document this magic number
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add