From 2e75372c404fc41842fe8c07d5fb36cd74e16e57 Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 30 Mar 2023 11:27:04 -0700 Subject: [PATCH 1/6] remove utf8 crate dependency. --- Cargo.lock | 7 -- src/uu/wc/Cargo.toml | 1 - src/uu/wc/src/utf8/LICENSE | 25 +++++++ src/uu/wc/src/utf8/mod.rs | 124 +++++++++++++++++++++++++++++++++ src/uu/wc/src/utf8/read.rs | 136 +++++++++++++++++++++++++++++++++++++ src/uu/wc/src/wc.rs | 1 + 6 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 src/uu/wc/src/utf8/LICENSE create mode 100644 src/uu/wc/src/utf8/mod.rs create mode 100644 src/uu/wc/src/utf8/read.rs diff --git a/Cargo.lock b/Cargo.lock index 89d67bf8146..38032bcae3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2339,12 +2339,6 @@ dependencies = [ "log", ] -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - [[package]] name = "uu_arch" version = "0.0.17" @@ -3314,7 +3308,6 @@ dependencies = [ "libc", "nix", "unicode-width", - "utf-8", "uucore", ] diff --git a/src/uu/wc/Cargo.toml b/src/uu/wc/Cargo.toml index 40e0fd03b42..54f24a99b92 100644 --- a/src/uu/wc/Cargo.toml +++ b/src/uu/wc/Cargo.toml @@ -18,7 +18,6 @@ path = "src/wc.rs" clap = { workspace=true } uucore = { workspace=true, features=["pipes"] } bytecount = { workspace=true } -utf-8 = { workspace=true } unicode-width = { workspace=true } [target.'cfg(unix)'.dependencies] diff --git a/src/uu/wc/src/utf8/LICENSE b/src/uu/wc/src/utf8/LICENSE new file mode 100644 index 00000000000..1e031b3aa9c --- /dev/null +++ b/src/uu/wc/src/utf8/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) Simon Sapin and many others + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs new file mode 100644 index 00000000000..55db28ff476 --- /dev/null +++ b/src/uu/wc/src/utf8/mod.rs @@ -0,0 +1,124 @@ +mod read; + +pub use read::{BufReadDecoder, BufReadDecoderError}; + +use std::cmp; +use std::str; + +/// +/// Incremental, zero-copy UTF-8 decoding with error handling +/// +/// The original implemention was written by Simon Sapin in the utf-8 crate . +/// uu_wc used to depend on that crate. +/// The author archived the repository . +/// They suggested incorporating the source directly into uu_wc . +/// + +#[derive(Debug, Copy, Clone)] +pub struct Incomplete { + pub buffer: [u8; 4], + pub buffer_len: u8, +} + +impl Incomplete { + pub fn empty() -> Self { + Self { + buffer: [0, 0, 0, 0], + buffer_len: 0, + } + } + + pub fn is_empty(&self) -> bool { + self.buffer_len == 0 + } + + pub fn new(bytes: &[u8]) -> Self { + let mut buffer = [0, 0, 0, 0]; + let len = bytes.len(); + buffer[..len].copy_from_slice(bytes); + Self { + buffer, + buffer_len: len as u8, + } + } + + fn take_buffer(&mut self) -> &[u8] { + let len = self.buffer_len as usize; + self.buffer_len = 0; + &self.buffer[..len] + } + + /// (consumed_from_input, None): not enough input + /// (consumed_from_input, Some(Err(()))): error bytes in buffer + /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer + fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option>) { + let initial_buffer_len = self.buffer_len as usize; + let copied_from_input; + { + let unwritten = &mut self.buffer[initial_buffer_len..]; + copied_from_input = cmp::min(unwritten.len(), input.len()); + unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); + } + let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; + match str::from_utf8(spliced) { + Ok(_) => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, Some(Ok(()))) + } + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); + self.buffer_len = valid_up_to as u8; + (consumed, Some(Ok(()))) + } else { + match error.error_len() { + Some(invalid_sequence_length) => { + let consumed = invalid_sequence_length + .checked_sub(initial_buffer_len) + .unwrap(); + self.buffer_len = invalid_sequence_length as u8; + (consumed, Some(Err(()))) + } + None => { + self.buffer_len = spliced.len() as u8; + (copied_from_input, None) + } + } + } + } + } + } +} +#[cfg(test)] +mod test { + use std::collections::VecDeque; + use std::io; + + struct Chunks<'a>(VecDeque<&'a [u8]>); + + impl<'a> io::Read for Chunks<'a> { + fn read(&mut self, _: &mut [u8]) -> io::Result { + unimplemented!() + } + } + + impl<'a> io::BufRead for Chunks<'a> { + fn fill_buf(&mut self) -> io::Result<&[u8]> { + Ok(*self.0.front().unwrap()) + } + + fn consume(&mut self, bytes: usize) { + { + let front = self.0.front_mut().unwrap(); + *front = &front[bytes..]; + if !front.is_empty() { + return; + } + } + if self.0.len() > 1 { + self.0.pop_front(); + } + } + } +} diff --git a/src/uu/wc/src/utf8/read.rs b/src/uu/wc/src/utf8/read.rs new file mode 100644 index 00000000000..75247952443 --- /dev/null +++ b/src/uu/wc/src/utf8/read.rs @@ -0,0 +1,136 @@ +use super::*; +use std::error::Error; +use std::fmt; +use std::io::{self, BufRead}; +use std::str; + +/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. +pub struct BufReadDecoder { + buf_read: B, + bytes_consumed: usize, + incomplete: Incomplete, +} + +#[derive(Debug)] +pub enum BufReadDecoderError<'a> { + /// Represents one UTF-8 error in the byte stream. + /// + /// In lossy decoding, each such error should be replaced with U+FFFD. + /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) + InvalidByteSequence(&'a [u8]), + + /// An I/O error from the underlying byte stream + Io(io::Error), +} + +impl<'a> fmt::Display for BufReadDecoderError<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + BufReadDecoderError::InvalidByteSequence(bytes) => { + write!(f, "invalid byte sequence: {:02x?}", bytes) + } + BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err), + } + } +} + +impl<'a> Error for BufReadDecoderError<'a> { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match *self { + BufReadDecoderError::InvalidByteSequence(_) => None, + BufReadDecoderError::Io(ref err) => Some(err), + } + } +} + +impl BufReadDecoder { + pub fn new(buf_read: B) -> Self { + Self { + buf_read, + bytes_consumed: 0, + incomplete: Incomplete::empty(), + } + } + + /// Decode and consume the next chunk of UTF-8 input. + /// + /// This method is intended to be called repeatedly until it returns `None`, + /// which represents EOF from the underlying byte stream. + /// This is similar to `Iterator::next`, + /// except that decoded chunks borrow the decoder (~iterator) + /// so they need to be handled or copied before the next chunk can start decoding. + pub fn next_strict(&mut self) -> Option> { + enum BytesSource { + BufRead(usize), + Incomplete, + } + macro_rules! try_io { + ($io_result: expr) => { + match $io_result { + Ok(value) => value, + Err(error) => return Some(Err(BufReadDecoderError::Io(error))), + } + }; + } + let (source, result) = loop { + if self.bytes_consumed > 0 { + self.buf_read.consume(self.bytes_consumed); + self.bytes_consumed = 0; + } + let buf = try_io!(self.buf_read.fill_buf()); + + // Force loop iteration to go through an explicit `continue` + enum Unreachable {} + let _: Unreachable = if self.incomplete.is_empty() { + if buf.is_empty() { + return None; // EOF + } + match str::from_utf8(buf) { + Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())), + Err(error) => { + let valid_up_to = error.valid_up_to(); + if valid_up_to > 0 { + break (BytesSource::BufRead(valid_up_to), Ok(())); + } + match error.error_len() { + Some(invalid_sequence_length) => { + break (BytesSource::BufRead(invalid_sequence_length), Err(())) + } + None => { + self.bytes_consumed = buf.len(); + self.incomplete = Incomplete::new(buf); + // need more input bytes + continue; + } + } + } + } + } else { + if buf.is_empty() { + break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point + } + let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); + self.bytes_consumed = consumed; + match opt_result { + None => { + // need more input bytes + continue; + } + Some(result) => break (BytesSource::Incomplete, result), + } + }; + }; + let bytes = match source { + BytesSource::BufRead(byte_count) => { + self.bytes_consumed = byte_count; + let buf = try_io!(self.buf_read.fill_buf()); + &buf[..byte_count] + } + BytesSource::Incomplete => self.incomplete.take_buffer(), + }; + match result { + Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), + Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), + } + } +} diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs index 0b7b164a810..f2d30d58a9e 100644 --- a/src/uu/wc/src/wc.rs +++ b/src/uu/wc/src/wc.rs @@ -9,6 +9,7 @@ mod count_fast; mod countable; +mod utf8; mod word_count; use clap::builder::ValueParser; use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast}; From 858ed8b4f4af0e6335b4085206bf79d029cf7115 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Wed, 5 Apr 2023 10:54:26 +0200 Subject: [PATCH 2/6] ignore word Co-authored-by: Daniel Hofstetter --- src/uu/wc/src/utf8/LICENSE | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/wc/src/utf8/LICENSE b/src/uu/wc/src/utf8/LICENSE index 1e031b3aa9c..6f3c83e6872 100644 --- a/src/uu/wc/src/utf8/LICENSE +++ b/src/uu/wc/src/utf8/LICENSE @@ -1,3 +1,4 @@ +// spell-checker:ignore Sapin Copyright (c) Simon Sapin and many others Permission is hereby granted, free of charge, to any From b35f2ef9ef88e2ae058ddd10ca02a243f1505bf8 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Wed, 5 Apr 2023 10:54:38 +0200 Subject: [PATCH 3/6] fix typo Co-authored-by: Daniel Hofstetter --- src/uu/wc/src/utf8/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs index 55db28ff476..e23122c34ce 100644 --- a/src/uu/wc/src/utf8/mod.rs +++ b/src/uu/wc/src/utf8/mod.rs @@ -8,7 +8,7 @@ use std::str; /// /// Incremental, zero-copy UTF-8 decoding with error handling /// -/// The original implemention was written by Simon Sapin in the utf-8 crate . +/// The original implementation was written by Simon Sapin in the utf-8 crate . /// uu_wc used to depend on that crate. /// The author archived the repository . /// They suggested incorporating the source directly into uu_wc . From ab0c4f5949d777a2b6c4597be5454b248aba94e5 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Wed, 5 Apr 2023 10:54:53 +0200 Subject: [PATCH 4/6] ignore word Co-authored-by: Daniel Hofstetter --- src/uu/wc/src/utf8/read.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/wc/src/utf8/read.rs b/src/uu/wc/src/utf8/read.rs index 75247952443..79b8ad8aeb2 100644 --- a/src/uu/wc/src/utf8/read.rs +++ b/src/uu/wc/src/utf8/read.rs @@ -1,3 +1,4 @@ +// spell-checker:ignore bytestream use super::*; use std::error::Error; use std::fmt; From 018c85b7dd23071ee241f096936c7213233470e8 Mon Sep 17 00:00:00 2001 From: Daniel Hofstetter Date: Wed, 5 Apr 2023 11:06:57 +0200 Subject: [PATCH 5/6] ignore word --- src/uu/wc/src/utf8/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs index e23122c34ce..d43003fc31a 100644 --- a/src/uu/wc/src/utf8/mod.rs +++ b/src/uu/wc/src/utf8/mod.rs @@ -1,3 +1,4 @@ +// spell-checker:ignore Sapin mod read; pub use read::{BufReadDecoder, BufReadDecoderError}; From 1a9df8550cacb353ef3d9acc36245a7399c4ae44 Mon Sep 17 00:00:00 2001 From: Daniel Hofstetter Date: Fri, 14 Apr 2023 14:58:44 +0200 Subject: [PATCH 6/6] wc: remove "test" module without tests --- src/uu/wc/src/utf8/mod.rs | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs index d43003fc31a..31638e7589c 100644 --- a/src/uu/wc/src/utf8/mod.rs +++ b/src/uu/wc/src/utf8/mod.rs @@ -91,35 +91,3 @@ impl Incomplete { } } } -#[cfg(test)] -mod test { - use std::collections::VecDeque; - use std::io; - - struct Chunks<'a>(VecDeque<&'a [u8]>); - - impl<'a> io::Read for Chunks<'a> { - fn read(&mut self, _: &mut [u8]) -> io::Result { - unimplemented!() - } - } - - impl<'a> io::BufRead for Chunks<'a> { - fn fill_buf(&mut self) -> io::Result<&[u8]> { - Ok(*self.0.front().unwrap()) - } - - fn consume(&mut self, bytes: usize) { - { - let front = self.0.front_mut().unwrap(); - *front = &front[bytes..]; - if !front.is_empty() { - return; - } - } - if self.0.len() > 1 { - self.0.pop_front(); - } - } - } -}