From 2e75372c404fc41842fe8c07d5fb36cd74e16e57 Mon Sep 17 00:00:00 2001
From: matt <git@publicmatt.com>
Date: Thu, 30 Mar 2023 11:27:04 -0700
Subject: [PATCH 1/6] remove utf8 crate dependency.

---
 Cargo.lock                 |   7 --
 src/uu/wc/Cargo.toml       |   1 -
 src/uu/wc/src/utf8/LICENSE |  25 +++++++
 src/uu/wc/src/utf8/mod.rs  | 124 +++++++++++++++++++++++++++++++++
 src/uu/wc/src/utf8/read.rs | 136 +++++++++++++++++++++++++++++++++++++
 src/uu/wc/src/wc.rs        |   1 +
 6 files changed, 286 insertions(+), 8 deletions(-)
 create mode 100644 src/uu/wc/src/utf8/LICENSE
 create mode 100644 src/uu/wc/src/utf8/mod.rs
 create mode 100644 src/uu/wc/src/utf8/read.rs

diff --git a/Cargo.lock b/Cargo.lock
index 89d67bf8146..38032bcae3c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2339,12 +2339,6 @@ dependencies = [
  "log",
 ]
 
-[[package]]
-name = "utf-8"
-version = "0.7.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
-
 [[package]]
 name = "uu_arch"
 version = "0.0.17"
@@ -3314,7 +3308,6 @@ dependencies = [
  "libc",
  "nix",
  "unicode-width",
- "utf-8",
  "uucore",
 ]
 
diff --git a/src/uu/wc/Cargo.toml b/src/uu/wc/Cargo.toml
index 40e0fd03b42..54f24a99b92 100644
--- a/src/uu/wc/Cargo.toml
+++ b/src/uu/wc/Cargo.toml
@@ -18,7 +18,6 @@ path = "src/wc.rs"
 clap = { workspace=true }
 uucore = { workspace=true, features=["pipes"] }
 bytecount = { workspace=true }
-utf-8 = { workspace=true }
 unicode-width = { workspace=true }
 
 [target.'cfg(unix)'.dependencies]
diff --git a/src/uu/wc/src/utf8/LICENSE b/src/uu/wc/src/utf8/LICENSE
new file mode 100644
index 00000000000..1e031b3aa9c
--- /dev/null
+++ b/src/uu/wc/src/utf8/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) Simon Sapin and many others
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs
new file mode 100644
index 00000000000..55db28ff476
--- /dev/null
+++ b/src/uu/wc/src/utf8/mod.rs
@@ -0,0 +1,124 @@
+mod read;
+
+pub use read::{BufReadDecoder, BufReadDecoderError};
+
+use std::cmp;
+use std::str;
+
+///
+/// Incremental, zero-copy UTF-8 decoding with error handling
+///
+/// The original implemention was written by Simon Sapin in the utf-8 crate <https://crates.io/crates/utf-8>.
+/// uu_wc used to depend on that crate.
+/// The author archived the repository <https://github.com/SimonSapin/rust-utf8>.
+/// They suggested incorporating the source directly into uu_wc <https://github.com/uutils/coreutils/issues/4289>.
+///
+
+#[derive(Debug, Copy, Clone)]
+pub struct Incomplete {
+    pub buffer: [u8; 4],
+    pub buffer_len: u8,
+}
+
+impl Incomplete {
+    pub fn empty() -> Self {
+        Self {
+            buffer: [0, 0, 0, 0],
+            buffer_len: 0,
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.buffer_len == 0
+    }
+
+    pub fn new(bytes: &[u8]) -> Self {
+        let mut buffer = [0, 0, 0, 0];
+        let len = bytes.len();
+        buffer[..len].copy_from_slice(bytes);
+        Self {
+            buffer,
+            buffer_len: len as u8,
+        }
+    }
+
+    fn take_buffer(&mut self) -> &[u8] {
+        let len = self.buffer_len as usize;
+        self.buffer_len = 0;
+        &self.buffer[..len]
+    }
+
+    /// (consumed_from_input, None): not enough input
+    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
+    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
+    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
+        let initial_buffer_len = self.buffer_len as usize;
+        let copied_from_input;
+        {
+            let unwritten = &mut self.buffer[initial_buffer_len..];
+            copied_from_input = cmp::min(unwritten.len(), input.len());
+            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
+        }
+        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
+        match str::from_utf8(spliced) {
+            Ok(_) => {
+                self.buffer_len = spliced.len() as u8;
+                (copied_from_input, Some(Ok(())))
+            }
+            Err(error) => {
+                let valid_up_to = error.valid_up_to();
+                if valid_up_to > 0 {
+                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
+                    self.buffer_len = valid_up_to as u8;
+                    (consumed, Some(Ok(())))
+                } else {
+                    match error.error_len() {
+                        Some(invalid_sequence_length) => {
+                            let consumed = invalid_sequence_length
+                                .checked_sub(initial_buffer_len)
+                                .unwrap();
+                            self.buffer_len = invalid_sequence_length as u8;
+                            (consumed, Some(Err(())))
+                        }
+                        None => {
+                            self.buffer_len = spliced.len() as u8;
+                            (copied_from_input, None)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+#[cfg(test)]
+mod test {
+    use std::collections::VecDeque;
+    use std::io;
+
+    struct Chunks<'a>(VecDeque<&'a [u8]>);
+
+    impl<'a> io::Read for Chunks<'a> {
+        fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
+            unimplemented!()
+        }
+    }
+
+    impl<'a> io::BufRead for Chunks<'a> {
+        fn fill_buf(&mut self) -> io::Result<&[u8]> {
+            Ok(*self.0.front().unwrap())
+        }
+
+        fn consume(&mut self, bytes: usize) {
+            {
+                let front = self.0.front_mut().unwrap();
+                *front = &front[bytes..];
+                if !front.is_empty() {
+                    return;
+                }
+            }
+            if self.0.len() > 1 {
+                self.0.pop_front();
+            }
+        }
+    }
+}
diff --git a/src/uu/wc/src/utf8/read.rs b/src/uu/wc/src/utf8/read.rs
new file mode 100644
index 00000000000..75247952443
--- /dev/null
+++ b/src/uu/wc/src/utf8/read.rs
@@ -0,0 +1,136 @@
+use super::*;
+use std::error::Error;
+use std::fmt;
+use std::io::{self, BufRead};
+use std::str;
+
+/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
+pub struct BufReadDecoder<B: BufRead> {
+    buf_read: B,
+    bytes_consumed: usize,
+    incomplete: Incomplete,
+}
+
+#[derive(Debug)]
+pub enum BufReadDecoderError<'a> {
+    /// Represents one UTF-8 error in the byte stream.
+    ///
+    /// In lossy decoding, each such error should be replaced with U+FFFD.
+    /// (No lossy helper is vendored here; see `BufReadDecoder::next_strict`.)
+    InvalidByteSequence(&'a [u8]),
+
+    /// An I/O error from the underlying byte stream
+    Io(io::Error),
+}
+
+impl<'a> fmt::Display for BufReadDecoderError<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            BufReadDecoderError::InvalidByteSequence(bytes) => {
+                write!(f, "invalid byte sequence: {:02x?}", bytes)
+            }
+            BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
+        }
+    }
+}
+
+impl<'a> Error for BufReadDecoderError<'a> {
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        match *self {
+            BufReadDecoderError::InvalidByteSequence(_) => None,
+            BufReadDecoderError::Io(ref err) => Some(err),
+        }
+    }
+}
+
+impl<B: BufRead> BufReadDecoder<B> {
+    pub fn new(buf_read: B) -> Self {
+        Self {
+            buf_read,
+            bytes_consumed: 0,
+            incomplete: Incomplete::empty(),
+        }
+    }
+
+    /// Decode and consume the next chunk of UTF-8 input.
+    ///
+    /// This method is intended to be called repeatedly until it returns `None`,
+    /// which represents EOF from the underlying byte stream.
+    /// This is similar to `Iterator::next`,
+    /// except that decoded chunks borrow the decoder (~iterator)
+    /// so they need to be handled or copied before the next chunk can start decoding.
+    pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
+        enum BytesSource {
+            BufRead(usize),
+            Incomplete,
+        }
+        macro_rules! try_io {
+            ($io_result: expr) => {
+                match $io_result {
+                    Ok(value) => value,
+                    Err(error) => return Some(Err(BufReadDecoderError::Io(error))),
+                }
+            };
+        }
+        let (source, result) = loop {
+            if self.bytes_consumed > 0 {
+                self.buf_read.consume(self.bytes_consumed);
+                self.bytes_consumed = 0;
+            }
+            let buf = try_io!(self.buf_read.fill_buf());
+
+            // Force loop iteration to go through an explicit `continue`
+            enum Unreachable {}
+            let _: Unreachable = if self.incomplete.is_empty() {
+                if buf.is_empty() {
+                    return None; // EOF
+                }
+                match str::from_utf8(buf) {
+                    Ok(_) => break (BytesSource::BufRead(buf.len()), Ok(())),
+                    Err(error) => {
+                        let valid_up_to = error.valid_up_to();
+                        if valid_up_to > 0 {
+                            break (BytesSource::BufRead(valid_up_to), Ok(()));
+                        }
+                        match error.error_len() {
+                            Some(invalid_sequence_length) => {
+                                break (BytesSource::BufRead(invalid_sequence_length), Err(()))
+                            }
+                            None => {
+                                self.bytes_consumed = buf.len();
+                                self.incomplete = Incomplete::new(buf);
+                                // need more input bytes
+                                continue;
+                            }
+                        }
+                    }
+                }
+            } else {
+                if buf.is_empty() {
+                    break (BytesSource::Incomplete, Err(())); // EOF with incomplete code point
+                }
+                let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
+                self.bytes_consumed = consumed;
+                match opt_result {
+                    None => {
+                        // need more input bytes
+                        continue;
+                    }
+                    Some(result) => break (BytesSource::Incomplete, result),
+                }
+            };
+        };
+        let bytes = match source {
+            BytesSource::BufRead(byte_count) => {
+                self.bytes_consumed = byte_count;
+                let buf = try_io!(self.buf_read.fill_buf());
+                &buf[..byte_count]
+            }
+            BytesSource::Incomplete => self.incomplete.take_buffer(),
+        };
+        match result {
+            Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
+            Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
+        }
+    }
+}
diff --git a/src/uu/wc/src/wc.rs b/src/uu/wc/src/wc.rs
index 0b7b164a810..f2d30d58a9e 100644
--- a/src/uu/wc/src/wc.rs
+++ b/src/uu/wc/src/wc.rs
@@ -9,6 +9,7 @@
 
 mod count_fast;
 mod countable;
+mod utf8;
 mod word_count;
 use clap::builder::ValueParser;
 use count_fast::{count_bytes_chars_and_lines_fast, count_bytes_fast};

From 858ed8b4f4af0e6335b4085206bf79d029cf7115 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sledru@mozilla.com>
Date: Wed, 5 Apr 2023 10:54:26 +0200
Subject: [PATCH 2/6] ignore word

Co-authored-by: Daniel Hofstetter <daniel.hofstetter@42dh.com>
---
 src/uu/wc/src/utf8/LICENSE | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/uu/wc/src/utf8/LICENSE b/src/uu/wc/src/utf8/LICENSE
index 1e031b3aa9c..6f3c83e6872 100644
--- a/src/uu/wc/src/utf8/LICENSE
+++ b/src/uu/wc/src/utf8/LICENSE
@@ -1,3 +1,4 @@
+// spell-checker:ignore Sapin
 Copyright (c) Simon Sapin and many others
 
 Permission is hereby granted, free of charge, to any

From b35f2ef9ef88e2ae058ddd10ca02a243f1505bf8 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sledru@mozilla.com>
Date: Wed, 5 Apr 2023 10:54:38 +0200
Subject: [PATCH 3/6] fix typo

Co-authored-by: Daniel Hofstetter <daniel.hofstetter@42dh.com>
---
 src/uu/wc/src/utf8/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs
index 55db28ff476..e23122c34ce 100644
--- a/src/uu/wc/src/utf8/mod.rs
+++ b/src/uu/wc/src/utf8/mod.rs
@@ -8,7 +8,7 @@ use std::str;
 ///
 /// Incremental, zero-copy UTF-8 decoding with error handling
 ///
-/// The original implemention was written by Simon Sapin in the utf-8 crate <https://crates.io/crates/utf-8>.
+/// The original implementation was written by Simon Sapin in the utf-8 crate <https://crates.io/crates/utf-8>.
 /// uu_wc used to depend on that crate.
 /// The author archived the repository <https://github.com/SimonSapin/rust-utf8>.
 /// They suggested incorporating the source directly into uu_wc <https://github.com/uutils/coreutils/issues/4289>.

From ab0c4f5949d777a2b6c4597be5454b248aba94e5 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sledru@mozilla.com>
Date: Wed, 5 Apr 2023 10:54:53 +0200
Subject: [PATCH 4/6] ignore word

Co-authored-by: Daniel Hofstetter <daniel.hofstetter@42dh.com>
---
 src/uu/wc/src/utf8/read.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/uu/wc/src/utf8/read.rs b/src/uu/wc/src/utf8/read.rs
index 75247952443..79b8ad8aeb2 100644
--- a/src/uu/wc/src/utf8/read.rs
+++ b/src/uu/wc/src/utf8/read.rs
@@ -1,3 +1,4 @@
+// spell-checker:ignore bytestream
 use super::*;
 use std::error::Error;
 use std::fmt;

From 018c85b7dd23071ee241f096936c7213233470e8 Mon Sep 17 00:00:00 2001
From: Daniel Hofstetter <daniel.hofstetter@42dh.com>
Date: Wed, 5 Apr 2023 11:06:57 +0200
Subject: [PATCH 5/6] ignore word

---
 src/uu/wc/src/utf8/mod.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs
index e23122c34ce..d43003fc31a 100644
--- a/src/uu/wc/src/utf8/mod.rs
+++ b/src/uu/wc/src/utf8/mod.rs
@@ -1,3 +1,4 @@
+// spell-checker:ignore Sapin
 mod read;
 
 pub use read::{BufReadDecoder, BufReadDecoderError};

From 1a9df8550cacb353ef3d9acc36245a7399c4ae44 Mon Sep 17 00:00:00 2001
From: Daniel Hofstetter <daniel.hofstetter@42dh.com>
Date: Fri, 14 Apr 2023 14:58:44 +0200
Subject: [PATCH 6/6] wc: remove "test" module without tests

---
 src/uu/wc/src/utf8/mod.rs | 32 --------------------------------
 1 file changed, 32 deletions(-)

diff --git a/src/uu/wc/src/utf8/mod.rs b/src/uu/wc/src/utf8/mod.rs
index d43003fc31a..31638e7589c 100644
--- a/src/uu/wc/src/utf8/mod.rs
+++ b/src/uu/wc/src/utf8/mod.rs
@@ -91,35 +91,3 @@ impl Incomplete {
         }
     }
 }
-#[cfg(test)]
-mod test {
-    use std::collections::VecDeque;
-    use std::io;
-
-    struct Chunks<'a>(VecDeque<&'a [u8]>);
-
-    impl<'a> io::Read for Chunks<'a> {
-        fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
-            unimplemented!()
-        }
-    }
-
-    impl<'a> io::BufRead for Chunks<'a> {
-        fn fill_buf(&mut self) -> io::Result<&[u8]> {
-            Ok(*self.0.front().unwrap())
-        }
-
-        fn consume(&mut self, bytes: usize) {
-            {
-                let front = self.0.front_mut().unwrap();
-                *front = &front[bytes..];
-                if !front.is_empty() {
-                    return;
-                }
-            }
-            if self.0.len() > 1 {
-                self.0.pop_front();
-            }
-        }
-    }
-}