From 21fe060d8a76c06066a339bbf1884290cc1f9d75 Mon Sep 17 00:00:00 2001
From: Daniel Szoke <daniel.szoke@sentry.io>
Date: Fri, 10 Jan 2025 14:51:47 +0100
Subject: [PATCH 1/6] ref(sourcebundle): Check UTF-8 validity memory
 efficiently

The current check to ensure a sourcebundle is valid UTF-8 reads the entire sourcebundle file into memory. This is inefficient for large files.

This PR introduces a UTF8Reader which wraps any reader. The UTF8Reader ensures that the stream is valid UTF8 as it is being read, while only requiring a small amount of memory (currently 8 KiB) to be allocated as a buffer.
---
 CHANGELOG.md                                  |   4 +
 .../{sourcebundle.rs => sourcebundle/mod.rs}  |  36 ++-
 .../src/sourcebundle/utf8_reader.rs           | 295 ++++++++++++++++++
 3 files changed, 322 insertions(+), 13 deletions(-)
 rename symbolic-debuginfo/src/{sourcebundle.rs => sourcebundle/mod.rs} (98%)
 create mode 100644 symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f6b57e0a..ca211b388 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## 12.13.2
 
+**Improvements**
+
+- Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890))
+
 **Fixes**
 
 - Fixed GHA-based Windows builds ([#891](https://github.com/getsentry/symbolic/pull/891)).
diff --git a/symbolic-debuginfo/src/sourcebundle.rs b/symbolic-debuginfo/src/sourcebundle/mod.rs
similarity index 98%
rename from symbolic-debuginfo/src/sourcebundle.rs
rename to symbolic-debuginfo/src/sourcebundle/mod.rs
index 1dbce62c7..7d6cb6050 100644
--- a/symbolic-debuginfo/src/sourcebundle.rs
+++ b/symbolic-debuginfo/src/sourcebundle/mod.rs
@@ -42,15 +42,17 @@
 //! bundle a file entry has a `url` and might carry `headers` or individual debug IDs
 //! per source file.
 
+mod utf8_reader;
+
 use std::borrow::Cow;
 use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::error::Error;
-use std::fmt;
 use std::fmt::{Display, Formatter};
 use std::fs::{File, OpenOptions};
-use std::io::{BufReader, BufWriter, Read, Seek, Write};
+use std::io::{BufReader, BufWriter, ErrorKind, Read, Seek, Write};
 use std::path::Path;
 use std::sync::Arc;
+use std::{fmt, io};
 
 use parking_lot::Mutex;
 use regex::Regex;
@@ -60,6 +62,7 @@ use zip::{write::SimpleFileOptions, ZipWriter};
 
 use symbolic_common::{Arch, AsSelf, CodeId, DebugId, SourceLinkMappings};
 
+use self::utf8_reader::Utf8Reader;
 use crate::base::*;
 use crate::js::{
     discover_debug_id, discover_sourcemap_embedded_debug_id, discover_sourcemaps_location,
@@ -1217,18 +1220,14 @@ where
     pub fn add_file<S, R>(
         &mut self,
         path: S,
-        mut file: R,
+        file: R,
         info: SourceFileInfo,
     ) -> Result<(), SourceBundleError>
     where
         S: AsRef<str>,
         R: Read,
     {
-        let mut buf = String::new();
-
-        if let Err(e) = file.read_to_string(&mut buf) {
-            return Err(SourceBundleError::new(SourceBundleErrorKind::ReadFailed, e));
-        }
+        let mut file_reader = Utf8Reader::new(file);
 
         let full_path = self.file_path(path.as_ref());
         let unique_path = self.unique_path(full_path);
@@ -1236,12 +1235,23 @@ where
         self.writer
             .start_file(unique_path.clone(), default_file_options())
             .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
-        self.writer
-            .write_all(buf.as_bytes())
-            .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
 
-        self.manifest.files.insert(unique_path, info);
-        Ok(())
+        if let Err(e) = io::copy(&mut file_reader, &mut self.writer) {
+            self.writer
+                .abort_file()
+                .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
+
+            // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8.
+            let error_kind = match e.kind() {
+                ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed,
+                _ => SourceBundleErrorKind::WriteFailed,
+            };
+
+            Err(SourceBundleError::new(error_kind, e))
+        } else {
+            self.manifest.files.insert(unique_path, info);
+            Ok(())
+        }
     }
 
     /// Calls add_file, and handles any ReadFailed errors by calling the skipped_file_callback.
diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
new file mode 100644
index 000000000..dba964c39
--- /dev/null
+++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
@@ -0,0 +1,295 @@
+//! UTF-8 reader used by the sourcebundle module to read files.
+
+use std::io::{BufRead, Error, ErrorKind, Read, Result};
+use thiserror::Error;
+
+const MAX_UTF8_SEQUENCE_SIZE: usize = 4;
+const MAX_BUFFER_SIZE: usize = 8 * 1024;
+
+#[derive(Debug, Error)]
+#[error("Invalid UTF-8 sequence")]
+pub(crate) struct UTF8ReaderError;
+
+pub struct Utf8Reader<R> {
+    inner: R,
+
+    /// buffer_array[..buffer_end] always contains a valid UTF-8 sequence. It is possible that
+    /// buffer_array[buffer_start..buffer_end] does not contain a valid UTF-8 sequence, if
+    /// buffer_start is in the middle of a multi-byte UTF-8 character, which would happen if that
+    /// character has only partially been read.
+    buffer_array: Box<[u8; MAX_BUFFER_SIZE]>,
+    buffer_start: usize,
+    buffer_end: usize,
+}
+
+impl<R> Utf8Reader<R> {
+    pub fn new(inner: R) -> Self {
+        Self {
+            inner,
+            buffer_array: Box::new([0; MAX_BUFFER_SIZE]),
+            buffer_start: 0,
+            buffer_end: 0,
+        }
+    }
+
+    fn buffer(&self) -> &[u8] {
+        &self.buffer_array[self.buffer_start..self.buffer_end]
+    }
+}
+
+impl<R> Utf8Reader<R> where R: Read {}
+
+impl<R> Read for Utf8Reader<R>
+where
+    R: Read,
+{
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        if self.buffer().is_empty() && buf.len() > MAX_BUFFER_SIZE {
+            // If buf is bigger than our internal buffer, and there is nothing left in
+            // our internal buffer, just read directly into buf.
+            return read_utf8(&mut self.inner, buf);
+        }
+
+        self.fill_buf()?;
+
+        let bytes_to_copy = std::cmp::min(buf.len(), self.buffer().len());
+        buf[..bytes_to_copy].copy_from_slice(&self.buffer()[..bytes_to_copy]);
+        self.consume(bytes_to_copy);
+
+        Ok(bytes_to_copy)
+    }
+}
+
+impl<R> BufRead for Utf8Reader<R>
+where
+    R: Read,
+{
+    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
+        if !self.buffer().is_empty() {
+            return Ok(self.buffer());
+        }
+
+        let bytes_read = read_utf8(&mut self.inner, &mut self.buffer_array[..])?;
+        self.buffer_start = 0;
+        self.buffer_end = bytes_read;
+
+        Ok(self.buffer())
+    }
+
+    fn consume(&mut self, amt: usize) {
+        self.buffer_start += amt;
+    }
+}
+
+/// Reads a UTF-8 sequence from the inner reader into the buffer, returning the number
+/// of bytes read. Errors if this is not possible.
+///
+/// The function guarantees that it will return an Ok variant with a positive
+/// number of bytes read if it is possible to read a valid UTF-8 sequence from the inner
+/// reader. The function also guarantees that all of the bytes read from the inner reader will
+/// be stored in in the buffer, at indices up to the value returned by the function (in other
+/// words, no bytes are lost).
+///
+/// Panics if the buffer is not at least `MAX_UTF8_SEQUENCE_SIZE` bytes long.
+fn read_utf8(reader: &mut impl Read, buf: &mut [u8]) -> Result<usize> {
+    if buf.len() < MAX_UTF8_SEQUENCE_SIZE {
+        panic!("Buffer needs to be at least {MAX_UTF8_SEQUENCE_SIZE} bytes long");
+    }
+
+    // We need to leave at least three bytes in the buffer in case the first read
+    // ends in the middle of a UTF-8 character. In the worst case, we end at the first
+    // byte of a 4-byte UTF-8 character, and we need to read 3 more bytes to get a
+    // valid sequence.
+    let bytes_to_read = buf.len() - MAX_UTF8_SEQUENCE_SIZE + 1;
+    let read_buf = &mut buf[..bytes_to_read];
+
+    let bytes_read = reader.read(read_buf)?;
+    let read_portion = &buf[..bytes_read];
+
+    let valid_up_to = utf8_up_to(read_portion);
+    let invalid_portion_len = read_portion.len() - valid_up_to;
+
+    if invalid_portion_len >= MAX_UTF8_SEQUENCE_SIZE {
+        return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError));
+    }
+
+    // read_until_utf8 will not read anything if the buffer is empty,
+    // since an empty buffer is a valid UTF-8 sequence.
+    let bytes_until_utf8 = read_until_utf8(reader, &mut buf[valid_up_to..], invalid_portion_len)?;
+    Ok(valid_up_to + bytes_until_utf8)
+}
+
+/// Reads a single byte at a time from the inner reader into the buffer, starting from
+/// current_index, until either the buffer contains a valid UTF-8 sequence, the reader is
+/// exhausted, or the 4 bytes total have been read into the buffer (the maximum size of a
+/// UTF-8 sequence). The 4 bytes also includes bytes already in the buffer when the function is
+/// called, as indicated by the current_index parameter.
+///
+/// If the buffer is empty (i.e. current_index is 0), we will return Ok(0) without reading
+/// anything from the reader. An empty byte sequence is a valid UTF-8 sequence (the
+/// empty string).
+///
+/// Returns the number of bytes read into the buffer (including bytes already in the buffer
+/// when the function is called), or an error if the reader errors or if reading a valid UTF-8
+/// sequence is not possible.
+///
+/// The buffer must have a length of at least 4, otherwise this function can panic.
+fn read_until_utf8(
+    reader: &mut impl Read,
+    buffer: &mut [u8],
+    mut current_index: usize,
+) -> Result<usize> {
+    while std::str::from_utf8(&buffer[..current_index]).is_err() {
+        if current_index >= MAX_UTF8_SEQUENCE_SIZE {
+            // We already have 4 bytes in the buffer (maximum UTF-8 sequence size)
+            // so we cannot form a valid UTF-8 sequence by reading more bytes.
+            return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError));
+        }
+
+        if reader.read(&mut buffer[current_index..current_index + 1])? == 0 {
+            // Stream has been exhausted without finding
+            // a valid UTF-8 sequence.
+            return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError));
+        }
+
+        current_index += 1;
+    }
+
+    Ok(current_index)
+}
+
+/// Returns the index of the first invalid UTF-8 sequence
+/// in the given bytes. If the sequence is valid, returns the
+/// length of the bytes.
+fn utf8_up_to(bytes: &[u8]) -> usize {
+    match std::str::from_utf8(bytes) {
+        Ok(_) => bytes.len(),
+        Err(e) => e.valid_up_to(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::io::Cursor;
+
+    #[test]
+    fn test_read_empty() {
+        let mut empty_reader = Utf8Reader::new(Cursor::new(b""));
+
+        let mut buf = vec![];
+        empty_reader
+            .read_to_end(&mut buf)
+            .expect("read_to_end errored");
+
+        assert_eq!(buf, b"");
+    }
+
+    #[test]
+    fn test_read_ascii_simple() {
+        let mut reader = Utf8Reader::new(Cursor::new(b"Hello, world!"));
+
+        let mut buf = vec![];
+        reader.read_to_end(&mut buf).expect("read_to_end errored");
+
+        assert_eq!(buf, b"Hello, world!");
+    }
+
+    #[test]
+    fn test_read_utf8_simple() {
+        const HELLO_WORLD: &str = "你好世界！";
+        let mut reader = Utf8Reader::new(Cursor::new(HELLO_WORLD.as_bytes()));
+
+        let mut buf = vec![];
+        reader.read_to_end(&mut buf).expect("read_to_end errored");
+
+        assert_eq!(buf, HELLO_WORLD.as_bytes());
+    }
+
+    #[test]
+    fn multibyte_character_at_end_of_buffer() {
+        // Having a multibyte character at the end of the buffer will cause us to hit
+        // read_until_utf8.
+        let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE];
+        read_buffer.extend("🙂".as_bytes());
+
+        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
+
+        let mut buf = [0; MAX_BUFFER_SIZE];
+        let bytes_read = reader.read(&mut buf).expect("read errored");
+
+        // We expect the buffer to be filled with the read bytes. We first read all but the last
+        // three bytes. But, we will need to fill these three bytes to get a valid UTF-8 sequence,
+        // since "🙂" is a 4-byte UTF-8 sequence.
+        assert_eq!(bytes_read, buf.len(), "buffer not filled");
+        assert_eq!(&buf[..], read_buffer);
+    }
+
+    #[test]
+    fn multibyte_character_at_end_of_big_read() {
+        // Big reads bypass buffering, so basically, we retest multibyte_character_at_end_of_buffer
+        // for this case.
+        let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE + 10];
+        read_buffer.extend("🙂".as_bytes());
+
+        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
+
+        let mut buf = [0; MAX_BUFFER_SIZE + MAX_UTF8_SEQUENCE_SIZE + 10];
+        let bytes_read = reader.read(&mut buf).expect("read errored");
+
+        assert_eq!(bytes_read, buf.len(), "buffer not filled");
+        assert_eq!(&buf[..], read_buffer);
+    }
+
+    #[test]
+    fn small_reads_splitting_sequence() {
+        let mut reader = Utf8Reader::new(Cursor::new("🙂".as_bytes()));
+
+        let mut buf = [0; MAX_UTF8_SEQUENCE_SIZE];
+
+        for i in 0..MAX_UTF8_SEQUENCE_SIZE {
+            // Read at most one byte at a time.
+            let bytes_read = reader.read(&mut buf[i..i + 1]).expect("read errored");
+            assert_eq!(bytes_read, 1, "bytes read");
+        }
+
+        assert_eq!(&buf[..], "🙂".as_bytes());
+    }
+
+    #[test]
+    fn invalid_utf8_sequence() {
+        let mut reader = Utf8Reader::new(Cursor::new([0b11111111]));
+
+        let mut buf = [0; 1];
+        reader.read(&mut buf).expect_err("read should have errored");
+    }
+
+    #[test]
+    fn invalid_utf8_sequence_at_end_of_reader() {
+        let mut read_buffer = Vec::from(b"Hello, world!");
+
+        // Cutting off the last byte will invalidate the UTF-8 sequence.
+        let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1];
+        read_buffer.extend(invalid_sequence);
+
+        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
+        reader
+            .read_to_end(&mut vec![])
+            .expect_err("read should have errored");
+    }
+
+    #[test]
+    fn invalid_utf8_sequence_at_end_of_reader_and_buffer() {
+        let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE];
+
+        // Cutting off the last byte will invalidate the UTF-8 sequence.
+        let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1];
+        read_buffer.extend(invalid_sequence);
+
+        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
+
+        let mut buf = [0; MAX_BUFFER_SIZE];
+        reader.read(&mut buf).expect_err("read should have errored");
+    }
+}

From 4f6f0c57ce4d304fa11407d343785ce82a1a7ab7 Mon Sep 17 00:00:00 2001
From: Daniel Szoke <daniel.szoke@sentry.io>
Date: Thu, 16 Jan 2025 11:59:24 +0100
Subject: [PATCH 2/6] ref: `Utf8Reader` no longer is buffering

---
 .../src/sourcebundle/utf8_reader.rs           | 253 +++++++++---------
 1 file changed, 120 insertions(+), 133 deletions(-)

diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
index dba964c39..7d95916e1 100644
--- a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
+++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
@@ -1,10 +1,12 @@
 //! UTF-8 reader used by the sourcebundle module to read files.
 
-use std::io::{BufRead, Error, ErrorKind, Read, Result};
+use std::cmp;
+use std::io::{Error, ErrorKind, Read, Result};
+use std::str;
+
 use thiserror::Error;
 
 const MAX_UTF8_SEQUENCE_SIZE: usize = 4;
-const MAX_BUFFER_SIZE: usize = 8 * 1024;
 
 #[derive(Debug, Error)]
 #[error("Invalid UTF-8 sequence")]
@@ -13,12 +15,17 @@ pub(crate) struct UTF8ReaderError;
 pub struct Utf8Reader<R> {
     inner: R,
 
-    /// buffer_array[..buffer_end] always contains a valid UTF-8 sequence. It is possible that
-    /// buffer_array[buffer_start..buffer_end] does not contain a valid UTF-8 sequence, if
-    /// buffer_start is in the middle of a multi-byte UTF-8 character, which would happen if that
-    /// character has only partially been read.
-    buffer_array: Box<[u8; MAX_BUFFER_SIZE]>,
+    /// A buffer of `MAX_UTF8_SEQUENCE_SIZE` bytes, which we use when the end of a read is not a
+    /// valid UTF-8 sequence. We read into this buffer until we have a valid UTF-8 sequence, or
+    /// the reader is exhausted, or the buffer is full. Assuming we get a valid UTF-8 sequence,
+    /// we then on next read read from this buffer.
+    buffer: [u8; MAX_UTF8_SEQUENCE_SIZE],
+
+    /// The index of the first byte in the buffer that has not been read.
     buffer_start: usize,
+
+    /// The index of the last byte in the buffer that is part of the UTF-8 sequence. We only read
+    /// up to this index, bytes after this index are discarded.
     buffer_end: usize,
 }
 
@@ -26,97 +33,92 @@ impl<R> Utf8Reader<R> {
     pub fn new(inner: R) -> Self {
         Self {
             inner,
-            buffer_array: Box::new([0; MAX_BUFFER_SIZE]),
+            buffer: [0; MAX_UTF8_SEQUENCE_SIZE],
             buffer_start: 0,
             buffer_end: 0,
         }
     }
 
-    fn buffer(&self) -> &[u8] {
-        &self.buffer_array[self.buffer_start..self.buffer_end]
+    /// The slice of the buffer that should be read next.
+    fn buffer_to_read(&self) -> &[u8] {
+        &self.buffer[self.buffer_start..self.buffer_end]
     }
-}
-
-impl<R> Utf8Reader<R> where R: Read {}
-
-impl<R> Read for Utf8Reader<R>
-where
-    R: Read,
-{
-    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
-        if self.buffer().is_empty() && buf.len() > MAX_BUFFER_SIZE {
-            // If buf is bigger than our internal buffer, and there is nothing left in
-            // our internal buffer, just read directly into buf.
-            return read_utf8(&mut self.inner, buf);
-        }
 
-        self.fill_buf()?;
+    /// Advances the buffer start index by the given amount.
+    fn advance(&mut self, amt: usize) {
+        self.buffer_start += amt;
+    }
 
-        let bytes_to_copy = std::cmp::min(buf.len(), self.buffer().len());
-        buf[..bytes_to_copy].copy_from_slice(&self.buffer()[..bytes_to_copy]);
-        self.consume(bytes_to_copy);
+    /// Reads bytes from the buffer into the given buffer.
+    fn read_from_buffer(&mut self, buf: &mut [u8]) -> Result<usize> {
+        let bytes_copied = slice_copy(buf, self.buffer_to_read());
+        self.advance(bytes_copied);
 
-        Ok(bytes_to_copy)
+        Ok(bytes_copied)
     }
 }
 
-impl<R> BufRead for Utf8Reader<R>
+impl<R> Utf8Reader<R>
 where
     R: Read,
 {
-    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
-        if !self.buffer().is_empty() {
-            return Ok(self.buffer());
-        }
+    /// Reads bytes from the inner reader into the given buffer, if needed, filling self.buffer
+    /// with the remaining bytes of the UTF-8 sequence at the end of the buffer, which did not fit
+    /// in the read.
+    fn read_from_inner(&mut self, buf: &mut [u8]) -> Result<usize> {
+        let read_from_inner = self.inner.read(buf)?;
+        let read_portion = &buf[..read_from_inner];
 
-        let bytes_read = read_utf8(&mut self.inner, &mut self.buffer_array[..])?;
-        self.buffer_start = 0;
-        self.buffer_end = bytes_read;
+        let invalid_portion = ending_incomplete_utf8_sequence(read_portion)?;
 
-        Ok(self.buffer())
-    }
+        slice_copy(&mut self.buffer, invalid_portion);
+        self.read_into_buffer_until_utf8(invalid_portion.len())?;
 
-    fn consume(&mut self, amt: usize) {
-        self.buffer_start += amt;
+        Ok(read_from_inner)
     }
-}
 
-/// Reads a UTF-8 sequence from the inner reader into the buffer, returning the number
-/// of bytes read. Errors if this is not possible.
-///
-/// The function guarantees that it will return an Ok variant with a positive
-/// number of bytes read if it is possible to read a valid UTF-8 sequence from the inner
-/// reader. The function also guarantees that all of the bytes read from the inner reader will
-/// be stored in in the buffer, at indices up to the value returned by the function (in other
-/// words, no bytes are lost).
-///
-/// Panics if the buffer is not at least `MAX_UTF8_SEQUENCE_SIZE` bytes long.
-fn read_utf8(reader: &mut impl Read, buf: &mut [u8]) -> Result<usize> {
-    if buf.len() < MAX_UTF8_SEQUENCE_SIZE {
-        panic!("Buffer needs to be at least {MAX_UTF8_SEQUENCE_SIZE} bytes long");
+    /// Reads bytes from the inner reader into self.buffer until the buffer contains a valid UTF-8
+    /// sequence.
+    ///
+    /// Before calling this method, self.buffer[..start_index] should contain the bytes that are
+    /// part of the incomplete UTF-8 sequence that we are trying to complete (or, it should be
+    /// empty, in which case, this function is a no-op)
+    ///
+    /// Then, we read one byte at a time from the inner reader into self.buffer, until we have a
+    /// valid UTF-8 sequence.
+    ///
+    /// Lastly, we set self.buffer_start to start_index (so we don't reread the bytes that started
+    /// the incomplete UTF-8 sequence) and self.buffer_end to the index of the last byte in the
+    /// buffer that is part of the UTF-8 sequence.
+    ///
+    /// The next read from the Utf8Reader will read from
+    /// self.buffer[self.buffer_start..self.buffer_end].
+    fn read_into_buffer_until_utf8(&mut self, start_index: usize) -> Result<()> {
+        let bytes_until_utf8 = read_until_utf8(&mut self.inner, &mut self.buffer, start_index)?;
+
+        self.buffer_start = start_index;
+        self.buffer_end = bytes_until_utf8;
+
+        Ok(())
     }
+}
 
-    // We need to leave at least three bytes in the buffer in case the first read
-    // ends in the middle of a UTF-8 character. In the worst case, we end at the first
-    // byte of a 4-byte UTF-8 character, and we need to read 3 more bytes to get a
-    // valid sequence.
-    let bytes_to_read = buf.len() - MAX_UTF8_SEQUENCE_SIZE + 1;
-    let read_buf = &mut buf[..bytes_to_read];
-
-    let bytes_read = reader.read(read_buf)?;
-    let read_portion = &buf[..bytes_read];
+impl<R> Read for Utf8Reader<R>
+where
+    R: Read,
+{
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
+        let read_from_buffer = self.read_from_buffer(buf)?;
 
-    let valid_up_to = utf8_up_to(read_portion);
-    let invalid_portion_len = read_portion.len() - valid_up_to;
+        // self.read_from_inner overwrites self.buffer, so we can only call it if we have read
+        // everything from the buffer.
+        let read_from_inner = match self.buffer_to_read() {
+            [] => self.read_from_inner(&mut buf[read_from_buffer..])?,
+            _ => 0,
+        };
 
-    if invalid_portion_len >= MAX_UTF8_SEQUENCE_SIZE {
-        return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError));
+        Ok(read_from_buffer + read_from_inner)
     }
-
-    // read_until_utf8 will not read anything if the buffer is empty,
-    // since an empty buffer is a valid UTF-8 sequence.
-    let bytes_until_utf8 = read_until_utf8(reader, &mut buf[valid_up_to..], invalid_portion_len)?;
-    Ok(valid_up_to + bytes_until_utf8)
 }
 
 /// Reads a single byte at a time from the inner reader into the buffer, starting from
@@ -139,16 +141,12 @@ fn read_until_utf8(
     buffer: &mut [u8],
     mut current_index: usize,
 ) -> Result<usize> {
-    while std::str::from_utf8(&buffer[..current_index]).is_err() {
-        if current_index >= MAX_UTF8_SEQUENCE_SIZE {
-            // We already have 4 bytes in the buffer (maximum UTF-8 sequence size)
-            // so we cannot form a valid UTF-8 sequence by reading more bytes.
-            return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError));
-        }
-
-        if reader.read(&mut buffer[current_index..current_index + 1])? == 0 {
-            // Stream has been exhausted without finding
-            // a valid UTF-8 sequence.
+    while str::from_utf8(&buffer[..current_index]).is_err() {
+        if current_index >= MAX_UTF8_SEQUENCE_SIZE
+            || reader.read(&mut buffer[current_index..current_index + 1])? == 0
+        {
+            // We already have 4 bytes in the buffer (maximum UTF-8 sequence size), or the stream
+            // has been exhausted without finding a valid UTF-8 sequence.
             return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError));
         }
 
@@ -162,12 +160,50 @@ fn read_until_utf8(
 /// in the given bytes. If the sequence is valid, returns the
 /// length of the bytes.
 fn utf8_up_to(bytes: &[u8]) -> usize {
-    match std::str::from_utf8(bytes) {
+    match str::from_utf8(bytes) {
         Ok(_) => bytes.len(),
         Err(e) => e.valid_up_to(),
     }
 }
 
+/// Given a byte slice, determines if the slice ends in what might be an incomplete UTF-8 sequence,
+/// returning the incomplete sequence if so.
+///
+/// The following return values are possible:
+///   - Ok([]) if the byte slice is valid UTF-8, in its entirety.
+///   - Ok(incomplete_sequence) if the byte slice is valid up to the `incomplete_sequence` slice
+///     returned by this function. `incomplete_sequence` is always a slice of at most 3 bytes
+///     occurring at the end of the input slice. In this case, it might be possible to make the
+///     input slice a valid UTF-8 sequence by appending bytes to the input slice.
+///   - Err(e) if the first UTF-8 violation in the input slice occurs more than 3 bytes from the
+///     end of the input slice. In this case, it is definitely not possible to make the sequence
+///     valid by appending bytes to the input slice.
+fn ending_incomplete_utf8_sequence(bytes: &[u8]) -> Result<&[u8]> {
+    let valid_up_to = utf8_up_to(bytes);
+    let invalid_portion = &bytes[valid_up_to..];
+
+    if invalid_portion.len() >= MAX_UTF8_SEQUENCE_SIZE {
+        Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError))
+    } else {
+        Ok(invalid_portion)
+    }
+}
+
+/// Copies as many elements as possible (i.e. the smaller of the two slice lengths) from the
+/// beginning of the source slice to the beginning of the destination slice, overwriting anything
+/// already in the destination slice.
+///
+/// Returns the number of elements copied.
+fn slice_copy<T>(dst: &mut [T], src: &[T]) -> usize
+where
+    T: Copy,
+{
+    let elements_to_copy = cmp::min(dst.len(), src.len());
+    dst[..elements_to_copy].copy_from_slice(&src[..elements_to_copy]);
+
+    elements_to_copy
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -207,41 +243,6 @@ mod tests {
         assert_eq!(buf, HELLO_WORLD.as_bytes());
     }
 
-    #[test]
-    fn multibyte_character_at_end_of_buffer() {
-        // Having a multibyte character at the end of the buffer will cause us to hit
-        // read_until_utf8.
-        let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE];
-        read_buffer.extend("🙂".as_bytes());
-
-        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
-
-        let mut buf = [0; MAX_BUFFER_SIZE];
-        let bytes_read = reader.read(&mut buf).expect("read errored");
-
-        // We expect the buffer to be filled with the read bytes. We first read all but the last
-        // three bytes. But, we will need to fill these three bytes to get a valid UTF-8 sequence,
-        // since "🙂" is a 4-byte UTF-8 sequence.
-        assert_eq!(bytes_read, buf.len(), "buffer not filled");
-        assert_eq!(&buf[..], read_buffer);
-    }
-
-    #[test]
-    fn multibyte_character_at_end_of_big_read() {
-        // Big reads bypass buffering, so basically, we retest multibyte_character_at_end_of_buffer
-        // for this case.
-        let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE + 10];
-        read_buffer.extend("🙂".as_bytes());
-
-        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
-
-        let mut buf = [0; MAX_BUFFER_SIZE + MAX_UTF8_SEQUENCE_SIZE + 10];
-        let bytes_read = reader.read(&mut buf).expect("read errored");
-
-        assert_eq!(bytes_read, buf.len(), "buffer not filled");
-        assert_eq!(&buf[..], read_buffer);
-    }
-
     #[test]
     fn small_reads_splitting_sequence() {
         let mut reader = Utf8Reader::new(Cursor::new("🙂".as_bytes()));
@@ -278,18 +279,4 @@ mod tests {
             .read_to_end(&mut vec![])
             .expect_err("read should have errored");
     }
-
-    #[test]
-    fn invalid_utf8_sequence_at_end_of_reader_and_buffer() {
-        let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE];
-
-        // Cutting off the last byte will invalidate the UTF-8 sequence.
-        let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1];
-        read_buffer.extend(invalid_sequence);
-
-        let mut reader = Utf8Reader::new(Cursor::new(&read_buffer));
-
-        let mut buf = [0; MAX_BUFFER_SIZE];
-        reader.read(&mut buf).expect_err("read should have errored");
-    }
 }

From 7e78aa4d62f94eed73cb9e248050bf045fe1a3ad Mon Sep 17 00:00:00 2001
From: Daniel Szoke <daniel.szoke@sentry.io>
Date: Thu, 16 Jan 2025 12:15:21 +0100
Subject: [PATCH 3/6] ref: `match` instead of `if`

---
 symbolic-debuginfo/src/sourcebundle/mod.rs | 33 ++++++++++++----------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/symbolic-debuginfo/src/sourcebundle/mod.rs b/symbolic-debuginfo/src/sourcebundle/mod.rs
index 7d6cb6050..01bbac277 100644
--- a/symbolic-debuginfo/src/sourcebundle/mod.rs
+++ b/symbolic-debuginfo/src/sourcebundle/mod.rs
@@ -1236,21 +1236,24 @@ where
             .start_file(unique_path.clone(), default_file_options())
             .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
 
-        if let Err(e) = io::copy(&mut file_reader, &mut self.writer) {
-            self.writer
-                .abort_file()
-                .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
-
-            // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8.
-            let error_kind = match e.kind() {
-                ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed,
-                _ => SourceBundleErrorKind::WriteFailed,
-            };
-
-            Err(SourceBundleError::new(error_kind, e))
-        } else {
-            self.manifest.files.insert(unique_path, info);
-            Ok(())
+        match io::copy(&mut file_reader, &mut self.writer) {
+            Err(e) => {
+                self.writer
+                    .abort_file()
+                    .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
+
+                // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8.
+                let error_kind = match e.kind() {
+                    ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed,
+                    _ => SourceBundleErrorKind::WriteFailed,
+                };
+
+                Err(SourceBundleError::new(error_kind, e))
+            }
+            Ok(_) => {
+                self.manifest.files.insert(unique_path, info);
+                Ok(())
+            }
         }
     }
 

From 946bbf7b21bc7e3cc7a71c0d22284f07ccbb643a Mon Sep 17 00:00:00 2001
From: Daniel Szoke <7881302+szokeasaurusrex@users.noreply.github.com>
Date: Thu, 16 Jan 2025 13:57:44 +0100
Subject: [PATCH 4/6] Update symbolic-debuginfo/src/sourcebundle/utf8_reader.rs

Co-authored-by: Sebastian Zivota <loewenheim@users.noreply.github.com>
---
 symbolic-debuginfo/src/sourcebundle/utf8_reader.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
index 7d95916e1..88c9f7384 100644
--- a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
+++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
@@ -62,9 +62,10 @@ impl<R> Utf8Reader<R>
 where
     R: Read,
 {
-    /// Reads bytes from the inner reader into the given buffer, if needed, filling self.buffer
-    /// with the remaining bytes of the UTF-8 sequence at the end of the buffer, which did not fit
-    /// in the read.
+    /// Reads bytes from the inner reader into the given buffer.
+    ///
+    /// If a UTF-8 sequence only partially fits into the buffer, the remaining
+    /// bytes are written to `self.buffer`.
     fn read_from_inner(&mut self, buf: &mut [u8]) -> Result<usize> {
         let read_from_inner = self.inner.read(buf)?;
         let read_portion = &buf[..read_from_inner];

From c81365a68c7e6761f1847bc340517b02e72c8566 Mon Sep 17 00:00:00 2001
From: Sebastian Zivota <loewenheim@mailbox.org>
Date: Thu, 16 Jan 2025 16:01:16 +0100
Subject: [PATCH 5/6] proptest utf8-reader

---
 Cargo.lock                                    | 115 ++++++++++++++++++
 Cargo.toml                                    |   1 +
 symbolic-debuginfo/Cargo.toml                 |   1 +
 .../src/sourcebundle/utf8_reader.rs           |  23 ++++
 4 files changed, 140 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 4d55e2799..1f1a8b8b9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -227,6 +227,21 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab"
 
+[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -295,6 +310,12 @@ version = "3.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "bytes"
 version = "1.9.0"
@@ -693,6 +714,12 @@ dependencies = [
  "miniz_oxide",
 ]
 
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.2.1"
@@ -1646,6 +1673,15 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
 
+[[package]]
+name = "ppv-lite86"
+version = "0.2.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
+dependencies = [
+ "zerocopy",
+]
+
 [[package]]
 name = "precomputed-hash"
 version = "0.1.1"
@@ -1683,6 +1719,26 @@ dependencies = [
  "watto",
 ]
 
+[[package]]
+name = "proptest"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50"
+dependencies = [
+ "bit-set",
+ "bit-vec",
+ "bitflags 2.7.0",
+ "lazy_static",
+ "num-traits",
+ "rand",
+ "rand_chacha",
+ "rand_xorshift",
+ "regex-syntax 0.8.5",
+ "rusty-fork",
+ "tempfile",
+ "unarray",
+]
+
 [[package]]
 name = "psm"
 version = "0.1.24"
@@ -1692,6 +1748,12 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "quick-error"
+version = "1.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
+
 [[package]]
 name = "quote"
 version = "1.0.38"
@@ -1713,6 +1775,18 @@ version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
 dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
  "rand_core",
 ]
 
@@ -1721,6 +1795,18 @@ name = "rand_core"
 version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "rand_xorshift"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f"
+dependencies = [
+ "rand_core",
+]
 
 [[package]]
 name = "range-collections"
@@ -1849,6 +1935,18 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "rusty-fork"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f"
+dependencies = [
+ "fnv",
+ "quick-error",
+ "tempfile",
+ "wait-timeout",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.18"
@@ -2381,6 +2479,7 @@ dependencies = [
  "once_cell",
  "parking_lot",
  "pdb-addr2line",
+ "proptest",
  "regex",
  "scroll 0.12.0",
  "serde",
@@ -2741,6 +2840,12 @@ version = "2.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a"
 
+[[package]]
+name = "unarray"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94"
+
 [[package]]
 name = "unicode-id-start"
 version = "1.0.4"
@@ -2823,6 +2928,15 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
 
+[[package]]
+name = "wait-timeout"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "walkdir"
 version = "2.5.0"
@@ -3191,6 +3305,7 @@ version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
 dependencies = [
+ "byteorder",
  "zerocopy-derive",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index 82346d281..e8f17957d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ once_cell = "1.17.1"
 parking_lot = "0.12.1"
 pdb-addr2line = "0.10.4"
 proguard = { version = "5.4.0", features = ["uuid"] }
+proptest = "1.6.0"
 regex = "1.7.1"
 rustc-demangle = "0.1.21"
 # keep this in sync with whatever version `goblin` uses
diff --git a/symbolic-debuginfo/Cargo.toml b/symbolic-debuginfo/Cargo.toml
index 2f08e1d32..f20ee6472 100644
--- a/symbolic-debuginfo/Cargo.toml
+++ b/symbolic-debuginfo/Cargo.toml
@@ -115,6 +115,7 @@ zstd = { workspace = true, optional = true }
 [dev-dependencies]
 criterion = { workspace = true }
 insta = { workspace = true }
+proptest = {workspace = true }
 tempfile = { workspace = true }
 similar-asserts = { workspace = true }
 symbolic-testutils = { path = "../symbolic-testutils" }
diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
index 88c9f7384..64713814e 100644
--- a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
+++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs
@@ -209,6 +209,9 @@ where
 mod tests {
     use super::*;
 
+    use proptest::prelude::*;
+
+    use core::str;
     use std::io::Cursor;
 
     #[test]
@@ -280,4 +283,24 @@ mod tests {
             .read_to_end(&mut vec![])
             .expect_err("read should have errored");
     }
+
+    proptest! {
+        #[test]
+        fn read_arbitrary_string(s in any::<String>()) {
+            let mut buf = vec![];
+            let mut reader = Utf8Reader::new(Cursor::new(&s));
+
+            reader.read_to_end(&mut buf).unwrap();
+            assert_eq!(str::from_utf8(&buf).unwrap(), s);
+        }
+
+        #[test]
+        fn dont_read_arbitrary_nonstring(s in any::<Vec<u8>>()) {
+            prop_assume!(str::from_utf8(&s).is_err());
+            let mut buf = vec![];
+            let mut reader = Utf8Reader::new(Cursor::new(&s));
+
+            reader.read_to_end(&mut buf).unwrap_err();
+        }
+    }
 }

From 811dd44af10ad99f471c1c7a5c3ca4c80225b298 Mon Sep 17 00:00:00 2001
From: Daniel Szoke <daniel.szoke@sentry.io>
Date: Mon, 20 Jan 2025 10:02:23 +0100
Subject: [PATCH 6/6] Update Changelog

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ca211b388..c5962c6c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,13 @@
 # Changelog
 
-## 12.13.2
+## Unreleased
 
 **Improvements**
 
 - Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890))
 
+## 12.13.2
+
 **Fixes**
 
 - Fixed GHA-based Windows builds ([#891](https://github.com/getsentry/symbolic/pull/891)).