From 21fe060d8a76c06066a339bbf1884290cc1f9d75 Mon Sep 17 00:00:00 2001 From: Daniel Szoke Date: Fri, 10 Jan 2025 14:51:47 +0100 Subject: [PATCH 1/6] ref(sourcebundle): Check UTF-8 validity memory efficiently The current check to ensure a sourcebundle is valid UTF-8 reads the entire sourcebundle file into memory. This is inefficient for large files. This PR introduces a UTF8Reader which wraps any reader. The UTF8Reader ensures that the stream is valid UTF8 as it is being read, while only requiring a small amount of memory (currently 8 KiB) to be allocated as a buffer. --- CHANGELOG.md | 4 + .../{sourcebundle.rs => sourcebundle/mod.rs} | 36 ++- .../src/sourcebundle/utf8_reader.rs | 295 ++++++++++++++++++ 3 files changed, 322 insertions(+), 13 deletions(-) rename symbolic-debuginfo/src/{sourcebundle.rs => sourcebundle/mod.rs} (98%) create mode 100644 symbolic-debuginfo/src/sourcebundle/utf8_reader.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f6b57e0a..ca211b388 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## 12.13.2 +**Improvements** + +- Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890)) + **Fixes** - Fixed GHA-based Windows builds ([#891](https://github.com/getsentry/symbolic/pull/891)). diff --git a/symbolic-debuginfo/src/sourcebundle.rs b/symbolic-debuginfo/src/sourcebundle/mod.rs similarity index 98% rename from symbolic-debuginfo/src/sourcebundle.rs rename to symbolic-debuginfo/src/sourcebundle/mod.rs index 1dbce62c7..7d6cb6050 100644 --- a/symbolic-debuginfo/src/sourcebundle.rs +++ b/symbolic-debuginfo/src/sourcebundle/mod.rs @@ -42,15 +42,17 @@ //! bundle a file entry has a `url` and might carry `headers` or individual debug IDs //! per source file. +mod utf8_reader; + use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::error::Error; -use std::fmt; use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufReader, BufWriter, Read, Seek, Write}; +use std::io::{BufReader, BufWriter, ErrorKind, Read, Seek, Write}; use std::path::Path; use std::sync::Arc; +use std::{fmt, io}; use parking_lot::Mutex; use regex::Regex; @@ -60,6 +62,7 @@ use zip::{write::SimpleFileOptions, ZipWriter}; use symbolic_common::{Arch, AsSelf, CodeId, DebugId, SourceLinkMappings}; +use self::utf8_reader::Utf8Reader; use crate::base::*; use crate::js::{ discover_debug_id, discover_sourcemap_embedded_debug_id, discover_sourcemaps_location, @@ -1217,18 +1220,14 @@ where pub fn add_file( &mut self, path: S, - mut file: R, + file: R, info: SourceFileInfo, ) -> Result<(), SourceBundleError> where S: AsRef, R: Read, { - let mut buf = String::new(); - - if let Err(e) = file.read_to_string(&mut buf) { - return Err(SourceBundleError::new(SourceBundleErrorKind::ReadFailed, e)); - } + let mut file_reader = Utf8Reader::new(file); let full_path = self.file_path(path.as_ref()); let unique_path = self.unique_path(full_path); @@ -1236,12 +1235,23 @@ where self.writer .start_file(unique_path.clone(), default_file_options()) .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; - self.writer - .write_all(buf.as_bytes()) - .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; - self.manifest.files.insert(unique_path, info); - Ok(()) + if let Err(e) = io::copy(&mut file_reader, &mut self.writer) { + self.writer + .abort_file() + .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; + + // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8. + let error_kind = match e.kind() { + ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed, + _ => SourceBundleErrorKind::WriteFailed, + }; + + Err(SourceBundleError::new(error_kind, e)) + } else { + self.manifest.files.insert(unique_path, info); + Ok(()) + } } /// Calls add_file, and handles any ReadFailed errors by calling the skipped_file_callback. diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs new file mode 100644 index 000000000..dba964c39 --- /dev/null +++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs @@ -0,0 +1,295 @@ +//! UTF-8 reader used by the sourcebundle module to read files. + +use std::io::{BufRead, Error, ErrorKind, Read, Result}; +use thiserror::Error; + +const MAX_UTF8_SEQUENCE_SIZE: usize = 4; +const MAX_BUFFER_SIZE: usize = 8 * 1024; + +#[derive(Debug, Error)] +#[error("Invalid UTF-8 sequence")] +pub(crate) struct UTF8ReaderError; + +pub struct Utf8Reader { + inner: R, + + /// buffer_array[..buffer_end] always contains a valid UTF-8 sequence. It is possible that + /// buffer_array[buffer_start..buffer_end] does not contain a valid UTF-8 sequence, if + /// buffer_start is in the middle of a multi-byte UTF-8 character, which would happen if that + /// character has only partially been read. + buffer_array: Box<[u8; MAX_BUFFER_SIZE]>, + buffer_start: usize, + buffer_end: usize, +} + +impl Utf8Reader { + pub fn new(inner: R) -> Self { + Self { + inner, + buffer_array: Box::new([0; MAX_BUFFER_SIZE]), + buffer_start: 0, + buffer_end: 0, + } + } + + fn buffer(&self) -> &[u8] { + &self.buffer_array[self.buffer_start..self.buffer_end] + } +} + +impl Utf8Reader where R: Read {} + +impl Read for Utf8Reader +where + R: Read, +{ + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + if self.buffer().is_empty() && buf.len() > MAX_BUFFER_SIZE { + // If buf is bigger than our internal buffer, and there is nothing left in + // our internal buffer, just read directly into buf. + return read_utf8(&mut self.inner, buf); + } + + self.fill_buf()?; + + let bytes_to_copy = std::cmp::min(buf.len(), self.buffer().len()); + buf[..bytes_to_copy].copy_from_slice(&self.buffer()[..bytes_to_copy]); + self.consume(bytes_to_copy); + + Ok(bytes_to_copy) + } +} + +impl BufRead for Utf8Reader +where + R: Read, +{ + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + if !self.buffer().is_empty() { + return Ok(self.buffer()); + } + + let bytes_read = read_utf8(&mut self.inner, &mut self.buffer_array[..])?; + self.buffer_start = 0; + self.buffer_end = bytes_read; + + Ok(self.buffer()) + } + + fn consume(&mut self, amt: usize) { + self.buffer_start += amt; + } +} + +/// Reads a UTF-8 sequence from the inner reader into the buffer, returning the number +/// of bytes read. Errors if this is not possible. +/// +/// The function guarantees that it will return an Ok variant with a positive +/// number of bytes read if it is possible to read a valid UTF-8 sequence from the inner +/// reader. The function also guarantees that all of the bytes read from the inner reader will +/// be stored in in the buffer, at indices up to the value returned by the function (in other +/// words, no bytes are lost). +/// +/// Panics if the buffer is not at least `MAX_UTF8_SEQUENCE_SIZE` bytes long. +fn read_utf8(reader: &mut impl Read, buf: &mut [u8]) -> Result { + if buf.len() < MAX_UTF8_SEQUENCE_SIZE { + panic!("Buffer needs to be at least {MAX_UTF8_SEQUENCE_SIZE} bytes long"); + } + + // We need to leave at least three bytes in the buffer in case the first read + // ends in the middle of a UTF-8 character. In the worst case, we end at the first + // byte of a 4-byte UTF-8 character, and we need to read 3 more bytes to get a + // valid sequence. + let bytes_to_read = buf.len() - MAX_UTF8_SEQUENCE_SIZE + 1; + let read_buf = &mut buf[..bytes_to_read]; + + let bytes_read = reader.read(read_buf)?; + let read_portion = &buf[..bytes_read]; + + let valid_up_to = utf8_up_to(read_portion); + let invalid_portion_len = read_portion.len() - valid_up_to; + + if invalid_portion_len >= MAX_UTF8_SEQUENCE_SIZE { + return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); + } + + // read_until_utf8 will not read anything if the buffer is empty, + // since an empty buffer is a valid UTF-8 sequence. + let bytes_until_utf8 = read_until_utf8(reader, &mut buf[valid_up_to..], invalid_portion_len)?; + Ok(valid_up_to + bytes_until_utf8) +} + +/// Reads a single byte at a time from the inner reader into the buffer, starting from +/// current_index, until either the buffer contains a valid UTF-8 sequence, the reader is +/// exhausted, or the 4 bytes total have been read into the buffer (the maximum size of a +/// UTF-8 sequence). The 4 bytes also includes bytes already in the buffer when the function is +/// called, as indicated by the current_index parameter. +/// +/// If the buffer is empty (i.e. current_index is 0), we will return Ok(0) without reading +/// anything from the reader. An empty byte sequence is a valid UTF-8 sequence (the +/// empty string). +/// +/// Returns the number of bytes read into the buffer (including bytes already in the buffer +/// when the function is called), or an error if the reader errors or if reading a valid UTF-8 +/// sequence is not possible. +/// +/// The buffer must have a length of at least 4, otherwise this function can panic. +fn read_until_utf8( + reader: &mut impl Read, + buffer: &mut [u8], + mut current_index: usize, +) -> Result { + while std::str::from_utf8(&buffer[..current_index]).is_err() { + if current_index >= MAX_UTF8_SEQUENCE_SIZE { + // We already have 4 bytes in the buffer (maximum UTF-8 sequence size) + // so we cannot form a valid UTF-8 sequence by reading more bytes. + return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); + } + + if reader.read(&mut buffer[current_index..current_index + 1])? == 0 { + // Stream has been exhausted without finding + // a valid UTF-8 sequence. + return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); + } + + current_index += 1; + } + + Ok(current_index) +} + +/// Returns the index of the first invalid UTF-8 sequence +/// in the given bytes. If the sequence is valid, returns the +/// length of the bytes. +fn utf8_up_to(bytes: &[u8]) -> usize { + match std::str::from_utf8(bytes) { + Ok(_) => bytes.len(), + Err(e) => e.valid_up_to(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::io::Cursor; + + #[test] + fn test_read_empty() { + let mut empty_reader = Utf8Reader::new(Cursor::new(b"")); + + let mut buf = vec![]; + empty_reader + .read_to_end(&mut buf) + .expect("read_to_end errored"); + + assert_eq!(buf, b""); + } + + #[test] + fn test_read_ascii_simple() { + let mut reader = Utf8Reader::new(Cursor::new(b"Hello, world!")); + + let mut buf = vec![]; + reader.read_to_end(&mut buf).expect("read_to_end errored"); + + assert_eq!(buf, b"Hello, world!"); + } + + #[test] + fn test_read_utf8_simple() { + const HELLO_WORLD: &str = "你好世界!"; + let mut reader = Utf8Reader::new(Cursor::new(HELLO_WORLD.as_bytes())); + + let mut buf = vec![]; + reader.read_to_end(&mut buf).expect("read_to_end errored"); + + assert_eq!(buf, HELLO_WORLD.as_bytes()); + } + + #[test] + fn multibyte_character_at_end_of_buffer() { + // Having a multibyte character at the end of the buffer will cause us to hit + // read_until_utf8. + let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE]; + read_buffer.extend("🙂".as_bytes()); + + let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); + + let mut buf = [0; MAX_BUFFER_SIZE]; + let bytes_read = reader.read(&mut buf).expect("read errored"); + + // We expect the buffer to be filled with the read bytes. We first read all but the last + // three bytes. But, we will need to fill these three bytes to get a valid UTF-8 sequence, + // since "🙂" is a 4-byte UTF-8 sequence. + assert_eq!(bytes_read, buf.len(), "buffer not filled"); + assert_eq!(&buf[..], read_buffer); + } + + #[test] + fn multibyte_character_at_end_of_big_read() { + // Big reads bypass buffering, so basically, we retest multibyte_character_at_end_of_buffer + // for this case. + let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE + 10]; + read_buffer.extend("🙂".as_bytes()); + + let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); + + let mut buf = [0; MAX_BUFFER_SIZE + MAX_UTF8_SEQUENCE_SIZE + 10]; + let bytes_read = reader.read(&mut buf).expect("read errored"); + + assert_eq!(bytes_read, buf.len(), "buffer not filled"); + assert_eq!(&buf[..], read_buffer); + } + + #[test] + fn small_reads_splitting_sequence() { + let mut reader = Utf8Reader::new(Cursor::new("🙂".as_bytes())); + + let mut buf = [0; MAX_UTF8_SEQUENCE_SIZE]; + + for i in 0..MAX_UTF8_SEQUENCE_SIZE { + // Read at most one byte at a time. + let bytes_read = reader.read(&mut buf[i..i + 1]).expect("read errored"); + assert_eq!(bytes_read, 1, "bytes read"); + } + + assert_eq!(&buf[..], "🙂".as_bytes()); + } + + #[test] + fn invalid_utf8_sequence() { + let mut reader = Utf8Reader::new(Cursor::new([0b11111111])); + + let mut buf = [0; 1]; + reader.read(&mut buf).expect_err("read should have errored"); + } + + #[test] + fn invalid_utf8_sequence_at_end_of_reader() { + let mut read_buffer = Vec::from(b"Hello, world!"); + + // Cutting off the last byte will invalidate the UTF-8 sequence. + let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1]; + read_buffer.extend(invalid_sequence); + + let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); + reader + .read_to_end(&mut vec![]) + .expect_err("read should have errored"); + } + + #[test] + fn invalid_utf8_sequence_at_end_of_reader_and_buffer() { + let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE]; + + // Cutting off the last byte will invalidate the UTF-8 sequence. + let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1]; + read_buffer.extend(invalid_sequence); + + let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); + + let mut buf = [0; MAX_BUFFER_SIZE]; + reader.read(&mut buf).expect_err("read should have errored"); + } +} From 4f6f0c57ce4d304fa11407d343785ce82a1a7ab7 Mon Sep 17 00:00:00 2001 From: Daniel Szoke Date: Thu, 16 Jan 2025 11:59:24 +0100 Subject: [PATCH 2/6] ref: `Utf8Reader` no longer is buffering --- .../src/sourcebundle/utf8_reader.rs | 253 +++++++++--------- 1 file changed, 120 insertions(+), 133 deletions(-) diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs index dba964c39..7d95916e1 100644 --- a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs +++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs @@ -1,10 +1,12 @@ //! UTF-8 reader used by the sourcebundle module to read files. -use std::io::{BufRead, Error, ErrorKind, Read, Result}; +use std::cmp; +use std::io::{Error, ErrorKind, Read, Result}; +use std::str; + use thiserror::Error; const MAX_UTF8_SEQUENCE_SIZE: usize = 4; -const MAX_BUFFER_SIZE: usize = 8 * 1024; #[derive(Debug, Error)] #[error("Invalid UTF-8 sequence")] @@ -13,12 +15,17 @@ pub(crate) struct UTF8ReaderError; pub struct Utf8Reader { inner: R, - /// buffer_array[..buffer_end] always contains a valid UTF-8 sequence. It is possible that - /// buffer_array[buffer_start..buffer_end] does not contain a valid UTF-8 sequence, if - /// buffer_start is in the middle of a multi-byte UTF-8 character, which would happen if that - /// character has only partially been read. - buffer_array: Box<[u8; MAX_BUFFER_SIZE]>, + /// A buffer of `MAX_UTF8_SEQUENCE_SIZE` bytes, which we use when the end of a read is not a + /// valid UTF-8 sequence. We read into this buffer until we have a valid UTF-8 sequence, or + /// the reader is exhausted, or the buffer is full. Assuming we get a valid UTF-8 sequence, + /// we then on next read read from this buffer. + buffer: [u8; MAX_UTF8_SEQUENCE_SIZE], + + /// The index of the first byte in the buffer that has not been read. buffer_start: usize, + + /// The index of the last byte in the buffer that is part of the UTF-8 sequence. We only read + /// up to this index, bytes after this index are discarded. buffer_end: usize, } @@ -26,97 +33,92 @@ impl Utf8Reader { pub fn new(inner: R) -> Self { Self { inner, - buffer_array: Box::new([0; MAX_BUFFER_SIZE]), + buffer: [0; MAX_UTF8_SEQUENCE_SIZE], buffer_start: 0, buffer_end: 0, } } - fn buffer(&self) -> &[u8] { - &self.buffer_array[self.buffer_start..self.buffer_end] + /// The slice of the buffer that should be read next. + fn buffer_to_read(&self) -> &[u8] { + &self.buffer[self.buffer_start..self.buffer_end] } -} - -impl Utf8Reader where R: Read {} - -impl Read for Utf8Reader -where - R: Read, -{ - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - if self.buffer().is_empty() && buf.len() > MAX_BUFFER_SIZE { - // If buf is bigger than our internal buffer, and there is nothing left in - // our internal buffer, just read directly into buf. - return read_utf8(&mut self.inner, buf); - } - self.fill_buf()?; + /// Advances the buffer start index by the given amount. + fn advance(&mut self, amt: usize) { + self.buffer_start += amt; + } - let bytes_to_copy = std::cmp::min(buf.len(), self.buffer().len()); - buf[..bytes_to_copy].copy_from_slice(&self.buffer()[..bytes_to_copy]); - self.consume(bytes_to_copy); + /// Reads bytes from the buffer into the given buffer. + fn read_from_buffer(&mut self, buf: &mut [u8]) -> Result { + let bytes_copied = slice_copy(buf, self.buffer_to_read()); + self.advance(bytes_copied); - Ok(bytes_to_copy) + Ok(bytes_copied) } } -impl BufRead for Utf8Reader +impl Utf8Reader where R: Read, { - fn fill_buf(&mut self) -> std::io::Result<&[u8]> { - if !self.buffer().is_empty() { - return Ok(self.buffer()); - } + /// Reads bytes from the inner reader into the given buffer, if needed, filling self.buffer + /// with the remaining bytes of the UTF-8 sequence at the end of the buffer, which did not fit + /// in the read. + fn read_from_inner(&mut self, buf: &mut [u8]) -> Result { + let read_from_inner = self.inner.read(buf)?; + let read_portion = &buf[..read_from_inner]; - let bytes_read = read_utf8(&mut self.inner, &mut self.buffer_array[..])?; - self.buffer_start = 0; - self.buffer_end = bytes_read; + let invalid_portion = ending_incomplete_utf8_sequence(read_portion)?; - Ok(self.buffer()) - } + slice_copy(&mut self.buffer, invalid_portion); + self.read_into_buffer_until_utf8(invalid_portion.len())?; - fn consume(&mut self, amt: usize) { - self.buffer_start += amt; + Ok(read_from_inner) } -} -/// Reads a UTF-8 sequence from the inner reader into the buffer, returning the number -/// of bytes read. Errors if this is not possible. -/// -/// The function guarantees that it will return an Ok variant with a positive -/// number of bytes read if it is possible to read a valid UTF-8 sequence from the inner -/// reader. The function also guarantees that all of the bytes read from the inner reader will -/// be stored in in the buffer, at indices up to the value returned by the function (in other -/// words, no bytes are lost). -/// -/// Panics if the buffer is not at least `MAX_UTF8_SEQUENCE_SIZE` bytes long. -fn read_utf8(reader: &mut impl Read, buf: &mut [u8]) -> Result { - if buf.len() < MAX_UTF8_SEQUENCE_SIZE { - panic!("Buffer needs to be at least {MAX_UTF8_SEQUENCE_SIZE} bytes long"); + /// Reads bytes from the inner reader into self.buffer until the buffer contains a valid UTF-8 + /// sequence. + /// + /// Before calling this method, self.buffer[..start_index] should contain the bytes that are + /// part of the incomplete UTF-8 sequence that we are trying to complete (or, it should be + /// empty, in which case, this function is a no-op) + /// + /// Then, we read one byte at a time from the inner reader into self.buffer, until we have a + /// valid UTF-8 sequence. + /// + /// Lastly, we set self.buffer_start to start_index (so we don't reread the bytes that started + /// the incomplete UTF-8 sequence) and self.buffer_end to the index of the last byte in the + /// buffer that is part of the UTF-8 sequence. + /// + /// The next read from the Utf8Reader will read from + /// self.buffer[self.buffer_start..self.buffer_end]. + fn read_into_buffer_until_utf8(&mut self, start_index: usize) -> Result<()> { + let bytes_until_utf8 = read_until_utf8(&mut self.inner, &mut self.buffer, start_index)?; + + self.buffer_start = start_index; + self.buffer_end = bytes_until_utf8; + + Ok(()) } +} - // We need to leave at least three bytes in the buffer in case the first read - // ends in the middle of a UTF-8 character. In the worst case, we end at the first - // byte of a 4-byte UTF-8 character, and we need to read 3 more bytes to get a - // valid sequence. - let bytes_to_read = buf.len() - MAX_UTF8_SEQUENCE_SIZE + 1; - let read_buf = &mut buf[..bytes_to_read]; - - let bytes_read = reader.read(read_buf)?; - let read_portion = &buf[..bytes_read]; +impl Read for Utf8Reader +where + R: Read, +{ + fn read(&mut self, buf: &mut [u8]) -> Result { + let read_from_buffer = self.read_from_buffer(buf)?; - let valid_up_to = utf8_up_to(read_portion); - let invalid_portion_len = read_portion.len() - valid_up_to; + // self.read_from_inner overwrites self.buffer, so we can only call it if we have read + // everything from the buffer. + let read_from_inner = match self.buffer_to_read() { + [] => self.read_from_inner(&mut buf[read_from_buffer..])?, + _ => 0, + }; - if invalid_portion_len >= MAX_UTF8_SEQUENCE_SIZE { - return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); + Ok(read_from_buffer + read_from_inner) } - - // read_until_utf8 will not read anything if the buffer is empty, - // since an empty buffer is a valid UTF-8 sequence. - let bytes_until_utf8 = read_until_utf8(reader, &mut buf[valid_up_to..], invalid_portion_len)?; - Ok(valid_up_to + bytes_until_utf8) } /// Reads a single byte at a time from the inner reader into the buffer, starting from @@ -139,16 +141,12 @@ fn read_until_utf8( buffer: &mut [u8], mut current_index: usize, ) -> Result { - while std::str::from_utf8(&buffer[..current_index]).is_err() { - if current_index >= MAX_UTF8_SEQUENCE_SIZE { - // We already have 4 bytes in the buffer (maximum UTF-8 sequence size) - // so we cannot form a valid UTF-8 sequence by reading more bytes. - return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); - } - - if reader.read(&mut buffer[current_index..current_index + 1])? == 0 { - // Stream has been exhausted without finding - // a valid UTF-8 sequence. + while str::from_utf8(&buffer[..current_index]).is_err() { + if current_index >= MAX_UTF8_SEQUENCE_SIZE + || reader.read(&mut buffer[current_index..current_index + 1])? == 0 + { + // We already have 4 bytes in the buffer (maximum UTF-8 sequence size), or the stream + // has been exhausted without finding a valid UTF-8 sequence. return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); } @@ -162,12 +160,50 @@ fn read_until_utf8( /// in the given bytes. If the sequence is valid, returns the /// length of the bytes. fn utf8_up_to(bytes: &[u8]) -> usize { - match std::str::from_utf8(bytes) { + match str::from_utf8(bytes) { Ok(_) => bytes.len(), Err(e) => e.valid_up_to(), } } +/// Given a byte slice, determines if the slice ends in what might be an incomplete UTF-8 sequence, +/// returning the incomplete sequence if so. +/// +/// The following return values are possible: +/// - Ok([]) if the byte slice is valid UTF-8, in its entirety. +/// - Ok(incomplete_sequence) if the byte slice is valid up to the `incomplete_sequence` slice +/// returned by this function. `incomplete_sequence` is always a slice of at most 3 bytes +/// occurring at the end of the input slice. In this case, it might be possible to make the +/// input slice a valid UTF-8 sequence by appending bytes to the input slice. +/// - Err(e) if the first UTF-8 violation in the input slice occurs more than 3 bytes from the +/// end of the input slice. In this case, it is definitely not possible to make the sequence +/// valid by appending bytes to the input slice. +fn ending_incomplete_utf8_sequence(bytes: &[u8]) -> Result<&[u8]> { + let valid_up_to = utf8_up_to(bytes); + let invalid_portion = &bytes[valid_up_to..]; + + if invalid_portion.len() >= MAX_UTF8_SEQUENCE_SIZE { + Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)) + } else { + Ok(invalid_portion) + } +} + +/// Copies as many elements as possible (i.e. the smaller of the two slice lengths) from the +/// beginning of the source slice to the beginning of the destination slice, overwriting anything +/// already in the destination slice. +/// +/// Returns the number of elements copied. +fn slice_copy(dst: &mut [T], src: &[T]) -> usize +where + T: Copy, +{ + let elements_to_copy = cmp::min(dst.len(), src.len()); + dst[..elements_to_copy].copy_from_slice(&src[..elements_to_copy]); + + elements_to_copy +} + #[cfg(test)] mod tests { use super::*; @@ -207,41 +243,6 @@ mod tests { assert_eq!(buf, HELLO_WORLD.as_bytes()); } - #[test] - fn multibyte_character_at_end_of_buffer() { - // Having a multibyte character at the end of the buffer will cause us to hit - // read_until_utf8. - let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE]; - read_buffer.extend("🙂".as_bytes()); - - let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); - - let mut buf = [0; MAX_BUFFER_SIZE]; - let bytes_read = reader.read(&mut buf).expect("read errored"); - - // We expect the buffer to be filled with the read bytes. We first read all but the last - // three bytes. But, we will need to fill these three bytes to get a valid UTF-8 sequence, - // since "🙂" is a 4-byte UTF-8 sequence. - assert_eq!(bytes_read, buf.len(), "buffer not filled"); - assert_eq!(&buf[..], read_buffer); - } - - #[test] - fn multibyte_character_at_end_of_big_read() { - // Big reads bypass buffering, so basically, we retest multibyte_character_at_end_of_buffer - // for this case. - let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE + 10]; - read_buffer.extend("🙂".as_bytes()); - - let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); - - let mut buf = [0; MAX_BUFFER_SIZE + MAX_UTF8_SEQUENCE_SIZE + 10]; - let bytes_read = reader.read(&mut buf).expect("read errored"); - - assert_eq!(bytes_read, buf.len(), "buffer not filled"); - assert_eq!(&buf[..], read_buffer); - } - #[test] fn small_reads_splitting_sequence() { let mut reader = Utf8Reader::new(Cursor::new("🙂".as_bytes())); @@ -278,18 +279,4 @@ mod tests { .read_to_end(&mut vec![]) .expect_err("read should have errored"); } - - #[test] - fn invalid_utf8_sequence_at_end_of_reader_and_buffer() { - let mut read_buffer = vec![b'a'; MAX_BUFFER_SIZE - MAX_UTF8_SEQUENCE_SIZE]; - - // Cutting off the last byte will invalidate the UTF-8 sequence. - let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1]; - read_buffer.extend(invalid_sequence); - - let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); - - let mut buf = [0; MAX_BUFFER_SIZE]; - reader.read(&mut buf).expect_err("read should have errored"); - } } From 7e78aa4d62f94eed73cb9e248050bf045fe1a3ad Mon Sep 17 00:00:00 2001 From: Daniel Szoke Date: Thu, 16 Jan 2025 12:15:21 +0100 Subject: [PATCH 3/6] ref: `match` instead of `if` --- symbolic-debuginfo/src/sourcebundle/mod.rs | 33 ++++++++++++---------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/symbolic-debuginfo/src/sourcebundle/mod.rs b/symbolic-debuginfo/src/sourcebundle/mod.rs index 7d6cb6050..01bbac277 100644 --- a/symbolic-debuginfo/src/sourcebundle/mod.rs +++ b/symbolic-debuginfo/src/sourcebundle/mod.rs @@ -1236,21 +1236,24 @@ where .start_file(unique_path.clone(), default_file_options()) .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; - if let Err(e) = io::copy(&mut file_reader, &mut self.writer) { - self.writer - .abort_file() - .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; - - // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8. - let error_kind = match e.kind() { - ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed, - _ => SourceBundleErrorKind::WriteFailed, - }; - - Err(SourceBundleError::new(error_kind, e)) - } else { - self.manifest.files.insert(unique_path, info); - Ok(()) + match io::copy(&mut file_reader, &mut self.writer) { + Err(e) => { + self.writer + .abort_file() + .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; + + // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8. + let error_kind = match e.kind() { + ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed, + _ => SourceBundleErrorKind::WriteFailed, + }; + + Err(SourceBundleError::new(error_kind, e)) + } + Ok(_) => { + self.manifest.files.insert(unique_path, info); + Ok(()) + } } } From 946bbf7b21bc7e3cc7a71c0d22284f07ccbb643a Mon Sep 17 00:00:00 2001 From: Daniel Szoke <7881302+szokeasaurusrex@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:57:44 +0100 Subject: [PATCH 4/6] Update symbolic-debuginfo/src/sourcebundle/utf8_reader.rs Co-authored-by: Sebastian Zivota --- symbolic-debuginfo/src/sourcebundle/utf8_reader.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs index 7d95916e1..88c9f7384 100644 --- a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs +++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs @@ -62,9 +62,10 @@ impl Utf8Reader where R: Read, { - /// Reads bytes from the inner reader into the given buffer, if needed, filling self.buffer - /// with the remaining bytes of the UTF-8 sequence at the end of the buffer, which did not fit - /// in the read. + /// Reads bytes from the inner reader into the given buffer. + /// + /// If a UTF-8 sequence only partially fits into the buffer, the remaining + /// bytes are written to `self.buffer`. fn read_from_inner(&mut self, buf: &mut [u8]) -> Result { let read_from_inner = self.inner.read(buf)?; let read_portion = &buf[..read_from_inner]; From c81365a68c7e6761f1847bc340517b02e72c8566 Mon Sep 17 00:00:00 2001 From: Sebastian Zivota Date: Thu, 16 Jan 2025 16:01:16 +0100 Subject: [PATCH 5/6] proptest utf8-reader --- Cargo.lock | 115 ++++++++++++++++++ Cargo.toml | 1 + symbolic-debuginfo/Cargo.toml | 1 + .../src/sourcebundle/utf8_reader.rs | 23 ++++ 4 files changed, 140 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 4d55e2799..1f1a8b8b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -227,6 +227,21 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -295,6 +310,12 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.9.0" @@ -693,6 +714,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1646,6 +1673,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + [[package]] name = "precomputed-hash" version = "0.1.1" @@ -1683,6 +1719,26 @@ dependencies = [ "watto", ] +[[package]] +name = "proptest" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.7.0", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.8.5", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "psm" version = "0.1.24" @@ -1692,6 +1748,12 @@ dependencies = [ "cc", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.38" @@ -1713,6 +1775,18 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", "rand_core", ] @@ -1721,6 +1795,18 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] [[package]] name = "range-collections" @@ -1849,6 +1935,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.18" @@ -2381,6 +2479,7 @@ dependencies = [ "once_cell", "parking_lot", "pdb-addr2line", + "proptest", "regex", "scroll 0.12.0", "serde", @@ -2741,6 +2840,12 @@ version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-id-start" version = "1.0.4" @@ -2823,6 +2928,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" @@ -3191,6 +3305,7 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 82346d281..e8f17957d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ once_cell = "1.17.1" parking_lot = "0.12.1" pdb-addr2line = "0.10.4" proguard = { version = "5.4.0", features = ["uuid"] } +proptest = "1.6.0" regex = "1.7.1" rustc-demangle = "0.1.21" # keep this in sync with whatever version `goblin` uses diff --git a/symbolic-debuginfo/Cargo.toml b/symbolic-debuginfo/Cargo.toml index 2f08e1d32..f20ee6472 100644 --- a/symbolic-debuginfo/Cargo.toml +++ b/symbolic-debuginfo/Cargo.toml @@ -115,6 +115,7 @@ zstd = { workspace = true, optional = true } [dev-dependencies] criterion = { workspace = true } insta = { workspace = true } +proptest = {workspace = true } tempfile = { workspace = true } similar-asserts = { workspace = true } symbolic-testutils = { path = "../symbolic-testutils" } diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs index 88c9f7384..64713814e 100644 --- a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs +++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs @@ -209,6 +209,9 @@ where mod tests { use super::*; + use proptest::prelude::*; + + use core::str; use std::io::Cursor; #[test] @@ -280,4 +283,24 @@ mod tests { .read_to_end(&mut vec![]) .expect_err("read should have errored"); } + + proptest! { + #[test] + fn read_arbitrary_string(s in any::()) { + let mut buf = vec![]; + let mut reader = Utf8Reader::new(Cursor::new(&s)); + + reader.read_to_end(&mut buf).unwrap(); + assert_eq!(str::from_utf8(&buf).unwrap(), s); + } + + #[test] + fn dont_read_arbitrary_nonstring(s in any::>()) { + prop_assume!(str::from_utf8(&s).is_err()); + let mut buf = vec![]; + let mut reader = Utf8Reader::new(Cursor::new(&s)); + + reader.read_to_end(&mut buf).unwrap_err(); + } + } } From 811dd44af10ad99f471c1c7a5c3ca4c80225b298 Mon Sep 17 00:00:00 2001 From: Daniel Szoke Date: Mon, 20 Jan 2025 10:02:23 +0100 Subject: [PATCH 6/6] Update Changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca211b388..c5962c6c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,13 @@ # Changelog -## 12.13.2 +## Unreleased **Improvements** - Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890)) +## 12.13.2 + **Fixes** - Fixed GHA-based Windows builds ([#891](https://github.com/getsentry/symbolic/pull/891)).