diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f6b57e0a..c5962c6c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Unreleased + +**Improvements** + +- Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890)) + ## 12.13.2 **Fixes** diff --git a/Cargo.lock b/Cargo.lock index 4d55e2799..1f1a8b8b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -227,6 +227,21 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -295,6 +310,12 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.9.0" @@ -693,6 +714,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1646,6 +1673,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + [[package]] name = "precomputed-hash" version = "0.1.1" @@ -1683,6 +1719,26 @@ dependencies = [ "watto", ] +[[package]] +name = "proptest" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14cae93065090804185d3b75f0bf93b8eeda30c7a9b4a33d3bdb3988d6229e50" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.7.0", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.8.5", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "psm" version = "0.1.24" @@ -1692,6 +1748,12 @@ dependencies = [ "cc", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.38" @@ -1713,6 +1775,18 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", "rand_core", ] @@ -1721,6 +1795,18 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] [[package]] name = "range-collections" @@ -1849,6 +1935,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.18" @@ -2381,6 +2479,7 @@ dependencies = [ "once_cell", "parking_lot", "pdb-addr2line", + "proptest", "regex", "scroll 0.12.0", "serde", @@ -2741,6 +2840,12 @@ version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-id-start" version = "1.0.4" @@ -2823,6 +2928,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" @@ -3191,6 +3305,7 @@ version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 82346d281..e8f17957d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ once_cell = "1.17.1" parking_lot = "0.12.1" pdb-addr2line = "0.10.4" proguard = { version = "5.4.0", features = ["uuid"] } +proptest = "1.6.0" regex = "1.7.1" rustc-demangle = "0.1.21" # keep this in sync with whatever version `goblin` uses diff --git a/symbolic-debuginfo/Cargo.toml b/symbolic-debuginfo/Cargo.toml index 2f08e1d32..f20ee6472 100644 --- a/symbolic-debuginfo/Cargo.toml +++ b/symbolic-debuginfo/Cargo.toml @@ -115,6 +115,7 @@ zstd = { workspace = true, optional = true } [dev-dependencies] criterion = { workspace = true } insta = { workspace = true } +proptest = {workspace = true } tempfile = { workspace = true } similar-asserts = { workspace = true } symbolic-testutils = { path = "../symbolic-testutils" } diff --git a/symbolic-debuginfo/src/sourcebundle.rs b/symbolic-debuginfo/src/sourcebundle/mod.rs similarity index 98% rename from symbolic-debuginfo/src/sourcebundle.rs rename to symbolic-debuginfo/src/sourcebundle/mod.rs index 1dbce62c7..01bbac277 100644 --- a/symbolic-debuginfo/src/sourcebundle.rs +++ b/symbolic-debuginfo/src/sourcebundle/mod.rs @@ -42,15 +42,17 @@ //! bundle a file entry has a `url` and might carry `headers` or individual debug IDs //! per source file. +mod utf8_reader; + use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::error::Error; -use std::fmt; use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{BufReader, BufWriter, Read, Seek, Write}; +use std::io::{BufReader, BufWriter, ErrorKind, Read, Seek, Write}; use std::path::Path; use std::sync::Arc; +use std::{fmt, io}; use parking_lot::Mutex; use regex::Regex; @@ -60,6 +62,7 @@ use zip::{write::SimpleFileOptions, ZipWriter}; use symbolic_common::{Arch, AsSelf, CodeId, DebugId, SourceLinkMappings}; +use self::utf8_reader::Utf8Reader; use crate::base::*; use crate::js::{ discover_debug_id, discover_sourcemap_embedded_debug_id, discover_sourcemaps_location, @@ -1217,18 +1220,14 @@ where pub fn add_file( &mut self, path: S, - mut file: R, + file: R, info: SourceFileInfo, ) -> Result<(), SourceBundleError> where S: AsRef, R: Read, { - let mut buf = String::new(); - - if let Err(e) = file.read_to_string(&mut buf) { - return Err(SourceBundleError::new(SourceBundleErrorKind::ReadFailed, e)); - } + let mut file_reader = Utf8Reader::new(file); let full_path = self.file_path(path.as_ref()); let unique_path = self.unique_path(full_path); @@ -1236,12 +1235,26 @@ where self.writer .start_file(unique_path.clone(), default_file_options()) .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; - self.writer - .write_all(buf.as_bytes()) - .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; - self.manifest.files.insert(unique_path, info); - Ok(()) + match io::copy(&mut file_reader, &mut self.writer) { + Err(e) => { + self.writer + .abort_file() + .map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?; + + // ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8. + let error_kind = match e.kind() { + ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed, + _ => SourceBundleErrorKind::WriteFailed, + }; + + Err(SourceBundleError::new(error_kind, e)) + } + Ok(_) => { + self.manifest.files.insert(unique_path, info); + Ok(()) + } + } } /// Calls add_file, and handles any ReadFailed errors by calling the skipped_file_callback. diff --git a/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs new file mode 100644 index 000000000..64713814e --- /dev/null +++ b/symbolic-debuginfo/src/sourcebundle/utf8_reader.rs @@ -0,0 +1,306 @@ +//! UTF-8 reader used by the sourcebundle module to read files. + +use std::cmp; +use std::io::{Error, ErrorKind, Read, Result}; +use std::str; + +use thiserror::Error; + +const MAX_UTF8_SEQUENCE_SIZE: usize = 4; + +#[derive(Debug, Error)] +#[error("Invalid UTF-8 sequence")] +pub(crate) struct UTF8ReaderError; + +pub struct Utf8Reader { + inner: R, + + /// A buffer of `MAX_UTF8_SEQUENCE_SIZE` bytes, which we use when the end of a read is not a + /// valid UTF-8 sequence. We read into this buffer until we have a valid UTF-8 sequence, or + /// the reader is exhausted, or the buffer is full. Assuming we get a valid UTF-8 sequence, + /// we then on next read read from this buffer. + buffer: [u8; MAX_UTF8_SEQUENCE_SIZE], + + /// The index of the first byte in the buffer that has not been read. + buffer_start: usize, + + /// The index of the last byte in the buffer that is part of the UTF-8 sequence. We only read + /// up to this index, bytes after this index are discarded. + buffer_end: usize, +} + +impl Utf8Reader { + pub fn new(inner: R) -> Self { + Self { + inner, + buffer: [0; MAX_UTF8_SEQUENCE_SIZE], + buffer_start: 0, + buffer_end: 0, + } + } + + /// The slice of the buffer that should be read next. + fn buffer_to_read(&self) -> &[u8] { + &self.buffer[self.buffer_start..self.buffer_end] + } + + /// Advances the buffer start index by the given amount. + fn advance(&mut self, amt: usize) { + self.buffer_start += amt; + } + + /// Reads bytes from the buffer into the given buffer. + fn read_from_buffer(&mut self, buf: &mut [u8]) -> Result { + let bytes_copied = slice_copy(buf, self.buffer_to_read()); + self.advance(bytes_copied); + + Ok(bytes_copied) + } +} + +impl Utf8Reader +where + R: Read, +{ + /// Reads bytes from the inner reader into the given buffer. + /// + /// If a UTF-8 sequence only partially fits into the buffer, the remaining + /// bytes are written to `self.buffer`. + fn read_from_inner(&mut self, buf: &mut [u8]) -> Result { + let read_from_inner = self.inner.read(buf)?; + let read_portion = &buf[..read_from_inner]; + + let invalid_portion = ending_incomplete_utf8_sequence(read_portion)?; + + slice_copy(&mut self.buffer, invalid_portion); + self.read_into_buffer_until_utf8(invalid_portion.len())?; + + Ok(read_from_inner) + } + + /// Reads bytes from the inner reader into self.buffer until the buffer contains a valid UTF-8 + /// sequence. + /// + /// Before calling this method, self.buffer[..start_index] should contain the bytes that are + /// part of the incomplete UTF-8 sequence that we are trying to complete (or, it should be + /// empty, in which case, this function is a no-op) + /// + /// Then, we read one byte at a time from the inner reader into self.buffer, until we have a + /// valid UTF-8 sequence. + /// + /// Lastly, we set self.buffer_start to start_index (so we don't reread the bytes that started + /// the incomplete UTF-8 sequence) and self.buffer_end to the index of the last byte in the + /// buffer that is part of the UTF-8 sequence. + /// + /// The next read from the Utf8Reader will read from + /// self.buffer[self.buffer_start..self.buffer_end]. + fn read_into_buffer_until_utf8(&mut self, start_index: usize) -> Result<()> { + let bytes_until_utf8 = read_until_utf8(&mut self.inner, &mut self.buffer, start_index)?; + + self.buffer_start = start_index; + self.buffer_end = bytes_until_utf8; + + Ok(()) + } +} + +impl Read for Utf8Reader +where + R: Read, +{ + fn read(&mut self, buf: &mut [u8]) -> Result { + let read_from_buffer = self.read_from_buffer(buf)?; + + // self.read_from_inner overwrites self.buffer, so we can only call it if we have read + // everything from the buffer. + let read_from_inner = match self.buffer_to_read() { + [] => self.read_from_inner(&mut buf[read_from_buffer..])?, + _ => 0, + }; + + Ok(read_from_buffer + read_from_inner) + } +} + +/// Reads a single byte at a time from the inner reader into the buffer, starting from +/// current_index, until either the buffer contains a valid UTF-8 sequence, the reader is +/// exhausted, or the 4 bytes total have been read into the buffer (the maximum size of a +/// UTF-8 sequence). The 4 bytes also includes bytes already in the buffer when the function is +/// called, as indicated by the current_index parameter. +/// +/// If the buffer is empty (i.e. current_index is 0), we will return Ok(0) without reading +/// anything from the reader. An empty byte sequence is a valid UTF-8 sequence (the +/// empty string). +/// +/// Returns the number of bytes read into the buffer (including bytes already in the buffer +/// when the function is called), or an error if the reader errors or if reading a valid UTF-8 +/// sequence is not possible. +/// +/// The buffer must have a length of at least 4, otherwise this function can panic. +fn read_until_utf8( + reader: &mut impl Read, + buffer: &mut [u8], + mut current_index: usize, +) -> Result { + while str::from_utf8(&buffer[..current_index]).is_err() { + if current_index >= MAX_UTF8_SEQUENCE_SIZE + || reader.read(&mut buffer[current_index..current_index + 1])? == 0 + { + // We already have 4 bytes in the buffer (maximum UTF-8 sequence size), or the stream + // has been exhausted without finding a valid UTF-8 sequence. + return Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)); + } + + current_index += 1; + } + + Ok(current_index) +} + +/// Returns the index of the first invalid UTF-8 sequence +/// in the given bytes. If the sequence is valid, returns the +/// length of the bytes. +fn utf8_up_to(bytes: &[u8]) -> usize { + match str::from_utf8(bytes) { + Ok(_) => bytes.len(), + Err(e) => e.valid_up_to(), + } +} + +/// Given a byte slice, determines if the slice ends in what might be an incomplete UTF-8 sequence, +/// returning the incomplete sequence if so. +/// +/// The following return values are possible: +/// - Ok([]) if the byte slice is valid UTF-8, in its entirety. +/// - Ok(incomplete_sequence) if the byte slice is valid up to the `incomplete_sequence` slice +/// returned by this function. `incomplete_sequence` is always a slice of at most 3 bytes +/// occurring at the end of the input slice. In this case, it might be possible to make the +/// input slice a valid UTF-8 sequence by appending bytes to the input slice. +/// - Err(e) if the first UTF-8 violation in the input slice occurs more than 3 bytes from the +/// end of the input slice. In this case, it is definitely not possible to make the sequence +/// valid by appending bytes to the input slice. +fn ending_incomplete_utf8_sequence(bytes: &[u8]) -> Result<&[u8]> { + let valid_up_to = utf8_up_to(bytes); + let invalid_portion = &bytes[valid_up_to..]; + + if invalid_portion.len() >= MAX_UTF8_SEQUENCE_SIZE { + Err(Error::new(ErrorKind::InvalidData, UTF8ReaderError)) + } else { + Ok(invalid_portion) + } +} + +/// Copies as many elements as possible (i.e. the smaller of the two slice lengths) from the +/// beginning of the source slice to the beginning of the destination slice, overwriting anything +/// already in the destination slice. +/// +/// Returns the number of elements copied. +fn slice_copy(dst: &mut [T], src: &[T]) -> usize +where + T: Copy, +{ + let elements_to_copy = cmp::min(dst.len(), src.len()); + dst[..elements_to_copy].copy_from_slice(&src[..elements_to_copy]); + + elements_to_copy +} + +#[cfg(test)] +mod tests { + use super::*; + + use proptest::prelude::*; + + use core::str; + use std::io::Cursor; + + #[test] + fn test_read_empty() { + let mut empty_reader = Utf8Reader::new(Cursor::new(b"")); + + let mut buf = vec![]; + empty_reader + .read_to_end(&mut buf) + .expect("read_to_end errored"); + + assert_eq!(buf, b""); + } + + #[test] + fn test_read_ascii_simple() { + let mut reader = Utf8Reader::new(Cursor::new(b"Hello, world!")); + + let mut buf = vec![]; + reader.read_to_end(&mut buf).expect("read_to_end errored"); + + assert_eq!(buf, b"Hello, world!"); + } + + #[test] + fn test_read_utf8_simple() { + const HELLO_WORLD: &str = "你好世界!"; + let mut reader = Utf8Reader::new(Cursor::new(HELLO_WORLD.as_bytes())); + + let mut buf = vec![]; + reader.read_to_end(&mut buf).expect("read_to_end errored"); + + assert_eq!(buf, HELLO_WORLD.as_bytes()); + } + + #[test] + fn small_reads_splitting_sequence() { + let mut reader = Utf8Reader::new(Cursor::new("🙂".as_bytes())); + + let mut buf = [0; MAX_UTF8_SEQUENCE_SIZE]; + + for i in 0..MAX_UTF8_SEQUENCE_SIZE { + // Read at most one byte at a time. + let bytes_read = reader.read(&mut buf[i..i + 1]).expect("read errored"); + assert_eq!(bytes_read, 1, "bytes read"); + } + + assert_eq!(&buf[..], "🙂".as_bytes()); + } + + #[test] + fn invalid_utf8_sequence() { + let mut reader = Utf8Reader::new(Cursor::new([0b11111111])); + + let mut buf = [0; 1]; + reader.read(&mut buf).expect_err("read should have errored"); + } + + #[test] + fn invalid_utf8_sequence_at_end_of_reader() { + let mut read_buffer = Vec::from(b"Hello, world!"); + + // Cutting off the last byte will invalidate the UTF-8 sequence. + let invalid_sequence = &"🙂".as_bytes()[..'🙂'.len_utf8() - 1]; + read_buffer.extend(invalid_sequence); + + let mut reader = Utf8Reader::new(Cursor::new(&read_buffer)); + reader + .read_to_end(&mut vec![]) + .expect_err("read should have errored"); + } + + proptest! { + #[test] + fn read_arbitrary_string(s in any::()) { + let mut buf = vec![]; + let mut reader = Utf8Reader::new(Cursor::new(&s)); + + reader.read_to_end(&mut buf).unwrap(); + assert_eq!(str::from_utf8(&buf).unwrap(), s); + } + + #[test] + fn dont_read_arbitrary_nonstring(s in any::>()) { + prop_assume!(str::from_utf8(&s).is_err()); + let mut buf = vec![]; + let mut reader = Utf8Reader::new(Cursor::new(&s)); + + reader.read_to_end(&mut buf).unwrap_err(); + } + } +}