From 3796ed91bb5ce5e8900027ecc18181e82e86d826 Mon Sep 17 00:00:00 2001 From: Theia Vogel Date: Fri, 22 Jul 2022 15:08:30 -0700 Subject: [PATCH 1/2] update fasta parsing to be configurable, and add tests for new settings also removes the distinction between a DnaFastaParser and a "regular" SimpleFastaParser -- everything is "FastaParser" now, with a phantom generic FromStr type that will be parsed (String has an infalliable FromStr impl) So SimpleFastaParser is now FastaParser, and DnaFastaParser is now FastaParser. ProteinFastaParser didn't exist before, but it's now FastaParser. A hypothetical integer fasta format could be FastaParser, if you wanted that for some weird reason. Fasta parsing is now state-machine-driven, which should make a non-allocating API easier to add in the future, and also simplified the parsing code since it was getting hairy handling the new special cases in the old loop. --- src/fasta.rs | 945 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 834 insertions(+), 111 deletions(-) diff --git a/src/fasta.rs b/src/fasta.rs index 3ac8f69..d01198f 100644 --- a/src/fasta.rs +++ b/src/fasta.rs @@ -1,78 +1,458 @@ //! This module is for reading and writing FASTA format files -use std::io::{BufRead, BufReader, Read}; +use std::io::{self, BufRead}; use std::str::FromStr; use thiserror::Error; -use crate::{DnaSequence, TranslationError}; +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FastaRecord { + /// The header of this record, without the leading character (usually '>') + /// Depending on the content and parser settings, this header may be empty, and may contain newlines. + pub header: String, + /// The contents of this record + pub contents: T, + /// The starting and ending line numbers of this record, start inclusive, end exclusive, 1-indexed. + /// The record header is included in this range. + pub line_range: (usize, usize), +} -/// Note that the parser is buffered automatically, so you should not wrap handle in a buffered reader like io::BufReader. -pub struct SimpleFastaParser; +impl FastaRecord { + /// Try to parse a string-containing record into a type that impls [`core::str::FromStr`]. + pub fn parse(self) -> Result, T::Err> { + let Self { + header, + contents, + line_range, + } = self; + let contents = contents.parse()?; + Ok(FastaRecord { + header, + contents, + line_range, + }) + } +} -impl SimpleFastaParser { - pub fn parse(handle: R) -> Result, std::io::Error> { - let mut result: Vec<(String, String)> = vec![]; +/// Settings for a fasta parser. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FastaParseSettings { + /// If this flag is true, then successive headers in a FASTA file will be concatenated, instead of + /// generating empty records. + /// + /// ```rust + /// use quickdna::{FastaParser, FastaParseSettings, FastaRecord}; + /// + /// let nonconcat = FastaParser::::new( + /// FastaParseSettings::new().concatenate_headers(false) + /// ).parse_str(">a\n>b\n...").unwrap(); + /// assert_eq!( + /// nonconcat, + /// vec![ + /// FastaRecord { + /// header: "a".to_string(), + /// contents: "".to_string(), + /// line_range: (1, 2), + /// }, + /// FastaRecord { + /// header: "b".to_string(), + /// contents: "...".to_string(), + /// line_range: (2, 4), + /// } + /// ], + /// ); + /// + /// let concat = FastaParser::::new( + /// FastaParseSettings::new().concatenate_headers(true) + /// ).parse_str(">a\n>b\n...").unwrap(); + /// assert_eq!( + /// concat, + /// vec![ + /// FastaRecord { + /// header: "a\nb".to_string(), + /// contents: "...".to_string(), + /// line_range: (1, 4), + /// } + /// ], + /// ); + /// ``` + concatenate_headers: bool, + + /// If this flag is true, content before the first header will be treated as a file comment and ignored. Otherwise, + /// it will be parsed as a sequence with an empty header. + /// + /// ```rust + /// use quickdna::{FastaParser, FastaParseSettings, FastaRecord}; + /// + /// let comment = FastaParser::::new( + /// FastaParseSettings::new().allow_preceding_comment(true) + /// ).parse_str("comment\n>a\nsequence").unwrap(); + /// assert_eq!( + /// comment, + /// vec![ + /// FastaRecord { + /// header: "a".to_string(), + /// contents: "sequence".to_string(), + /// line_range: (2, 4), + /// } + /// ] + /// ); + /// + /// let comment = FastaParser::::new( + /// FastaParseSettings::new().allow_preceding_comment(false) + /// ).parse_str("comment\n>a\nsequence").unwrap(); + /// assert_eq!( + /// comment, + /// vec![ + /// FastaRecord { + /// header: "".to_string(), + /// contents: "comment".to_string(), + /// line_range: (1, 2), + /// }, + /// FastaRecord { + /// header: "a".to_string(), + /// contents: "sequence".to_string(), + /// line_range: (2, 4), + /// } + /// ] + /// ); + /// ``` + allow_preceding_comment: bool, +} - let mut lines: Vec = vec![]; - let mut title: Option = None; +// "Builder-lite" pattern: https://matklad.github.io/2022/05/29/builder-lite.html +impl FastaParseSettings { + /// Initializes settings "strictly" (See [`Self::strict()`]). + pub fn new() -> Self { + Self::strict() + } - let handle = BufReader::new(handle); + /// Initializes settings "strictly": + /// * [`Self::concatenate_headers`]: false + /// * [`Self::allow_preceding_comment`]: false + pub fn strict() -> Self { + Self { + concatenate_headers: false, + allow_preceding_comment: false, + } + } - for line in handle.lines() { - let line = line?; + /// Initializes settings "laxly": + /// * [`Self::concatenate_headers`]: true + /// * [`Self::allow_preceding_comment`]: true + pub fn lax() -> Self { + Self { + concatenate_headers: true, + allow_preceding_comment: true, + } + } + + /// Changes the setting for [`Self::concatenate_headers`] + pub fn concatenate_headers(mut self, concatenate_headers: bool) -> Self { + self.concatenate_headers = concatenate_headers; + self + } + + /// Changes the setting for [`Self::allow_preceding_comment`] + pub fn allow_preceding_comment(mut self, allow_preceding_comment: bool) -> Self { + self.allow_preceding_comment = allow_preceding_comment; + self + } +} + +impl Default for FastaParseSettings { + fn default() -> Self { + Self::new() + } +} - if let Some(ch) = line.chars().next() { - if ch == '>' { - if let Some(existing_title) = title { - result.push((existing_title, lines.join(""))); - lines.clear(); - } +enum ParserState { + StartOfFile { + /// only non-empty if settings.allow_preceding_comment is false + contents: String, + }, + InHeader { + start_line_number: usize, + header: String, + }, + InRecord { + start_line_number: usize, + header: String, + contents: String, + }, +} - let mut c = line.chars(); - c.next(); - title = Some(c.as_str().trim().to_string()); +impl ParserState { + /// Pump the state machine and maybe emit a record, if this line completes a record, along with + /// a new state + fn advance_line( + self, + settings: &FastaParseSettings, + line: &str, + line_number: usize, + ) -> (Self, Option>) { + let new_header = try_parse_header(line); + match (self, new_header) { + // start of file, and we have a header line => start a new record, + // maybe emiting the preceding content as a headerless record depending + // on parse settings + (ParserState::StartOfFile { contents }, Some(new_header)) => { + // don't emit if the settings don't want it, or if the record would just be whitespace + let record = if settings.allow_preceding_comment || contents.trim().is_empty() { + None } else { - match title { - None => { - // There is no title, skip this content! - } - Some(_) => { - lines.push(line.replace(' ', "").replace('\r', "")); - } - } + Some(FastaRecord { + header: "".to_string(), + contents, + line_range: (1, line_number), + }) + }; + + ( + Self::InHeader { + start_line_number: line_number, + header: new_header.to_string(), + }, + record, + ) + } + // start of file, no header line yet => maybe store this line in content, + // if we need to emit it later + (ParserState::StartOfFile { mut contents }, None) => { + if !settings.allow_preceding_comment { + // only bother to keep contents updated if we need to emit it + contents.push_str(line); } + (Self::StartOfFile { contents }, None) } - } - if let Some(t) = title { - result.push((t, lines.join(""))); + // In header, and we have a new header => either concatenate the headers + // or emit an empty record, depending on settings + ( + ParserState::InHeader { + start_line_number, + mut header, + }, + Some(new_header), + ) => { + if settings.concatenate_headers { + header.push('\n'); + header.push_str(new_header); + ( + Self::InHeader { + start_line_number, + header, + }, + None, + ) + } else { + ( + Self::InHeader { + start_line_number: line_number, + header: new_header.to_string(), + }, + Some(FastaRecord { + header, + contents: "".to_string(), + line_range: (start_line_number, line_number), + }), + ) + } + } + // in header and we don't have a new header => start of record content + ( + ParserState::InHeader { + start_line_number, + header, + }, + None, + ) => ( + Self::InRecord { + start_line_number, + header, + contents: line.to_string(), + }, + None, + ), + + // in record and we have a new header => start of a new header + ( + ParserState::InRecord { + start_line_number, + header, + contents, + }, + Some(new_header), + ) => ( + Self::InHeader { + start_line_number: line_number, + header: new_header.to_string(), + }, + Some(FastaRecord { + header, + contents, + line_range: (start_line_number, line_number), + }), + ), + // in record and we don't have a new header => continue record + ( + ParserState::InRecord { + start_line_number, + header, + mut contents, + }, + None, + ) => { + if line.is_empty() { + // don't push an empty line to a record at the end of a file with a trailing newline + // (if this isn't the EOF, the line numbers will continue and pushing an empty line would + // have been a no-op anyways) + ( + Self::InRecord { + start_line_number, + header, + contents, + }, + None, + ) + } else { + contents.push_str(line); + ( + Self::InRecord { + start_line_number, + header, + contents, + }, + None, + ) + } + } } + } - Ok(result) + /// At the end of the file, we want to pump the state machine one more time to maybe + /// spit out a trailing record. We don't return a new state because after EOF we don't care. + /// + /// `eof_line_number` here should be 1+ the last line number in the file, so that if something + /// e.g., spans a 5-line file, the end of its range can be 6 because the end is exclusive. + fn advance_eof( + self, + settings: &FastaParseSettings, + eof_line_number: usize, + ) -> Option> { + match self { + // start of file => maybe emit contents as a headerless record, depending on settings + ParserState::StartOfFile { contents } => { + // again, don't emit if settings don't want it or the record would just be whitespace + if settings.allow_preceding_comment || contents.trim().is_empty() { + None + } else { + Some(FastaRecord { + header: "".to_string(), + contents, + line_range: (1, eof_line_number), + }) + } + } + + // in header => emit the header as empty record + ParserState::InHeader { + start_line_number, + header, + } => Some(FastaRecord { + header, + contents: "".to_string(), + line_range: (start_line_number, eof_line_number), + }), + + // in record => emit as final record + ParserState::InRecord { + start_line_number, + header, + contents, + } => Some(FastaRecord { + header, + contents, + line_range: (start_line_number, eof_line_number), + }), + } } } -/// Note that the parser is buffered automatically, so you should not wrap handle in a buffered reader like io::BufReader. -pub struct DnaFastaParser; +#[derive(Debug, Error)] +pub enum FastaParseError { + #[error("error reading from reader: {0}")] + IOError(#[from] io::Error), + #[error("error parsing record: {0}")] + ParseError(#[source] ParseError), // can't use #[from] due to generic impl clash +} -#[derive(Error, Debug)] -pub enum DnaFastaParseError { - #[error("could not translate FASTA contents")] - TranslationError(#[from] TranslationError), - #[error("could not read FASTA")] - IOError(#[from] std::io::Error), +pub struct FastaParser { + settings: FastaParseSettings, + _marker: std::marker::PhantomData, } -impl DnaFastaParser { - pub fn parse(handle: R) -> Result, DnaFastaParseError> { - SimpleFastaParser::parse(handle)? - .into_iter() - .map(|(title, sequence)| -> Result<(String, DnaSequence), _> { - let dna = DnaSequence::from_str(&*sequence)?; - Ok((title, dna)) - }) - .collect() +impl FastaParser { + /// Construct a new FastaParser with the given [`FastaParseSettings`] + pub fn new(settings: FastaParseSettings) -> Self { + Self { + settings, + _marker: Default::default(), + } + } + + /// Construct a new FastaParser with strict settings (see [`FastaParseSettings::strict()`]) + pub fn strict() -> Self { + Self::new(FastaParseSettings::strict()) + } + + /// Construct a new FastaParser with lax settings (see [`FastaParseSettings::lax()`]) + pub fn lax() -> Self { + Self::new(FastaParseSettings::lax()) + } + + pub fn parse( + &self, + handle: R, + ) -> Result>, FastaParseError> { + let mut result: Vec> = vec![]; + let mut state = ParserState::StartOfFile { + contents: "".to_string(), + }; + + let mut line_number = 0; + for (idx, line) in handle.lines().enumerate() { + let line = line?; + line_number = idx + 1; + + let (new_state, record) = state.advance_line(&self.settings, &line, line_number); + state = new_state; + if let Some(record) = record { + result.push(record.parse().map_err(FastaParseError::ParseError)?); + } + } + + if let Some(record) = state.advance_eof(&self.settings, line_number + 1) { + result.push(record.parse().map_err(FastaParseError::ParseError)?); + } + + Ok(result) + } + + pub fn parse_str(&self, s: &str) -> Result>, FastaParseError> { + self.parse(s.as_bytes()) + } +} + +/// Try to parse a FASTA header (prefixed with > or ;), returning the line without the prefix char, and with any trailing +/// newlines removed +fn try_parse_header(line: &str) -> Option<&str> { + let head = line.chars().next(); + // ; is semi-obsolete alternative header char + if head == Some('>') || head == Some(';') { + Some(line[1..].trim_end_matches('\n')) // we know it's an ASCII char so this slice is panic-safe + } else { + None } } @@ -80,130 +460,473 @@ impl DnaFastaParser { mod tests { use super::*; + use crate::{DnaSequence, ProteinSequence, TranslationError}; + + macro_rules! assert_parse { + ($testcase:expr, $parser:expr, $expected:expr) => { + assert_eq!( + $parser.parse_str($testcase).unwrap(), + $expected, + "settings = {:?}", + $parser.settings + ); + }; + } + + macro_rules! assert_parse_err { + ($testcase:expr, $parser:expr, $match:pat) => { + // we use matches! since io::Error doesn't have a PartialEq impl + assert!( + matches!($parser.parse_str($testcase).unwrap_err(), $match), + "settings = {:?}", + $parser.settings + ) + }; + } + + /// Assert a parse matches the expected value with any combination of parser settings + fn assert_parse_with_all_settings(s: &str, expected: Vec>) { + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers: true, + allow_preceding_comment: true + }), + expected + ); + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers: true, + allow_preceding_comment: false + }), + expected + ); + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers: false, + allow_preceding_comment: true + }), + expected + ); + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers: false, + allow_preceding_comment: false + }), + expected + ); + } + + /// Assert a parse matches the expected value with the given value of concatenate_headers, + /// and all values for other settings + fn assert_parse_with_concatenate_headers( + s: &str, + concatenate_headers: bool, + expected: Vec>, + ) { + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers, + allow_preceding_comment: true + }), + expected + ); + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers, + allow_preceding_comment: false + }), + expected + ); + } + + /// Assert a parse matches the expected value with the given value of allow_preceding_comment, + /// and all values for other settings + fn assert_parse_with_allow_preceding_comment( + s: &str, + allow_preceding_comment: bool, + expected: Vec>, + ) { + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers: true, + allow_preceding_comment + }), + expected + ); + assert_parse!( + s, + FastaParser::::new(FastaParseSettings { + concatenate_headers: false, + allow_preceding_comment + }), + expected + ); + } + #[test] fn test_empty_fasta() { - let r = SimpleFastaParser::parse("".as_bytes()).unwrap(); - assert_eq!(r.len(), 0); + assert_parse_with_all_settings("", vec![]); } #[test] fn test_empty_fasta_with_newlines() { - let r = SimpleFastaParser::parse("\n\n".as_bytes()).unwrap(); - assert_eq!(r.len(), 0); + assert_parse_with_all_settings("\n\n", vec![]); } #[test] fn test_empty_fasta_with_many_newlines() { - let r = SimpleFastaParser::parse(" \n\n \n \r\r \r\n \r \n".as_bytes()).unwrap(); - assert_eq!(r.len(), 0); + assert_parse_with_all_settings(" \n\n \n \r\r \r\n \r \n", vec![]); } #[test] fn test_fasta_with_no_content() { - let r = SimpleFastaParser::parse(">Virus\n".as_bytes()).unwrap(); - assert_eq!(r.len(), 1); - assert_eq!(r[0], ("Virus".to_string(), "".to_string())); + assert_parse_with_all_settings( + ">Virus\n", + vec![FastaRecord { + header: "Virus".to_string(), + contents: "".to_string(), + line_range: (1, 2), + }], + ); } #[test] fn test_fasta_with_empty_content() { - let r = SimpleFastaParser::parse(">Virus\n\n".as_bytes()).unwrap(); - assert_eq!(r, vec![("Virus".to_string(), "".to_string())]); + assert_parse_with_all_settings( + ">Virus\n\n", + vec![FastaRecord { + header: "Virus".to_string(), + contents: "".to_string(), + line_range: (1, 3), + }], + ); + } + + #[test] + fn test_comment_fasta_with_allow_preceding_comment() { + assert_parse_with_allow_preceding_comment( + "this is a file comment\n@author is foo", + true, + vec![], + ); } #[test] - fn test_fasta_with_stuff_no_header() { - let r = - SimpleFastaParser::parse("// this is a file comment\n@author is foo\n\n".as_bytes()) - .unwrap(); - assert_eq!(r.len(), 0); + fn test_comment_fasta_with_no_allow_preceding_comment() { + assert_parse_with_allow_preceding_comment( + "this is a file comment\n@author is foo", + false, + vec![FastaRecord { + header: "".to_string(), + contents: "this is a file comment@author is foo".to_string(), + line_range: (1, 3), + }], + ); } #[test] - fn test_fasta_with_stuff_before_header() { - let r = SimpleFastaParser::parse( - "// this is a file comment\n@author is foo\n\n>Virus\n\n".as_bytes(), - ) - .unwrap(); - assert_eq!(r, vec![("Virus".to_string(), "".to_string())]); + fn test_comment_and_record_fasta_with_allow_preceding_comment() { + assert_parse_with_allow_preceding_comment( + "this is a file comment\n@author is foo\n\n>Virus\n\n", + true, + vec![FastaRecord { + header: "Virus".to_string(), + contents: "".to_string(), + line_range: (4, 6), + }], + ); + } + + #[test] + fn test_comment_and_record_fasta_with_no_allow_preceding_comment() { + assert_parse_with_allow_preceding_comment( + "this is a file comment\n@author is foo\n\n>Virus\n\n", + false, + vec![ + FastaRecord { + header: "".to_string(), + contents: "this is a file comment@author is foo".to_string(), + line_range: (1, 4), + }, + FastaRecord { + header: "Virus".to_string(), + contents: "".to_string(), + line_range: (4, 6), + }, + ], + ); + } + + #[test] + fn test_no_allow_preceding_comment_doesnt_emit_whitespace_record() { + assert_parse_with_all_settings( + " \t\n\r\t \n\n>Virus\n\n", + vec![FastaRecord { + header: "Virus".to_string(), + contents: "".to_string(), + line_range: (4, 6), + }], + ); } #[test] fn test_fasta_with_single_line_content() { - let r = SimpleFastaParser::parse(">Virus\nCAAAGT\n".as_bytes()).unwrap(); - assert_eq!(r, vec![("Virus".to_string(), "CAAAGT".to_string())]); + assert_parse_with_all_settings( + ">Virus\nCAAAGT\n", + vec![FastaRecord { + header: "Virus".to_string(), + contents: "CAAAGT".to_string(), + line_range: (1, 3), + }], + ); + } + + #[test] + fn test_fasta_eof() { + assert_parse_with_all_settings( + ">Virus\nCAAAGT", + vec![FastaRecord { + header: "Virus".to_string(), + contents: "CAAAGT".to_string(), + line_range: (1, 3), + }], + ); } #[test] fn test_fasta_with_multi_line_content() { - let r = SimpleFastaParser::parse(">Virus\nAAAA\nCCCC\nGGGG\n".as_bytes()).unwrap(); - assert_eq!(r, vec![("Virus".to_string(), "AAAACCCCGGGG".to_string())]); + assert_parse_with_all_settings( + ">Virus\nAAAA\nCCCC\nGGGG\n", + vec![FastaRecord { + header: "Virus".to_string(), + contents: "AAAACCCCGGGG".to_string(), + line_range: (1, 5), + }], + ); } #[test] fn test_fasta_mutiple_contents() { - let r = SimpleFastaParser::parse(">Virus1\nAAAA\n>Virus2\nCCCC\n".as_bytes()).unwrap(); - assert_eq!( - r, + assert_parse_with_all_settings( + ">Virus1\nAAAA\n>Virus2\nCCCC\n", vec![ - ("Virus1".to_string(), "AAAA".to_string()), - ("Virus2".to_string(), "CCCC".to_string()) - ] + FastaRecord { + header: "Virus1".to_string(), + contents: "AAAA".to_string(), + line_range: (1, 3), + }, + FastaRecord { + header: "Virus2".to_string(), + contents: "CCCC".to_string(), + line_range: (3, 5), + }, + ], ); } #[test] fn test_fasta_mutiple_contents_multiline() { - let r = SimpleFastaParser::parse(">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n".as_bytes()) - .unwrap(); - assert_eq!( - r, + assert_parse_with_all_settings( + ">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n", vec![ - ("Virus1".to_string(), "AAAAAAAA".to_string()), - ("Virus2".to_string(), "CCCCCCCC".to_string()) - ] + FastaRecord { + header: "Virus1".to_string(), + contents: "AAAAAAAA".to_string(), + line_range: (1, 4), + }, + FastaRecord { + header: "Virus2".to_string(), + contents: "CCCCCCCC".to_string(), + line_range: (4, 7), + }, + ], ); } #[test] - fn test_dna_fasta() { - let r: Vec<(String, DnaSequence)> = - DnaFastaParser::parse(">Virus1\nAAAA".as_bytes()).unwrap(); + fn test_fasta_triple() { + assert_parse_with_all_settings( + ">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n>Virus3\nCCCC\nCCCC\n", + vec![ + FastaRecord { + header: "Virus1".to_string(), + contents: "AAAAAAAA".to_string(), + line_range: (1, 4), + }, + FastaRecord { + header: "Virus2".to_string(), + contents: "CCCCCCCC".to_string(), + line_range: (4, 7), + }, + FastaRecord { + header: "Virus3".to_string(), + contents: "CCCCCCCC".to_string(), + line_range: (7, 10), + }, + ], + ); + } + + #[test] + fn test_fasta_trailing_header_only() { + assert_parse_with_all_settings( + ">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n>Virus3", + vec![ + FastaRecord { + header: "Virus1".to_string(), + contents: "AAAAAAAA".to_string(), + line_range: (1, 4), + }, + FastaRecord { + header: "Virus2".to_string(), + contents: "CCCCCCCC".to_string(), + line_range: (4, 7), + }, + FastaRecord { + header: "Virus3".to_string(), + contents: "".to_string(), + line_range: (7, 8), + }, + ], + ); + } + + #[test] + fn test_fasta_header_concat() { + assert_parse_with_concatenate_headers( + ">a\n>b\ntest", + true, + vec![FastaRecord { + header: "a\nb".to_string(), + contents: "test".to_string(), + line_range: (1, 4), + }], + ); + } + + #[test] + fn test_fasta_header_noconcat() { + assert_parse_with_concatenate_headers( + ">a\n>b\ntest", + false, + vec![ + FastaRecord { + header: "a".to_string(), + contents: "".to_string(), + line_range: (1, 2), + }, + FastaRecord { + header: "b".to_string(), + contents: "test".to_string(), + line_range: (2, 4), + }, + ], + ); + } - assert_eq!( - r, - vec![("Virus1".to_string(), DnaSequence::from_str("AAAA").unwrap())] + #[test] + fn test_fasta_header_concat_no_content() { + assert_parse_with_concatenate_headers( + ">a\n>b", + true, + vec![FastaRecord { + header: "a\nb".to_string(), + contents: "".to_string(), + line_range: (1, 3), + }], + ); + } + + #[test] + fn test_dna_fasta() { + assert_parse!( + ">Virus1\nAAAA", + FastaParser::::strict(), + vec![FastaRecord { + header: "Virus1".to_string(), + contents: "AAAA".parse().unwrap(), + line_range: (1, 3), + }] ); } #[test] fn test_dna_fasta_multiple() { - let r: Vec<(String, DnaSequence)> = - DnaFastaParser::parse(">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n".as_bytes()).unwrap(); - assert_eq!( - r, + assert_parse!( + ">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n", + FastaParser::::strict(), vec![ - ( - "Virus1".to_string(), - DnaSequence::from_str("AAAAAAAA").unwrap() - ), - ( - "Virus2".to_string(), - DnaSequence::from_str("CCCCCCCC").unwrap() - ) + FastaRecord { + header: "Virus1".to_string(), + contents: "AAAAAAAA".parse().unwrap(), + line_range: (1, 4), + }, + FastaRecord { + header: "Virus2".to_string(), + contents: "CCCCCCCC".parse().unwrap(), + line_range: (4, 7), + }, ] ); } #[test] fn test_dna_invalid_dna() { - let r = DnaFastaParser::parse(">Virus1\nAAAAelephant".as_bytes()); - assert!(matches!(r, Err(_))); + assert_parse_err!( + ">Virus1\nAAAelephant", + FastaParser::::strict(), + FastaParseError::ParseError(TranslationError::BadNucleotide('e')) + ); } #[test] fn test_dna_invalid_dna_multiple() { - let r = DnaFastaParser::parse(">Virus1\nAAAA\n>Virus2\nAAAAelephant".as_bytes()); - assert!(matches!(r, Err(_))); + assert_parse_err!( + ">Virus1\nAAAA\n>Virus2\nAAAAelephant", + FastaParser::::strict(), + FastaParseError::ParseError(TranslationError::BadNucleotide('e')) + ); } + + #[test] + fn test_protein_fasta() { + assert_parse!( + ">Virus1\nAAAA", + FastaParser::::strict(), + vec![FastaRecord { + header: "Virus1".to_string(), + contents: "AAAA".parse().unwrap(), + line_range: (1, 3), + }] + ); + } + + #[test] + fn test_protein_fasta_multiple() { + assert_parse!( + ">Virus1\nAAAA\nAAAA\n>Virus2\nCCCC\nCCCC\n", + FastaParser::::strict(), + vec![ + FastaRecord { + header: "Virus1".to_string(), + contents: "AAAAAAAA".parse().unwrap(), + line_range: (1, 4), + }, + FastaRecord { + header: "Virus2".to_string(), + contents: "CCCCCCCC".parse().unwrap(), + line_range: (4, 7), + }, + ] + ); + } + + // TODO: when we add validation for ProteinSequence, add tests for that here } From b1485a11d1aca9ef26b07049044422306ea4a085 Mon Sep 17 00:00:00 2001 From: Theia Vogel Date: Tue, 26 Jul 2022 14:23:58 -0700 Subject: [PATCH 2/2] More tests and remove pointless line-stripping --- src/fasta.rs | 150 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 147 insertions(+), 3 deletions(-) diff --git a/src/fasta.rs b/src/fasta.rs index d01198f..8dd83ab 100644 --- a/src/fasta.rs +++ b/src/fasta.rs @@ -444,13 +444,12 @@ impl FastaParser { } } -/// Try to parse a FASTA header (prefixed with > or ;), returning the line without the prefix char, and with any trailing -/// newlines removed +/// Try to parse a FASTA header (prefixed with > or ;), returning the line without the prefix char. fn try_parse_header(line: &str) -> Option<&str> { let head = line.chars().next(); // ; is semi-obsolete alternative header char if head == Some('>') || head == Some(';') { - Some(line[1..].trim_end_matches('\n')) // we know it's an ASCII char so this slice is panic-safe + Some(&line[1..]) // we know it's an ASCII char so this slice is panic-safe } else { None } @@ -461,6 +460,7 @@ mod tests { use super::*; use crate::{DnaSequence, ProteinSequence, TranslationError}; + use std::time::Duration; macro_rules! assert_parse { ($testcase:expr, $parser:expr, $expected:expr) => { @@ -570,6 +570,22 @@ mod tests { ); } + /// Helper to panic if a closure doesn't complete within a specified Duration. + /// Author: @shepmaster, https://github.com/rust-lang/rfcs/issues/2798#issuecomment-552949300 + fn panic_after T + Send + 'static>(d: Duration, f: F) -> T { + let (done_tx, done_rx) = std::sync::mpsc::channel(); + let handle = std::thread::spawn(move || { + let val = f(); + done_tx.send(()).expect("Unable to send completion signal"); + val + }); + + match done_rx.recv_timeout(d) { + Ok(_) => handle.join().expect("Thread panicked"), + Err(_) => panic!("Thread took too long"), + } + } + #[test] fn test_empty_fasta() { assert_parse_with_all_settings("", vec![]); @@ -844,6 +860,134 @@ mod tests { ); } + #[test] + fn test_fasta_unicode() { + // U+2009 is THIN-SPACE, which is White_Space=yes so should be ignored even if allow_comment is false + // Note that some "spacey" unicode characters, like U+200B "ZERO WIDTH SPACE", are White_Space=no + assert_parse_with_all_settings("\u{2009}\r\n>ὦ Ᾰ̓θηνᾶ, Heizölrückstoßabdämpfungを持つ!\nPchnąć w tę łódź jeża lub ośm skrzyń fig", vec![FastaRecord { + header: "ὦ Ᾰ̓θηνᾶ, Heizölrückstoßabdämpfungを持つ!".to_string(), + contents: "Pchnąć w tę łódź jeża lub ośm skrzyń fig".to_string(), + line_range: (2, 4), + }]) + } + + #[test] + fn test_fasta_ignores_alternative_linebreaks() { + use std::fmt::Write; + + let bad_linebreaks = [ + '\x0B', // LINE TABULATION (\v) + '\x0C', // FORM FEED (\f) + '\x0D', // CARRIAGE RETURN (\r) + '\u{0085}', // NEXT LINE + '\u{2028}', // LINE SEPARATOR + '\u{2029}', // PARAGRAPH SEPARATOR + ]; + let mut test_case = "contents-should-be-single-line:".to_string(); + for lb in bad_linebreaks { + write!( + test_case, + " {}>U+{:x} should not be a header", + lb, lb as u32 + ) + .unwrap(); + } + + assert_parse_with_allow_preceding_comment( + &test_case.clone(), + false, + vec![FastaRecord { + header: "".to_string(), + contents: test_case, + line_range: (1, 2), + }], + ) + } + + #[test] + fn test_fasta_handles_windows_newline() { + // the Rust BufRead trait's `lines()` method should handle \r\n lines by removing the \r as well, + // so this just checks that we're not doing anything to mess that up + + assert_parse_with_all_settings( + "\r\n>i love compatability\r\nwindows is awesome\r\n", + vec![FastaRecord { + header: "i love compatability".to_string(), + contents: "windows is awesome".to_string(), + line_range: (2, 4), + }], + ) + } + + #[test] + fn test_concat_headers_not_quadratic() { + // if this test gets slow, we probably accidentally introduced quadratic behavior somewhere + + let mut test_case = String::new(); + for _ in 0..10_000 { + test_case.push_str(">header\n"); + } + let header = test_case.trim().replace('>', ""); + + panic_after(Duration::from_millis(200), move || { + assert_parse_with_concatenate_headers( + &test_case, + true, + vec![FastaRecord { + header, + contents: "".to_string(), + line_range: (1, 10_001), + }], + ) + }); + } + + #[test] + fn test_comment_not_quadratic() { + // if this test gets slow, we probably accidentally introduced quadratic behavior somewhere + + let mut test_case = String::new(); + for _ in 0..10_000 { + test_case.push_str("comment\n"); + } + let contents = test_case.replace('\n', ""); + + panic_after(Duration::from_millis(200), move || { + assert_parse_with_allow_preceding_comment( + &test_case, + false, + vec![FastaRecord { + header: "".to_string(), + contents, + line_range: (1, 10_001), + }], + ) + }); + } + + #[test] + fn test_contents_not_quadratic() { + // if this test gets slow, we probably accidentally introduced quadratic behavior somewhere + + let mut contents = String::new(); + for _ in 0..10_000 { + contents.push_str("contents\n"); + } + let test_case = format!(">header\n{contents}"); + let contents = contents.replace('\n', ""); + + panic_after(Duration::from_millis(200), move || { + assert_parse_with_all_settings( + &test_case, + vec![FastaRecord { + header: "header".to_string(), + contents, + line_range: (1, 10_002), + }], + ) + }); + } + #[test] fn test_dna_fasta() { assert_parse!(