diff --git a/fuzzy/src/debug_output.rs b/fuzzy/src/debug_output.rs deleted file mode 100644 index c7f6047..0000000 --- a/fuzzy/src/debug_output.rs +++ /dev/null @@ -1,20 +0,0 @@ -//! Provides an implementation of [`Output`] suitable for development. - -use crate::{Match, Output, Step}; -use std::fmt; - -pub struct DebugOutput { - output: String, -} - -impl Output for DebugOutput { - fn new(score: &usize, trace: &Vec>) -> Self { - Self { output: format!("score: {}\ntrace: {:#?}", *score, *trace) } - } -} - -impl fmt::Display for DebugOutput { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&self.output) - } -} diff --git a/fuzzy/src/diff_output.rs b/fuzzy/src/diff_output.rs index 37cf07c..ff69513 100644 --- a/fuzzy/src/diff_output.rs +++ b/fuzzy/src/diff_output.rs @@ -1,10 +1,8 @@ -//! Provides an implementation of [`Output`] that mimics git's character-level diff. - -use crate::{Output, Match, Step}; +use crate::{Match, Step}; use std::fmt; // NOTE: because we do character by character diffs, this won't be the real diff format -// for now. Instead, we will mimic the git diff format, expect we print out all matching +// for now. Instead, we will mimic the git diff format, except we print out all matching // lines and don't print any line numbers. // // The wording in these structs treat the patttern as the original, and text as new. So @@ -14,6 +12,8 @@ use std::fmt; // TODO make this configurable const ANY: char = '?'; +/// A quick display of the final trace, similar to the git character-level diff format. +/// Convenient for small texts. 
pub struct DiffOutput { pub chunks: Vec, } @@ -48,8 +48,8 @@ pub struct Same { pub text: Vec } #[derive(Eq, PartialEq, Debug)] pub struct Diff { pub taken: Vec, pub added: Vec } -impl Output for DiffOutput { - fn new(_score: &usize, trace: &Vec>) -> Self { +impl DiffOutput { + pub fn new(_score: &usize, trace: &Vec>) -> Self { let mut chunks = vec![]; for step in trace.iter() { let current_chunk = chunks.last_mut(); diff --git a/fuzzy/src/lattice_solution.rs b/fuzzy/src/lattice_solution.rs deleted file mode 100644 index 842012e..0000000 --- a/fuzzy/src/lattice_solution.rs +++ /dev/null @@ -1,451 +0,0 @@ -//! Provides a sub-trait of [`Solution`] with a generic [`Solution::solve`] implementation. - -use crate::{ElementCore, Match, Problem, Solution, Step}; -use crate::flat_pattern::Flat; -use crate::error::Error; -use nonempty::{NonEmpty, nonempty}; -use std::fmt::Debug; - -/// A family of [`Solution`] implementations which record state in a lattice of nodes. -/// -/// For now, this trait hardcodes the use of a flattened pattern which is easier to index, hardcodes -/// the node structure, and also the algorithm used to traverse nodes and update their state. -/// The way in which individual implementations store and index nodes is configurable. In the -/// future, as we add more features, we may make other parts of the implementation configurable. -/// -/// [`LatticeSolution`] implementations get [`Solution::solve`] defined automatically. Instead, -/// implementations are required to specify a mutable [`State`](LatticeSolution::State) space -/// and an [`Ix`](LatticeSolution::Ix) type which addresses it. -/// -/// Each index links to child indices which represent the next possible steps we can take to match -/// the pattern to the text (e.g. match a character, skip a character from the text or pattern, -/// etc.). 
There is a defined [`start`](LatticeConfig::start) index, when no progress has been made, -/// and an [`end`](LatticeConfig::end) index, when both the entire pattern and text have been matched. -/// Implementation must ensure that [`can_restart`](LatticeIx::can_restart) is implemented -/// correctly, so that these links never form a loop. These links form a -/// [lattice](https://en.wikipedia.org/wiki/Lattice_(order)). -pub trait LatticeSolution : Sized + Solution { - /// Carries immutable information derived from the [`Problem`](crate::Problem) being solved. - type Conf: LatticeConfig; - /// Mutable state being updated while solving. - type State: LatticeState; - /// The type used to index into [`State`](LatticeSolution::State) and - /// [`Conf`](LatticeSolution::Conf). - type Ix: LatticeIx; - - fn new(score: usize, trace: Vec>) -> Self; - - fn score_lattice(&self) -> &usize; - fn trace_lattice(&self) -> &Vec>; - - /// [`Solution::solve`] implementation. - fn solve_lattice(problem: &Problem) -> Result { - let conf = Self::Conf::new(problem); - let mut state = Self::State::new(&conf); - - let start_ix = conf.start(); - let end_ix = conf.end(); - - let _ = Self::calculate_optimal_path(&conf, &mut state)?; - - let start_node = state.get(start_ix); - let score = start_node.done_info() - .map(|i| i.0) - .map_err(|_| Error::IncompleteFinalState)?; - - let mut trace = vec![]; - let mut from = start_ix; - loop { - let node = state.get(from); - if !node.is_done() || from == end_ix { break; } - let (patt, text) = conf.get(from); - let (_, step_type, next) = node.done_info()?; - if let Some(step) = step_type.step() { - let final_step = step.map( - |_| match patt { - Some(Flat::Lit(c)) => Match::Lit(*c), - Some(Flat::Class(c)) => Match::Class(c.clone()), - unexpected => panic!("Unexpected trace pattern {:?}", unexpected), - }, - |_| match text { - Some(c) => *c, - unexpected => panic!("Unexpected trace text {:?}", unexpected), - } - ); - trace.push(final_step); - } - from = 
next; - } - if from != end_ix { - return Err(Error::IncompleteFinalState); - } - - Ok(LatticeSolution::new(score, trace)) - } - - /// Update [`State`](LatticeSolution::State) with the optimal steps from the start - /// [`Ix`](LatticeSolution::Ix) onwards. - fn calculate_optimal_path( - conf: &Self::Conf, - state: &mut Self::State, - ) -> Result<(), Error> { - let start_ix = conf.start(); - let end_ix = conf.end(); - - let mut loop_state = LoopState::Down(Down { - parent: Default::default(), - current: start_ix, - }); - - let mut loop_counter = 0; - - loop { - loop_counter += 1; - if loop_counter >= 1000000000 { // TODO make this max configurable - return Err(Error::ExceededMaxSteps(loop_counter)); - } - let new_parent = match &loop_state { - LoopState::Down(down) if state.get(down.current).is_ready() => { - let (flat, text) = conf.get(down.current); - let opt_node_type = NodeType::get(flat, text, &down.current); - let node_state = state.get_mut(down.current); - node_state.initialise(end_ix, down.parent, down.current, opt_node_type)?; - down.parent - } - LoopState::Down(down) => down.parent, - LoopState::Back(back) => { - let new_child = back.child; - let (new_score, _, _) = state.get(new_child).done_info()?; - let node_state = state.get_mut(back.current); - let new_parent = node_state.update(new_child, back.current, new_score)?; - new_parent - } - }; - - let current_ix = loop_state.current(); - let final_state = state.get(current_ix); - if current_ix == start_ix && final_state.is_done() { - break; - } else if final_state.is_done() { - loop_state = LoopState::Back(Back { - current: new_parent, - child: current_ix, - }); - } else if final_state.is_working() { - let current_step_type = final_state.current_step_type()?; - let child = conf.step(current_ix, current_step_type); - loop_state = LoopState::Down(Down { - parent: current_ix, - current: child, - }); - } else { - return Err(Error::NoNodeProgress(format!("{:?}", current_ix))); - } - } - - Ok(()) - } -} - 
-#[derive(Debug)] -enum LoopState { - Down(Down), - Back(Back), -} - -impl LoopState { - fn current(&self) -> Ix { - match self { - LoopState::Down(down) => down.current, - LoopState::Back(back) => back.current, - } - } -} - -#[derive(Debug)] -struct Down { - parent: Ix, - current: Ix, -} - -#[derive(Debug)] -struct Back { - current: Ix, - child: Ix, -} - -impl Solution for Sln where - Sln: LatticeSolution, -{ - fn score(&self) -> &usize { - LatticeSolution::score_lattice(self) - } - - fn trace(&self) -> &Vec> { - LatticeSolution::trace_lattice(self) - } - - fn solve(problem: &Problem) -> Result { - LatticeSolution::solve_lattice(&problem) - } -} - -pub trait LatticeConfig { - fn new(problem: &Problem) -> Self; - fn get(&self, ix: Ix) -> (Option<&Flat>, Option<&char>); - - fn start(&self) -> Ix; - fn end(&self) -> Ix; - - fn step(&self, ix: Ix, step_type: StepType) -> Ix; -} - -pub trait LatticeState { - fn new(conf: &Conf) -> Self; - fn get(&self, ix: Ix) -> &Node; - fn get_mut(&mut self, ix: Ix) -> &mut Node; - fn set(&mut self, ix: Ix, node: Node); -} - -// TODO Ix turns out to be a sizable struct, remove Copy and pass by reference where possible -pub trait LatticeIx : Eq + PartialEq + Copy + Clone + Debug + Sized + Default { - fn can_restart(&self) -> bool; -} - -// TODO make a better Node type -// -// Calculate_optimal_path (originally called solve_ix) used to store a lot of state on the stack: -// the parent node, our progress through the possible step types, the optimal score, etc. The -// node was a simple enum which was either Ready, Working, or Done. Only the Done value had any -// fields, and it was never mutated. -// -// Once we began to run out of stack space for mid-sized use-cases, we transferred all of that -// state into the heap by adding it to this Node struct. Much of this information is mutated as we -// try out each possible step type. -// -// I had a lot of trouble implementing this expanded node. 
Solve loops over my table of node -// values, taking a mutable reference to a single node in each iteration. My code originally -// pattern matched on the Node enum, and called methods on inner types which could only be accessed -// when node had the right case. But I struggled to do this and satisfy rust's borrow checker. -// -// For now, I've abandonded pattern matching and type safety, and implemented rust as an abstract -// data type. The node still has three states: Ready, Working, and Done, but they aren't reflected -// in rust's type system. Instead, Node methods return errors if they are called when the node is -// in the wrong state. -// -// The three states are a bit implicit in the Node structure. They are driven by current. Current -// changes from 0..=step_types.len()+1 over the life of the Node: -// -// 1. A node is Ready if current == 0 -// 2. A node is Working if 1 >= current >= step_types.len() -// 3. A node is Done if current == step_types.len() + 1 -// -// When a node is working, the current step type being attempted is step_types[current-1]. -// -// When a node has processed at least one node (current >= 2), score/step_type/next record the -// optimal choice among step_types[0..current-1]. This means those fields are optimal when a Node -// is Done. -// -// I'd like to return to this Node when I'm more comfortable working with rust, and do a better job -// implementing it. 
- -#[derive(Clone, Eq, PartialEq, Debug)] -pub struct Node { - current: usize, - parent: Ix, - score: usize, - step_type: StepType, - next: Ix, - step_types: Vec, -} - -impl Node { - pub fn new() -> Self { - Self { - current: 0, - parent: Default::default(), - score: 0, - step_type: StepType::Hit, - next: Default::default(), - step_types: vec![], - } - } - - fn is_ready(&self) -> bool { - self.current == 0 - } - - fn is_working(&self) -> bool { - self.current > 0 && self.current <= self.step_types.len() - } - - fn is_done(&self) -> bool { - self.current > self.step_types.len() - } - - fn current_step_type(&self) -> Result { - if self.is_working() { - Ok(self.step_types[self.current - 1]) - } else { - Err(Error::CannotGetNodeField("current_step_type", "working")) - } - } - - fn done_info(&self) -> Result<(usize, StepType, Ix), Error> { - if self.is_done() { - Ok((self.score, self.step_type, self.next)) - } else { - Err(Error::CannotGetNodeField("score/step_type/next", "done")) - } - } - - fn initialise(&mut self, end_ix: Ix, parent_ix: Ix, ix: Ix, opt_node_type: Option) -> Result<(), Error>{ - if self.is_ready() { - match opt_node_type { - Some(node_type) => { - let step_types = Vec::from(node_type.step_types()); - self.parent = parent_ix; - self.current += 1; - self.step_types = step_types; - Ok(()) - } - None if ix == end_ix => { // end_ix: insert dummy done value - self.parent = parent_ix; - self.current += 1; - Ok(()) - } - None => { - Err(Error::NoNodeType(format!("{:?}", ix))) - } - } - } else { - Err(Error::CannotInitialiseNode(format!("{:?}", ix))) - } - } - - fn update(&mut self, new_child: Ix, ix: Ix, new_score: usize) -> Result { - if self.is_working() { - let parent_ix = self.parent; - let current_step_type = self.current_step_type()?; - let new_score = new_score + current_step_type.cost(); - if self.current <= 1 || new_score < self.score { - self.step_type = current_step_type; - self.score = new_score; - self.next = new_child; - self.current += 1; - } 
else { - self.current += 1; - } - Ok(parent_ix) - } else { - Err(Error::CannotUpdateNode(format!("{:?}", ix))) - } - } -} - -#[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] -pub enum NodeType { - FinishedPattern, - FinishedText, - Hit, - NoHit, - StartGroup, - EndGroup, - AlternativeLeft(usize), - AlternativeRight(usize), - RepetitionStart(usize), - RepetitionRestart(usize), - RepetitionEnd, -} - -impl NodeType { - fn get>(opt_flat: Option<&Flat>, opt_text: Option<&char>, ix: &Ix) -> Option { - // TODO this is surprisingly hard to follow for something conceptually simple. Can I make it nicer? - match opt_flat { - None if opt_text == None => None, - None => Some(NodeType::FinishedPattern), - Some(flat) => Some(match flat { - Flat::Lit(c) if opt_text == Some(c) => NodeType::Hit, - Flat::Lit(_) if opt_text == None => NodeType::FinishedText, - Flat::Lit(_) => NodeType::NoHit, - Flat::Class(class) if opt_text.map_or(false, |t| class.matches(*t)) => NodeType::Hit, - Flat::Class(_) if opt_text == None => NodeType::FinishedText, - Flat::Class(_) => NodeType::NoHit, - Flat::GroupStart => NodeType::StartGroup, - Flat::GroupEnd => NodeType::EndGroup, - Flat::AlternativeLeft(off) => NodeType::AlternativeLeft(*off), - Flat::AlternativeRight(off) => NodeType::AlternativeRight(*off), - Flat::RepetitionStart(off) => NodeType::RepetitionStart(*off), - Flat::RepetitionEnd(off) if ix.can_restart() => NodeType::RepetitionRestart(*off), - Flat::RepetitionEnd(_) => NodeType::RepetitionEnd, - }) - } - } - - fn step_types(&self) -> NonEmpty { - use StepType::*; - match self { - Self::FinishedPattern => nonempty![SkipText], - Self::FinishedText => nonempty![SkipPattern], - Self::Hit => nonempty![Hit, SkipPattern, SkipText], - Self::NoHit => nonempty![SkipPattern, SkipText], - Self::StartGroup => nonempty![StartGroup], - Self::EndGroup => nonempty![EndGroup], - Self::AlternativeLeft(off) => nonempty![StartLeft, StartRight(*off)], - Self::AlternativeRight(off) => 
nonempty![PassRight(*off)], - Self::RepetitionStart(off) => nonempty![StartRepetition, PassRepetition(*off)], - Self::RepetitionRestart(off) => nonempty![RestartRepetition(*off)], - Self::RepetitionEnd => nonempty![EndRepetition], - } - } -} - -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum StepType { - SkipText, - SkipPattern, - Hit, - StartGroup, - EndGroup, - StartLeft, - StartRight(usize), - PassRight(usize), - StartRepetition, - PassRepetition(usize), - EndRepetition, - RestartRepetition(usize), -} - -impl StepType { - fn cost(&self) -> usize { - match self { - Self::SkipPattern => 1, - Self::SkipText => 1, - _ => 0, - } - } - - fn step(&self) -> Option> { - match self { - Self::Hit => Some(Step::Hit((), ())), - Self::SkipPattern => Some(Step::SkipPattern(())), - Self::SkipText => Some(Step::SkipText(())), - Self::StartGroup => Some(Step::StartCapture), - Self::EndGroup => Some(Step::StopCapture), - _ => None, - } - } -} - -#[cfg(test)] -pub mod test_logic { - use super::*; - use crate::test_cases::TestCase; - - pub fn test_solve(test_case: TestCase) { - let desugared = test_case.problem.desugar(); - let actual = Sln::solve(&desugared).unwrap(); - assert_eq!(test_case.score, *actual.score()); - assert_eq!(test_case.trace, *actual.trace()); - } -} diff --git a/fuzzy/src/lib.rs b/fuzzy/src/lib.rs index 93fc358..db5b680 100644 --- a/fuzzy/src/lib.rs +++ b/fuzzy/src/lib.rs @@ -4,113 +4,33 @@ //! //! In lieu of better documentation, see the project README for more discussion about the regex //! features we support and how well the "closest match" works in practice. -//! -//! This crate is very early in it's development, it's API is akward, and will likely be changed in -//! breaking ways several times before it matures. We don't currently implement any convenience -//! functions which match a pattern against a text in one call. Instead, the crate provides -//! implementations of the following three traits, which can be combined to do the match: -//! -//! 
- a [`Question`] produces a [`Problem`] to be solved. -//! - a [`Solution`] calculates the optimal match and provides the corresponding -//! [`score`](Solution::score) and [`trace`](Solution::trace). -//! - an [`Output`] displays [`Problem`] and [`Solution`] info to the user. -//! -//! Implementations can be combined as follows: -//! -//! ```rust -//! use fuzzy::{Question, Solution, Output}; -//! use fuzzy::regex_question::RegexQuestion; -//! use fuzzy::table_solution::TableSolution; -//! use fuzzy::diff_output::DiffOutput; -//! use fuzzy::error::Error; -//! -//! fn fuzzy_match(pattern_regex: String, text: String) -> Result<(), Error> { -//! let question = RegexQuestion { pattern_regex, text }; -//! let problem = question.ask()?; -//! let problem_core = problem.desugar(); -//! let solution = TableSolution::solve(&problem_core)?; -//! let output = DiffOutput::new(&solution.score(), &solution.trace()); -//! println!("{}", output); -//! Ok(()) -//! } -//! ``` -//! -//! # Overview -//! -//! The main three traits in our API are [`Question`], [`Solution`], and [`Output`]. See -//! submodules for the various implementations. -//! -//! In addition to these traits: -//! -//! - The [`Problem`] contains the parsed [`pattern`](Problem::pattern) and [`text`](Problem::text). -//! - From the [`Solution`]: -//! - The [`score`](Solution::score) is a simple `usize`. -//! - A [`Step`] is a single item from the optimal [`trace`](Solution::trace). -use std::fmt::Display; use regex_syntax::hir; -pub mod regex_question; -pub mod lattice_solution; -pub mod map_solution; +pub mod regex_pattern; pub mod table_solution; -pub mod debug_output; pub mod diff_output; pub mod flat_pattern; pub mod error; -/// A builder of [`Problem`] values. -/// -/// Questions are built from some specification of a pattern and text, but the details are not part -/// of this API: different Question implementations can do this differently. -pub trait Question { - /// Try to build a [`Problem`]. 
- fn ask(&self) -> Result, Error>; +use regex_pattern::parse_pattern; +use table_solution::solve; +use diff_output::DiffOutput; +use error::Error; + +pub fn fuzzy_match(pattern_regex: String, text_str: String) -> Result { + let pattern = parse_pattern(&pattern_regex)?; + let text = Atoms { atoms: text_str.chars().collect() }; + let pattern_core = pattern.desugar(); + let solution = solve(&pattern_core, &text)?; + let output = DiffOutput::new(&solution.score, &solution.trace); + return Ok(output); } -/// Calculates the optimal solution for a [`Problem`]. -/// -/// In practice, our solution implementations to date are simply structs directly storing the final -/// calculated `score` and `trace`. We will probably change this API in the future. -pub trait Solution : Sized { - /// Try to figure out the solution for a [`Problem`]. - fn solve(problem: &Problem) -> Result; - - /// Return the final score for the solution. - /// - /// This score represents the cost of mismatches: `0` is best, higher worse. - fn score(&self) -> &usize; - - /// Return the [`Step`]s followed by the optimal match between pattern and text. - fn trace(&self) -> &Vec>; -} - -/// Displays the final solution. -/// -/// Output implementations are just types that implement -/// [`Display`](https://doc.rust-lang.org/std/fmt/trait.Display.html) and can be constructed out of -/// the [`score`](Solution::score) and [`trace`](Solution::trace). -/// -/// If the [`Solution`] API changes, we will probably change this API as well. -pub trait Output : Display { - /// Build the display. This value will have a user-friendly string representation. - fn new(score: &usize, trace: &Vec>) -> Self; -} - -/// A problem to be solved: contains the pattern we are matching text against, as well as the text -/// which may or may not match it. 
-#[derive(Eq, PartialEq, Clone, Debug)] -pub struct Problem { - pub pattern: Pattern, - pub text: Atoms, -} - -impl Problem { - pub fn desugar(&self) -> Problem { - let pattern = self.pattern.desugar(); - let text = self.text.clone(); - Problem { pattern, text } - } +#[derive(Eq, PartialEq, Debug)] +pub struct Solution { + pub score: usize, + pub trace: Vec>, } #[derive(Eq, PartialEq, Clone, Debug)] @@ -267,7 +187,8 @@ pub mod test_cases { use regex_syntax::hir::HirKind; pub struct TestCase { - pub problem: Problem, + pub pattern: Pattern, + pub text: Atoms, pub score: usize, pub trace: Vec>, } @@ -275,7 +196,8 @@ pub mod test_cases { impl TestCase { pub fn match_empty() -> Self { Self { - problem: problem(vec![], ""), + pattern: pattern(vec![]), + text: text(""), score: 0, trace: vec![], } @@ -283,7 +205,8 @@ pub mod test_cases { pub fn fail_empty_1() -> Self { Self { - problem: problem(vec![], "a"), + pattern: pattern(vec![]), + text: text("a"), score: 1, trace: vec![ Step::SkipText('a'), @@ -293,7 +216,8 @@ pub mod test_cases { pub fn fail_empty_2() -> Self { Self { - problem: problem(lits("a"), ""), + pattern: pattern(lits("a")), + text: text(""), score: 1, trace: vec![ Step::SkipPattern(Match::Lit('a')), @@ -303,7 +227,8 @@ pub mod test_cases { pub fn match_lit_1() -> Self { Self { - problem: problem(lits("a"), "a"), + pattern: pattern(lits("a")), + text: text("a"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -313,7 +238,8 @@ pub mod test_cases { pub fn match_lit_2() -> Self { Self { - problem: problem(lits("ab"), "ab"), + pattern: pattern(lits("ab")), + text: text("ab"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -324,7 +250,8 @@ pub mod test_cases { pub fn fail_lit_1() -> Self { Self { - problem: problem(lits("a"), "aa"), + pattern: pattern(lits("a")), + text: text("aa"), score: 1, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -335,7 +262,8 @@ pub mod test_cases { pub fn fail_lit_2() -> Self { Self { - problem: 
problem(lits("aba"), "aa"), + pattern: pattern(lits("aba")), + text: text("aa"), score: 1, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -347,7 +275,8 @@ pub mod test_cases { pub fn fail_lit_3() -> Self { Self { - problem: problem(lits("abcde"), "zabke"), + pattern: pattern(lits("abcde")), + text: text("zabke"), score: 4, trace: vec![ Step::SkipText('z'), @@ -364,7 +293,8 @@ pub mod test_cases { pub fn match_class_1() -> Self { Self { - problem: problem(vec![class(".")], "a"), + pattern: pattern(vec![class(".")]), + text: text("a"), score: 0, trace: vec![ Step::Hit(patt_class("."), 'a'), @@ -374,7 +304,8 @@ pub mod test_cases { pub fn match_class_2() -> Self { Self { - problem: problem(vec![class("[a-zA-Z]")], "a"), + pattern: pattern(vec![class("[a-zA-Z]")]), + text: text("a"), score: 0, trace: vec![ Step::Hit(patt_class("[a-zA-Z]"), 'a'), @@ -384,7 +315,8 @@ pub mod test_cases { pub fn match_class_3() -> Self { Self { - problem: problem(vec![class("[a-zA-Z]")], "X"), + pattern: pattern(vec![class("[a-zA-Z]")]), + text: text("X"), score: 0, trace: vec![ Step::Hit(patt_class("[a-zA-Z]"), 'X'), @@ -394,7 +326,8 @@ pub mod test_cases { pub fn fail_class_1() -> Self { Self { - problem: problem(vec![class("[^a]")], "a"), + pattern: pattern(vec![class("[^a]")]), + text: text("a"), score: 2, trace: vec![ // TODO handle valid possibility that the order of next two steps is reversed @@ -406,7 +339,8 @@ pub mod test_cases { pub fn match_alternative_1() -> Self { Self { - problem: problem(vec![alt(lits("ab"), lits("cd"))], "ab"), + pattern: pattern(vec![alt(lits("ab"), lits("cd"))]), + text: text("ab"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -417,7 +351,8 @@ pub mod test_cases { pub fn match_alternative_2() -> Self { Self { - problem: problem(vec![alt(lits("ab"), lits("cd"))], "cd"), + pattern: pattern(vec![alt(lits("ab"), lits("cd"))]), + text: text("cd"), score: 0, trace: vec![ Step::Hit(Match::Lit('c'), 'c'), @@ -428,13 +363,12 @@ pub mod test_cases 
{ pub fn match_alternative_3() -> Self { Self { - problem: problem( + pattern: pattern( vec![ - alt(lits("a"), vec![alt(lits("b"), vec![alt(lits("c"), lits("d"))])]), - lit('z') - ], - "cz" + alt(lits("a"), vec![alt(lits("b"), vec![alt(lits("c"), lits("d"))])]), lit('z') + ] ), + text: text("cz"), score: 0, trace: vec![ Step::Hit(Match::Lit('c'), 'c'), @@ -445,7 +379,8 @@ pub mod test_cases { pub fn fail_alternative_1() -> Self { Self { - problem: problem(vec![alt(lits("ab"), lits("cd"))], "acd"), + pattern: pattern(vec![alt(lits("ab"), lits("cd"))]), + text: text("acd"), score: 1, trace: vec![ Step::SkipText('a'), @@ -457,7 +392,8 @@ pub mod test_cases { pub fn match_repetition_1() -> Self { Self { - problem: problem(vec![rep(lits("a"))], "aa"), + pattern: pattern(vec![rep(lits("a"))]), + text: text("aa"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -468,7 +404,8 @@ pub mod test_cases { pub fn match_repetition_2() -> Self { Self { - problem: problem(vec![rep(vec![lit('a'), rep(lits("b"))])], "aababb"), + pattern: pattern(vec![rep(vec![lit('a'), rep(lits("b"))])]), + text: text("aababb"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -483,7 +420,8 @@ pub mod test_cases { pub fn match_repetition_3() -> Self { Self { - problem: problem(vec![rep(vec![class("[0-9]")])], "0451"), + pattern: pattern(vec![rep(vec![class("[0-9]")])]), + text: text("0451"), score: 0, trace: vec![ Step::Hit(patt_class("[0-9]"), '0'), @@ -496,7 +434,8 @@ pub mod test_cases { pub fn match_repetition_4() -> Self { Self { - problem: problem(vec![rep_min(1, lits("a"))], "a"), + pattern: pattern(vec![rep_min(1, lits("a"))]), + text: text("a"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -506,7 +445,8 @@ pub mod test_cases { pub fn match_repetition_5() -> Self { Self { - problem: problem(vec![rep_bound(0, 5, lits("a"))], "aaaa"), + pattern: pattern(vec![rep_bound(0, 5, lits("a"))]), + text: text("aaaa"), score: 0, trace: vec![ Step::Hit(Match::Lit('a'), 
'a'), @@ -519,7 +459,8 @@ pub mod test_cases { pub fn fail_repetition_1() -> Self { Self { - problem: problem(vec![rep(lits("a"))], "aba"), + pattern: pattern(vec![rep(lits("a"))]), + text: text("aba"), score: 1, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -531,7 +472,8 @@ pub mod test_cases { pub fn fail_repetition_2() -> Self { Self { - problem: problem(vec![rep_min(1, lits("a"))], ""), + pattern: pattern(vec![rep_min(1, lits("a"))]), + text: text(""), score: 1, trace: vec![ Step::SkipPattern(Match::Lit('a')), @@ -541,7 +483,8 @@ pub mod test_cases { pub fn fail_repetition_3() -> Self { Self { - problem: problem(vec![rep_bound(0, 1, lits("a"))], "aa"), + pattern: pattern(vec![rep_bound(0, 1, lits("a"))]), + text: text("aa"), score: 1, trace: vec![ Step::Hit(Match::Lit('a'), 'a'), @@ -560,12 +503,13 @@ pub mod test_cases { Match::Class(Class::from(wildcard_class)) } - pub fn problem(elems: Vec, text: &str) -> Problem { + pub fn pattern(elems: Vec) -> Pattern { + Pattern { elems } + } + + pub fn text(text: &str) -> Atoms { let atoms = text.chars().collect(); - Problem { - pattern: Pattern { elems }, - text: Atoms { atoms }, - } + Atoms { atoms } } pub fn lits(cs: &str) -> Vec { diff --git a/fuzzy/src/map_solution.rs b/fuzzy/src/map_solution.rs deleted file mode 100644 index 2346fa4..0000000 --- a/fuzzy/src/map_solution.rs +++ /dev/null @@ -1,166 +0,0 @@ -//! An implementation of [`Solution`](crate::Solution) that should be relatively easy to develop new features for. -//! -//! This implementation uses a [map](State) to store state for each [node](Ix), so it should be -//! easy to change node representation and expand the state space over time. 
- -use crate::{ElementCore, Match, Problem, Step}; -use crate::flat_pattern::{Flat, FlatPattern}; -use crate::lattice_solution::{LatticeConfig, LatticeIx, LatticeSolution, LatticeState, Node, StepType}; -use std::collections::hash_map::HashMap; - -#[derive(Eq, PartialEq, Debug)] -pub struct MapSolution { - score: usize, - trace: Vec>, -} - -impl LatticeSolution for MapSolution { - type Conf = Config; - type Ix = Ix; - type State = State; - - fn new(score: usize, trace: Vec>) -> Self { - MapSolution { score, trace } - } - - fn score_lattice(&self) -> &usize { - &self.score - } - - fn trace_lattice(&self) -> &Vec> { - &self.trace - } -} - -pub struct Config { - pattern: FlatPattern, - text: Vec, -} - -impl LatticeConfig for Config { - fn new(problem: &Problem) -> Self { - let pattern = FlatPattern::new(&problem.pattern); - let text = problem.text.atoms.clone(); - Config { pattern, text } - } - - fn get(&self, ix: Ix) -> (Option<&Flat>, Option<&char>) { - (self.pattern.get(ix.pattern), self.text.get(ix.text)) - } - - fn start(&self) -> Ix { - Ix { pattern: 0, text: 0, rep_off: 0 } - } - - fn end(&self) -> Ix { - Ix { pattern: self.pattern.len(), text: self.text.len(), rep_off: 0 } - } - - fn step(&self, ix: Ix, step_type: StepType) -> Ix { - match step_type { - StepType::Hit => - Ix { pattern: ix.pattern + 1, text: ix.text + 1, rep_off: 0, ..ix }, - StepType::SkipText => - Ix { text: ix.text + 1, rep_off: 0, ..ix }, - StepType::SkipPattern | StepType::StartGroup | StepType::EndGroup | StepType::StartLeft => - Ix { pattern: ix.pattern + 1, ..ix }, - StepType::StartRight(off) => - Ix { pattern: ix.pattern + off + 1, ..ix }, - StepType::PassRight(off) => - Ix { pattern: ix.pattern + off, ..ix }, - StepType::StartRepetition => - Ix { pattern: ix.pattern + 1, rep_off: ix.rep_off + 1, ..ix }, - StepType::EndRepetition => - Ix { pattern: ix.pattern + 1, rep_off: ix.rep_off - 1, ..ix }, - StepType::PassRepetition(off) => - Ix { pattern: ix.pattern + off + 1, ..ix}, - 
StepType::RestartRepetition(off) => - Ix { pattern: ix.pattern - off, ..ix }, - } - } -} - -pub struct State { - nodes: HashMap>, - default: Node, -} - -impl LatticeState for State { - fn new(_conf: &Config) -> Self { - State { nodes: HashMap::new(), default: Node::new(), } - } - - fn get(&self, ix: Ix) -> &Node { - match self.nodes.get(&ix) { - Some(node) => node, - None => &self.default, - } - } - - fn get_mut(&mut self, ix: Ix) -> &mut Node { - self.nodes.entry(ix).or_insert(self.default.clone()) - } - - fn set(&mut self, ix: Ix, node: Node) { - let _ = self.nodes.insert(ix, node); - } -} - -#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Default)] -pub struct Ix { - /// The index into the [flattened `Problem::pattern`](crate::flat_pattern::FlatPattern). - pub pattern: usize, - /// The index into [`Problem::text`](crate::Problem::text). - pub text: usize, - /// This field represents our "repetition depth since we last changed text index". - /// - /// To avoid infinite loops, we have to avoid repeating a repetition group if that would take us - /// back to the same index we started at. We keep track of how many repetition groups we entered - /// since we last matched or skipped a text character, and avoid looping back unless this is 0. - /// This ix the "repetition depth". Because the "repetition depth" affects future jumps, it also - /// affects the future score, and so we have a separate score and a separate index for each - /// repetition depth value. 
- pub rep_off: usize, -} - -impl LatticeIx for Ix { - fn can_restart(&self) -> bool { - self.rep_off == 0 - } -} - -#[cfg(test)] -mod tests { - use super::MapSolution; - use crate::test_cases::TestCase; - use crate::lattice_solution::test_logic; - use test_case::test_case; - - #[test_case(TestCase::match_empty())] - #[test_case(TestCase::fail_empty_1())] - #[test_case(TestCase::fail_empty_2())] - #[test_case(TestCase::match_lit_1())] - #[test_case(TestCase::match_lit_2())] - #[test_case(TestCase::fail_lit_1())] - #[test_case(TestCase::fail_lit_2())] - #[test_case(TestCase::fail_lit_3())] - #[test_case(TestCase::match_class_1())] - #[test_case(TestCase::match_class_2())] - #[test_case(TestCase::match_class_3())] - #[test_case(TestCase::fail_class_1())] - #[test_case(TestCase::match_alternative_1())] - #[test_case(TestCase::match_alternative_2())] - #[test_case(TestCase::match_alternative_3())] - #[test_case(TestCase::fail_alternative_1())] - #[test_case(TestCase::match_repetition_1())] - #[test_case(TestCase::match_repetition_2())] - #[test_case(TestCase::match_repetition_3())] - #[test_case(TestCase::match_repetition_4())] - #[test_case(TestCase::match_repetition_5())] - #[test_case(TestCase::fail_repetition_1())] - #[test_case(TestCase::fail_repetition_2())] - #[test_case(TestCase::fail_repetition_3())] - fn test_solve(test: TestCase) { - test_logic::test_solve::(test); - } -} diff --git a/fuzzy/src/regex_pattern.rs b/fuzzy/src/regex_pattern.rs new file mode 100644 index 0000000..a0df9d0 --- /dev/null +++ b/fuzzy/src/regex_pattern.rs @@ -0,0 +1,191 @@ +//! Parses pattern using [`regex_syntax`](https://docs.rs/regex-syntax). +//! +//! [`regex_syntax`](https://docs.rs/regex-syntax) sometimes uses bytes in their API, while this +//! crate currently operates on unicode characters. For now, we are getting around this by naively +//! assuming all characters are ASCII. We will change this in the future. 
+ +use regex_syntax::hir; +use crate::{Class, Element, Match, Pattern, Repetition}; +use crate::error::Error; + +pub fn parse_pattern(pattern: &str) -> Result, Error> { + let hir = regex_syntax::parse(pattern)?; + return wrap(parse_impl(&hir)); +} + +fn wrap(try_elems: Result, Error>) -> Result, Error> { + try_elems.map(|elems| Pattern { elems }) +} + + +fn parse_impl(hir: &hir::Hir) -> Result, Error> +{ + match hir.kind() { + hir::HirKind::Literal(hir::Literal(ref bytes)) => { + // TODO modify Patt::Lit to use bytes rather than chars. For now, assuming ascii + Ok(bytes.iter().map(|b| Element::Match(Match::Lit(*b as char))).collect()) + } + hir::HirKind::Class(class) => { + Ok(vec![Element::Match(Match::Class(Class::from(class.clone())))]) + } + hir::HirKind::Capture(hir::Capture { sub, .. }) => { + let pattern = wrap(parse_impl(sub))?; + Ok(vec![Element::Capture(pattern)]) + } + hir::HirKind::Alternation(children) => { + match &children[..] { + [] => Ok(vec![]), + [sub] => parse_impl(sub), + [sub1, sub2, subs @ ..] => { + let try_p1 = wrap(parse_impl(sub1)); + let try_p2 = wrap(parse_impl(sub2)); + let mut try_ps = subs.iter().map(|sub| wrap(parse_impl(sub))); + + let try_init = try_p1.and_then(|p1| try_p2.map(|p2| Element::Alternative(p1, p2))); + + let try_alternative = try_init.and_then(|init| + try_ps.try_fold(init, |elem, try_p| + try_p.map(|p| Element::Alternative(Pattern { elems: vec![elem] }, p)) + ) + ); + + try_alternative.map(|alt| vec![alt]) + } + } + } + hir::HirKind::Repetition(hir::Repetition { min, max, sub, .. 
}) => { + Result::from_iter( + wrap(parse_impl(sub)).map(|inner| { + let minimum = (*min).try_into().map_err(|_| Error::RegexBoundTooLarge)?; + let maximum = max.map_or(Ok(None), |max| + max.try_into().map(|m| Some(m)).map_err(|_| Error::RegexBoundTooLarge) + )?; + Ok(Element::Repetition(Repetition { minimum, maximum, inner })) + }) + ) + } + hir::HirKind::Concat(subs) => { + let try_nested: Result>, Error> = + Result::from_iter(subs.iter().map(|sub| parse_impl(sub))); + try_nested.map(|nested| nested.into_iter().flatten().collect()) + } + unsupported => { + Err(Error::PatternUnsupported(format!("{:?}", unsupported))) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_cases::{alt, class, capture, lit, lits, rep, rep_min, rep_bound}; + use proptest::prelude::*; + + #[test] + fn parse_lit_1() { + parse_test("a", lits("a")); + } + + #[test] + fn parse_lit_2() { + parse_test("abc", lits("abc")); + } + + #[test] + fn parse_wildcard() { + parse_test(".", vec![class(".")]) + } + + #[test] + fn parse_concat_1() { + parse_test("a.", vec![lit('a'), class(".")]); + } + + #[test] + fn parse_repetition_1() { + parse_test("a*", vec![rep(lits("a"))]); + } + + #[test] + fn parse_repetition_2() { + parse_test("a+", vec![rep_min(1, lits("a"))]); + } + + #[test] + fn parse_repetition_3() { + parse_test("a{2,}", vec![rep_min(2, lits("a"))]); + } + + #[test] + fn parse_repetition_4() { + parse_test("a{0,3}", vec![rep_bound(0, 3, lits("a"))]); + } + + #[test] + fn parse_repetition_5() { + parse_test("a{4}", vec![rep_bound(4, 4, lits("a"))]); + } + + #[test] + fn parse_group_1() { + parse_test("(a)", vec![capture(lits("a"))]); + } + + #[test] + fn parse_alternative_1() { + parse_test("ab|cd", vec![alt(lits("ab"), lits("cd"))]); + } + + #[test] + fn parse_alternative_2() { + parse_test("ab|cd|wxyz", vec![alt(vec![alt(lits("ab"), lits("cd"))], lits("wxyz"))]); + } + + fn parse_test(pattern: &str, expected_elems: Vec) { + let expected_pattern = Pattern { elems: 
expected_elems }; + let actual_pattern = parse_pattern(&pattern).expect("Cannot parse pattern"); + assert_eq!(expected_pattern, actual_pattern); + } + + // TODO support parsing empty patterns + // TODO more accurate range of literal patterns here + const LITERAL_PATTERN_REGEX: &str = "[[:alnum:]]+"; + + proptest! { + #[test] + fn smoketest(pattern in "\\PC*") { + let _ = parse_pattern(&pattern); + } + + #[test] + fn literals(pattern in LITERAL_PATTERN_REGEX) { + let expected_pattern = Pattern { elems: lits(&pattern) }; + let actual_pattern = parse_pattern(&pattern).expect("Cannot parse pattern"); + prop_assert_eq!(expected_pattern, actual_pattern); + } + + #[test] + fn captures(inner in LITERAL_PATTERN_REGEX) { + let wrapped = format!("({})", inner); + let Pattern { elems: actual_inner } = parse_pattern(&inner).expect("Cannot parse inner"); + let Pattern { elems: actual_wrapped } = parse_pattern(&wrapped).expect("Cannot parse wrapped"); + prop_assert_eq!( actual_wrapped, vec![capture(actual_inner)]); + } + + #[test] + fn alternatives(inners in prop::collection::vec(LITERAL_PATTERN_REGEX, 2..5)) { + // the regex lib is smart enough to turn an alternative of single characters into a + // character class ... which is good, but annoying for this particular test + prop_assume!(inners.iter().any(|inner| inner.len() > 1)); + + let alt_pattern = inners.join("|"); + let expected_alt = inners.iter() + .map(|inner| lits(&inner)) + .reduce(|acc, right| vec![alt(acc, right)]).expect("Cannot be empty"); + + let expected_pattern = Pattern { elems: expected_alt }; + let actual_pattern = parse_pattern(&alt_pattern).expect("Cannot parse pattern"); + prop_assert_eq!(expected_pattern, actual_pattern); + } + } +} diff --git a/fuzzy/src/regex_question.rs b/fuzzy/src/regex_question.rs deleted file mode 100644 index 2d04f6b..0000000 --- a/fuzzy/src/regex_question.rs +++ /dev/null @@ -1,206 +0,0 @@ -//! An implementation of [`Question`](crate::Question) that parses the pattern using -//! 
[`regex_syntax`](https://docs.rs/regex-syntax). -//! -//! [`regex_syntax`](https://docs.rs/regex-syntax) sometimes uses bytes in their API, while this -//! crate currently operates on unicode characters. For now, we are getting around this by naively -//! assuming all characters are ASCII. We will change this in the future. - -use regex_syntax; -use regex_syntax::hir; -use crate::{Atoms, Class, Element, Match, Pattern, Problem, Question, Repetition}; -use crate::error::Error; - -pub struct RegexQuestion { - pub pattern_regex: String, - pub text: String, -} - -impl Question for RegexQuestion { - fn ask(&self) -> Result, Error> { - let pattern = Self::parse_pattern(&self.pattern_regex)?; - let text = Atoms { atoms: self.text.chars().collect() }; - Ok(Problem { pattern, text }) - } -} - -impl RegexQuestion { - fn parse_pattern(pattern: &str) -> Result, Error> { - let hir = regex_syntax::parse(pattern)?; - Self::pattern(Self::parse_impl(&hir)) - } - - fn pattern(try_elems: Result, Error>) -> Result, Error> { - try_elems.map(|elems| Pattern { elems }) - } - - fn parse_impl(hir: &hir::Hir) -> Result, Error> - { - match hir.kind() { - hir::HirKind::Literal(hir::Literal(ref bytes)) => { - // TODO modify Patt::Lit to use bytes rather then chars. For now, assuming ascii - Ok(bytes.iter().map(|b| Element::Match(Match::Lit(*b as char))).collect()) - } - hir::HirKind::Class(class) => { - Ok(vec![Element::Match(Match::Class(Class::from(class.clone())))]) - } - hir::HirKind::Capture(hir::Capture { sub, .. }) => { - Self::pattern(Self::parse_impl(sub)).map(|p| vec![Element::Capture(p)]) - } - hir::HirKind::Alternation(children) => { - match &children[..] { - [] => Ok(vec![]), - [sub] => Self::parse_impl(sub), - [sub1, sub2, subs @ ..] 
=> { - let try_p1 = Self::pattern(Self::parse_impl(sub1)); - let try_p2 = Self::pattern(Self::parse_impl(sub2)); - let mut try_ps = subs.iter().map(|sub| Self::pattern(Self::parse_impl(sub))); - - let try_init = try_p1.and_then(|p1| try_p2.map(|p2| Element::Alternative(p1, p2))); - - let try_alternative = try_init.and_then(|init| - try_ps.try_fold(init, |elem, try_p| - try_p.map(|p| Element::Alternative(Pattern { elems: vec![elem] }, p)) - ) - ); - - try_alternative.map(|alt| vec![alt]) - } - } - } - hir::HirKind::Repetition(hir::Repetition { min, max, sub, .. }) => { - Result::from_iter( - Self::pattern(Self::parse_impl(sub)).map(|inner| { - let minimum = (*min).try_into().map_err(|_| Error::RegexBoundTooLarge)?; - let maximum = max.map_or(Ok(None), |max| - max.try_into().map(|m| Some(m)).map_err(|_| Error::RegexBoundTooLarge) - )?; - Ok(Element::Repetition(Repetition { minimum, maximum, inner })) - }) - ) - } - hir::HirKind::Concat(subs) => { - let try_nested: Result>, Error> = - Result::from_iter(subs.iter().map(|sub| Self::parse_impl(sub))); - try_nested.map(|nested| nested.into_iter().flatten().collect()) - } - unsupported => { - Err(Error::PatternUnsupported(format!("{:?}", unsupported))) - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test_cases::{alt, class, capture, lit, lits, rep, rep_min, rep_bound}; - use proptest::prelude::*; - - #[test] - fn parse_lit_1() { - parse_test("a", lits("a")); - } - - #[test] - fn parse_lit_2() { - parse_test("abc", lits("abc")); - } - - #[test] - fn parse_wildcard() { - parse_test(".", vec![class(".")]) - } - - #[test] - fn parse_concat_1() { - parse_test("a.", vec![lit('a'), class(".")]); - } - - #[test] - fn parse_repetition_1() { - parse_test("a*", vec![rep(lits("a"))]); - } - - #[test] - fn parse_repetition_2() { - parse_test("a+", vec![rep_min(1, lits("a"))]); - } - - #[test] - fn parse_repetition_3() { - parse_test("a{2,}", vec![rep_min(2, lits("a"))]); - } - - #[test] - fn 
parse_repetition_4() { - parse_test("a{0,3}", vec![rep_bound(0, 3, lits("a"))]); - } - - #[test] - fn parse_repetition_5() { - parse_test("a{4}", vec![rep_bound(4, 4, lits("a"))]); - } - - #[test] - fn parse_group_1() { - parse_test("(a)", vec![capture(lits("a"))]); - } - - #[test] - fn parse_alternative_1() { - parse_test("ab|cd", vec![alt(lits("ab"), lits("cd"))]); - } - - #[test] - fn parse_alternative_2() { - parse_test("ab|cd|wxyz", vec![alt(vec![alt(lits("ab"), lits("cd"))], lits("wxyz"))]); - } - - fn parse_test(pattern: &str, expected_elems: Vec) { - let expected_pattern = Pattern { elems: expected_elems }; - let actual_pattern = RegexQuestion::parse_pattern(&pattern).expect("Cannot parse pattern"); - assert_eq!(expected_pattern, actual_pattern); - } - - // TODO support parsing empty patterns - // TODO more accurate range of literal patterns here - const LITERAL_PATTERN_REGEX: &str = "[[:alnum:]]+"; - - proptest! { - #[test] - fn smoketest(pattern in "\\PC*") { - let _ = RegexQuestion::parse_pattern(&pattern); - } - - #[test] - fn literals(pattern in LITERAL_PATTERN_REGEX) { - let expected_pattern = Pattern { elems: lits(&pattern) }; - let actual_pattern = RegexQuestion::parse_pattern(&pattern).expect("Cannot parse pattern"); - prop_assert_eq!(expected_pattern, actual_pattern); - } - - #[test] - fn captures(inner in LITERAL_PATTERN_REGEX) { - let wrapped = format!("({})", inner); - let Pattern { elems: actual_inner } = RegexQuestion::parse_pattern(&inner).expect("Cannot parse inner"); - let Pattern { elems: actual_wrapped } = RegexQuestion::parse_pattern(&wrapped).expect("Cannot parse wrapped"); - prop_assert_eq!( actual_wrapped, vec![capture(actual_inner)]); - } - - #[test] - fn alternatives(inners in prop::collection::vec(LITERAL_PATTERN_REGEX, 2..5)) { - // the regex lib is smart enough to turn an alternative of single characters into a - // character class ... 
which is good, but annoying for this particular test - prop_assume!(inners.iter().any(|inner| inner.len() > 1)); - - let alt_pattern = inners.join("|"); - let expected_alt = inners.iter() - .map(|inner| lits(&inner)) - .reduce(|acc, right| vec![alt(acc, right)]).expect("Cannot be empty"); - - let expected_pattern = Pattern { elems: expected_alt }; - let actual_pattern = RegexQuestion::parse_pattern(&alt_pattern).expect("Cannot parse pattern"); - prop_assert_eq!(expected_pattern, actual_pattern); - } - } -} diff --git a/fuzzy/src/table_solution.rs b/fuzzy/src/table_solution.rs index 8b80b08..ff58372 100644 --- a/fuzzy/src/table_solution.rs +++ b/fuzzy/src/table_solution.rs @@ -1,38 +1,120 @@ -//! A theoretically faster implementation of [`Solution`](crate::Solution). +//! A theoretically faster solver than my initial cached recursive implementation. //! //! This implementation pre-allocates a [vector](State) storing state for all [nodes](Ix), so in //! theory it should be relatively efficient, although we haven't done any benchmarks yet. We will //! do these in the future. 
-use crate::{ElementCore, Match, Problem, Step}; +use crate::{Atoms, ElementCore, Match, Pattern, Solution, Step}; +use crate::error::Error; use crate::flat_pattern::{Flat, FlatPattern}; -use crate::lattice_solution::{LatticeConfig, LatticeIx, LatticeSolution, LatticeState, Node, StepType}; +use nonempty::{NonEmpty, nonempty}; -#[derive(Eq, PartialEq, Debug)] -pub struct TableSolution { - score: usize, - trace: Vec>, -} +pub fn solve(pattern: &Pattern, text: &Atoms) -> Result { + let conf = Config::new(pattern, text); + let mut state = State::new(&conf); -impl LatticeSolution for TableSolution { - type Conf = Config; - type Ix = Ix; - type State = State; + let start_ix = conf.start(); + let end_ix = conf.end(); - fn new(score: usize, trace: Vec>) -> Self { - TableSolution { score, trace } - } + let _ = calculate_optimal_path(&conf, &mut state)?; - fn score_lattice(&self) -> &usize { - &self.score + let start_node = state.get(start_ix); + let score = start_node.done_info() + .map(|i| i.0) + .map_err(|_| Error::IncompleteFinalState)?; + + let mut trace = vec![]; + let mut from = start_ix; + loop { + let node = state.get(from); + if !node.is_done() || from == end_ix { break; } + let (patt, text) = conf.get(from); + let (_, step_type, next) = node.done_info()?; + if let Some(step) = step_type.step() { + let final_step = step.map( + |_| match patt { + Some(Flat::Lit(c)) => Match::Lit(*c), + Some(Flat::Class(c)) => Match::Class(c.clone()), + unexpected => panic!("Unexpected trace pattern {:?}", unexpected), + }, + |_| match text { + Some(c) => *c, + unexpected => panic!("Unexpected trace text {:?}", unexpected), + } + ); + trace.push(final_step); + } + from = next; + } + if from != end_ix { + return Err(Error::IncompleteFinalState); } - fn trace_lattice(&self) -> &Vec> { - &self.trace + Ok(Solution { score, trace }) +} + +fn calculate_optimal_path( + conf: &Config, + state: &mut State, + ) -> Result<(), Error> { + let start_ix = conf.start(); + let end_ix = conf.end(); + 
+ let mut loop_state = LoopState::Down(Down { + parent: Default::default(), + current: start_ix, + }); + + let mut loop_counter = 0; + + loop { + loop_counter += 1; + if loop_counter >= 1000000000 { // TODO make this max configurable + return Err(Error::ExceededMaxSteps(loop_counter)); + } + let new_parent = match &loop_state { + LoopState::Down(down) if state.get(down.current).is_ready() => { + let (flat, text) = conf.get(down.current); + let opt_node_type = NodeType::get(flat, text, &down.current); + let node_state = state.get_mut(down.current); + node_state.initialise(end_ix, down.parent, down.current, opt_node_type)?; + down.parent + } + LoopState::Down(down) => down.parent, + LoopState::Back(back) => { + let new_child = back.child; + let (new_score, _, _) = state.get(new_child).done_info()?; + let node_state = state.get_mut(back.current); + let new_parent = node_state.update(new_child, back.current, new_score)?; + new_parent + } + }; + + let current_ix = loop_state.current(); + let final_state = state.get(current_ix); + if current_ix == start_ix && final_state.is_done() { + break; + } else if final_state.is_done() { + loop_state = LoopState::Back(Back { + current: new_parent, + child: current_ix, + }); + } else if final_state.is_working() { + let current_step_type = final_state.current_step_type()?; + let child = conf.step(current_ix, current_step_type); + loop_state = LoopState::Down(Down { + parent: current_ix, + current: child, + }); + } else { + return Err(Error::NoNodeProgress(format!("{:?}", current_ix))); + } } + + Ok(()) } -/// Stores the text and pattern from the original [`Problem`](crate::Problem). +/// Flattens the text and pattern so we can easily index each one. /// /// Our state stores an array of nodes. This array forms a table, with one dimension representing /// the text, while the other dimension represents an expanded pattern, per [`FlatPattern::custom`]. 
@@ -41,10 +123,10 @@ pub struct Config { pattern: FlatPattern, } -impl LatticeConfig for Config { - fn new(problem: &Problem) -> Self { - let pattern = FlatPattern::custom(&problem.pattern, 1); - let text = problem.text.atoms.clone(); +impl Config { + fn new(pattern: &Pattern, text: &Atoms) -> Self { + let pattern = FlatPattern::custom(pattern, 1); + let text = text.atoms.clone(); Config { text, pattern } } @@ -125,7 +207,7 @@ impl LatticeConfig for Config { } pub struct State { - nodes: Vec>, + nodes: Vec, pattern_len: usize, } @@ -133,9 +215,7 @@ impl State { fn node(&self, ix: Ix) -> usize { ix.text * self.pattern_len + ix.pattern + ix.rep_off } -} -impl LatticeState for State { fn new(conf: &Config) -> Self { // we need an extra row/col for indices at the end of pattern and text let pattern_len = conf.pattern.len() + 1; @@ -148,28 +228,24 @@ impl LatticeState for State { } } - fn get(&self, ix: Ix) -> &Node { + fn get(&self, ix: Ix) -> &Node { let node_ix = self.node(ix); &self.nodes[node_ix] } - fn get_mut(&mut self, ix: Ix) -> &mut Node { + fn get_mut(&mut self, ix: Ix) -> &mut Node { let node_ix = self.node(ix); &mut self.nodes[node_ix] } - - fn set(&mut self, ix: Ix, node: Node) { - let node_ix = self.node(ix); - self.nodes[node_ix] = node; - } } /// Indexes into [`State`]. +/// This struct is a LOT bigger than I initially expected, attempt to reduce size in the future. #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Default)] pub struct Ix { - /// The index into the [flattened `Problem::pattern`](crate::flat_pattern::FlatPattern). + /// The index into the flat pattern. pub pattern: usize, - /// The index into [`Problem::text`](crate::Problem::text). + /// The index into the text. pub text: usize, /// This field tracks how many times we are repeating each pattern element. 
pub reps: usize, @@ -184,17 +260,280 @@ pub struct Ix { pub rep_off: usize, } -impl LatticeIx for Ix { +impl Ix { fn can_restart(&self) -> bool { self.rep_off == 0 } } +#[derive(Debug)] +enum LoopState { + Down(Down), + Back(Back), +} + +impl LoopState { + fn current(&self) -> Ix { + match self { + LoopState::Down(down) => down.current, + LoopState::Back(back) => back.current, + } + } +} + +#[derive(Debug)] +struct Down { + parent: Ix, + current: Ix, +} + +#[derive(Debug)] +struct Back { + current: Ix, + child: Ix, +} + +// TODO make a better Node type +// +// Calculate_optimal_path (originally called solve_ix) used to store a lot of state on the stack: +// the parent node, our progress through the possible step types, the optimal score, etc. The +// node was a simple enum which was either Ready, Working, or Done. Only the Done value had any +// fields, and it was never mutated. +// +// Once we began to run out of stack space for mid-sized use-cases, we transferred all of that +// state into the heap by adding it to this Node struct. Much of this information is mutated as we +// try out each possible step type. +// +// I had a lot of trouble implementing this expanded node. Solve loops over my table of node +// values, taking a mutable reference to a single node in each iteration. My code originally +// pattern matched on the Node enum, and called methods on inner types which could only be accessed +// when node had the right case. But I struggled to do this and satisfy rust's borrow checker. +// +// For now, I've abandoned pattern matching and type safety, and implemented Node as an abstract +// data type. The node still has three states: Ready, Working, and Done, but they aren't reflected +// in rust's type system. Instead, Node methods return errors if they are called when the node is +// in the wrong state. +// +// The three states are a bit implicit in the Node structure. They are driven by current. 
Current +// changes from 0..=step_types.len()+1 over the life of the Node: +// +// 1. A node is Ready if current == 0 +// 2. A node is Working if 1 <= current <= step_types.len() +// 3. A node is Done if current == step_types.len() + 1 +// +// When a node is working, the current step type being attempted is step_types[current-1]. +// +// When a node has processed at least one step (current >= 2), score/step_type/next record the +// optimal choice among step_types[0..current-1]. This means those fields are optimal when a Node +// is Done. +// +// I'd like to return to this Node when I'm more comfortable working with rust, and do a better job +// implementing it. + +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct Node { + current: usize, + parent: Ix, + score: usize, + step_type: StepType, + next: Ix, + step_types: Vec, +} + +impl Node { + pub fn new() -> Self { + Self { + current: 0, + parent: Default::default(), + score: 0, + step_type: StepType::Hit, + next: Default::default(), + step_types: vec![], + } + } + + fn is_ready(&self) -> bool { + self.current == 0 + } + + fn is_working(&self) -> bool { + self.current > 0 && self.current <= self.step_types.len() + } + + fn is_done(&self) -> bool { + self.current > self.step_types.len() + } + + fn current_step_type(&self) -> Result { + if self.is_working() { + Ok(self.step_types[self.current - 1]) + } else { + Err(Error::CannotGetNodeField("current_step_type", "working")) + } + } + + fn done_info(&self) -> Result<(usize, StepType, Ix), Error> { + if self.is_done() { + Ok((self.score, self.step_type, self.next)) + } else { + Err(Error::CannotGetNodeField("score/step_type/next", "done")) + } + } + + fn initialise(&mut self, end_ix: Ix, parent_ix: Ix, ix: Ix, opt_node_type: Option) -> Result<(), Error>{ + if self.is_ready() { + match opt_node_type { + Some(node_type) => { + let step_types = Vec::from(node_type.step_types()); + self.parent = parent_ix; + self.current += 1; + self.step_types = step_types; + Ok(()) + } + 
None if ix == end_ix => { // end_ix: insert dummy done value + self.parent = parent_ix; + self.current += 1; + Ok(()) + } + None => { + Err(Error::NoNodeType(format!("{:?}", ix))) + } + } + } else { + Err(Error::CannotInitialiseNode(format!("{:?}", ix))) + } + } + + fn update(&mut self, new_child: Ix, ix: Ix, new_score: usize) -> Result { + if self.is_working() { + let parent_ix = self.parent; + let current_step_type = self.current_step_type()?; + let new_score = new_score + current_step_type.cost(); + if self.current <= 1 || new_score < self.score { + self.step_type = current_step_type; + self.score = new_score; + self.next = new_child; + self.current += 1; + } else { + self.current += 1; + } + Ok(parent_ix) + } else { + Err(Error::CannotUpdateNode(format!("{:?}", ix))) + } + } +} + +#[derive(Copy, Clone, Eq, Hash, PartialEq, Debug)] +pub enum NodeType { + FinishedPattern, + FinishedText, + Hit, + NoHit, + StartGroup, + EndGroup, + AlternativeLeft(usize), + AlternativeRight(usize), + RepetitionStart(usize), + RepetitionRestart(usize), + RepetitionEnd, +} + +impl NodeType { + fn get(opt_flat: Option<&Flat>, opt_text: Option<&char>, ix: &Ix) -> Option { + // TODO this is surprisingly hard to follow for something conceptually simple. Can I make it nicer? 
+ match opt_flat { + None if opt_text == None => None, + None => Some(NodeType::FinishedPattern), + Some(flat) => Some(match flat { + Flat::Lit(c) if opt_text == Some(c) => NodeType::Hit, + Flat::Lit(_) if opt_text == None => NodeType::FinishedText, + Flat::Lit(_) => NodeType::NoHit, + Flat::Class(class) if opt_text.map_or(false, |t| class.matches(*t)) => NodeType::Hit, + Flat::Class(_) if opt_text == None => NodeType::FinishedText, + Flat::Class(_) => NodeType::NoHit, + Flat::GroupStart => NodeType::StartGroup, + Flat::GroupEnd => NodeType::EndGroup, + Flat::AlternativeLeft(off) => NodeType::AlternativeLeft(*off), + Flat::AlternativeRight(off) => NodeType::AlternativeRight(*off), + Flat::RepetitionStart(off) => NodeType::RepetitionStart(*off), + Flat::RepetitionEnd(off) if ix.can_restart() => NodeType::RepetitionRestart(*off), + Flat::RepetitionEnd(_) => NodeType::RepetitionEnd, + }) + } + } + + fn step_types(&self) -> NonEmpty { + use StepType::*; + match self { + Self::FinishedPattern => nonempty![SkipText], + Self::FinishedText => nonempty![SkipPattern], + Self::Hit => nonempty![Hit, SkipPattern, SkipText], + Self::NoHit => nonempty![SkipPattern, SkipText], + Self::StartGroup => nonempty![StartGroup], + Self::EndGroup => nonempty![EndGroup], + Self::AlternativeLeft(off) => nonempty![StartLeft, StartRight(*off)], + Self::AlternativeRight(off) => nonempty![PassRight(*off)], + Self::RepetitionStart(off) => nonempty![StartRepetition, PassRepetition(*off)], + Self::RepetitionRestart(off) => nonempty![RestartRepetition(*off)], + Self::RepetitionEnd => nonempty![EndRepetition], + } + } +} + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum StepType { + SkipText, + SkipPattern, + Hit, + StartGroup, + EndGroup, + StartLeft, + StartRight(usize), + PassRight(usize), + StartRepetition, + PassRepetition(usize), + EndRepetition, + RestartRepetition(usize), +} + +impl StepType { + fn cost(&self) -> usize { + match self { + Self::SkipPattern => 1, + Self::SkipText => 1, 
+ _ => 0, + } + } + + fn step(&self) -> Option> { + match self { + Self::Hit => Some(Step::Hit((), ())), + Self::SkipPattern => Some(Step::SkipPattern(())), + Self::SkipText => Some(Step::SkipText(())), + Self::StartGroup => Some(Step::StartCapture), + Self::EndGroup => Some(Step::StopCapture), + _ => None, + } + } +} + +#[cfg(test)] +pub mod test_logic { + use super::*; + use crate::test_cases::TestCase; + + pub fn test_solve(test_case: TestCase) { + let desugared = test_case.pattern.desugar(); + let actual = solve(&desugared, &test_case.text).unwrap(); + assert_eq!(test_case.score, actual.score); + assert_eq!(test_case.trace, actual.trace); + } +} #[cfg(test)] mod tests { - use super::TableSolution; + use super::test_logic; use crate::test_cases::TestCase; - use crate::lattice_solution::test_logic; use test_case::test_case; #[test_case(TestCase::match_empty())] @@ -222,6 +561,6 @@ mod tests { #[test_case(TestCase::fail_repetition_2())] #[test_case(TestCase::fail_repetition_3())] fn test_solve(test: TestCase) { - test_logic::test_solve::(test); + test_logic::test_solve(test); } } diff --git a/fuzzy_cli/src/lib.rs b/fuzzy_cli/src/lib.rs index 8f4ddda..5f4686d 100644 --- a/fuzzy_cli/src/lib.rs +++ b/fuzzy_cli/src/lib.rs @@ -1,8 +1,5 @@ use clap::Parser; -use fuzzy::{Output, Question, Solution}; -use fuzzy::diff_output::DiffOutput; -use fuzzy::table_solution::TableSolution; -use fuzzy::regex_question::RegexQuestion; +use fuzzy; use fuzzy::error::Error; use std::fs; @@ -32,14 +29,6 @@ pub fn run(args: Args) -> Result { fs::read_to_string(args.text)? 
}; - let question = RegexQuestion { pattern_regex, text }; - run_impl::(question) -} - -fn run_impl, S: Solution, O: Output>(question: Q) -> Result { - let problem = question.ask()?; - let problem_core = problem.desugar(); - let solution = S::solve(&problem_core)?; - let output = O::new(&solution.score(), &solution.trace()); + let output = fuzzy::fuzzy_match(pattern_regex, text)?; Ok(format!("{}", output)) } diff --git a/fuzzy_lambda/src/main.rs b/fuzzy_lambda/src/main.rs index 41dd209..c86a1ee 100644 --- a/fuzzy_lambda/src/main.rs +++ b/fuzzy_lambda/src/main.rs @@ -1,7 +1,6 @@ -use fuzzy::{Output, Question, Solution}; -use fuzzy::diff_output::{Chunk, DiffOutput}; -use fuzzy::table_solution::TableSolution; -use fuzzy::regex_question::RegexQuestion; +use fuzzy; +use fuzzy::diff_output::Chunk; + use lambda_http::{run, service_fn, Body, Error, Request, Response}; use serde::{Serialize, Deserialize}; @@ -16,7 +15,6 @@ struct Args { #[derive(Serialize)] struct Out { - score: usize, trace: Vec, } @@ -52,12 +50,9 @@ impl OutChunk { async fn function_handler(event: Request) -> Result, Error> { let body_str = std::str::from_utf8(event.body())?; let args = serde_json::from_str::(body_str)?; + let output = fuzzy::fuzzy_match(args.pattern, args.text)?; - let problem = RegexQuestion { pattern_regex: args.pattern, text: args.text }.ask()?; - let problem_core = problem.desugar(); - let solution = TableSolution::solve(&problem_core)?; - let output = DiffOutput::new(&solution.score(), &solution.trace()); - let body = Out { score: *solution.score(), trace: OutChunk::from(&output.chunks) }; + let body = Out { trace: OutChunk::from(&output.chunks) }; let body_json = serde_json::to_string(&body)?; let resp = Response::builder()