diff --git a/crates/sempai-core/Cargo.toml b/crates/sempai-core/Cargo.toml index 38f44a52..1379232d 100644 --- a/crates/sempai-core/Cargo.toml +++ b/crates/sempai-core/Cargo.toml @@ -9,6 +9,7 @@ test-support = [] [dependencies] serde = { workspace = true } +serde_json = { workspace = true } thiserror = { workspace = true } [dev-dependencies] diff --git a/crates/sempai-core/src/formula.rs b/crates/sempai-core/src/formula.rs new file mode 100644 index 00000000..41eb6f9d --- /dev/null +++ b/crates/sempai-core/src/formula.rs @@ -0,0 +1,145 @@ +//! Canonical normalised formula model shared by legacy and v2 paths. +//! +//! The [`Formula`] enum is the single internal representation that both +//! legacy Semgrep operators and v2 `match` syntax lower into. It lives +//! in `sempai_core` so that every downstream crate (including the future +//! `sempai_ts` backend) can depend on it without pulling in the YAML +//! parser. +//! +//! # Design +//! +//! The formula tree mirrors the logical structure of a Semgrep query: +//! +//! - [`Atom`] — a leaf pattern or regex to match against source code. +//! - [`Formula::And`] / [`Formula::Or`] — Boolean combinators. +//! - [`Formula::Not`] / [`Formula::Inside`] / [`Formula::Anywhere`] — +//! unary modifiers. +//! - [`Formula::Constraint`] — opaque constraint preserved for later +//! evaluation (e.g. `metavariable-regex`, `metavariable-pattern`). +//! - [`Decorated`] — metadata wrapper carrying `where`, `as`, and `fix` +//! annotations from v2 syntax. + +use serde_json::Value; + +/// A leaf pattern or regex to match against source code. +/// +/// # Example +/// +/// ``` +/// use sempai_core::formula::Atom; +/// +/// let atom = Atom::Pattern(String::from("foo($X)")); +/// assert!(matches!(atom, Atom::Pattern(_))); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Atom { + /// A concrete code pattern (e.g. `foo($X)`). + Pattern(String), + /// A regular expression pattern. + Regex(String), + /// A raw Tree-sitter S-expression query. + TreeSitterQuery(String), +} + +/// Metadata wrapper carrying v2 `where`, `as`, and `fix` annotations. +/// +/// Legacy formulas produce bare `Decorated` values with empty metadata. +/// v2 formulas propagate decoration from the parsed `Decorated` variant. +/// +/// # Example +/// +/// ``` +/// use sempai_core::formula::{Atom, Decorated, Formula}; +/// +/// let bare = Decorated::bare(Formula::Atom(Atom::Pattern(String::from("x")))); +/// assert!(bare.where_clauses.is_empty()); +/// assert!(bare.as_name.is_none()); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Decorated { + /// The wrapped formula node. + pub node: T, + /// Raw `where` clauses preserved for later constraint evaluation. + pub where_clauses: Vec, + /// Optional metavariable alias name. + pub as_name: Option, + /// Optional inline fix text. + pub fix: Option, +} + +impl Decorated { + /// Wraps a node with no decoration metadata. + pub const fn bare(node: T) -> Self { + Self { + node, + where_clauses: Vec::new(), + as_name: None, + fix: None, + } + } + + /// Wraps a node with full decoration metadata. + pub const fn with_metadata( + node: T, + where_clauses: Vec, + as_name: Option, + fix: Option, + ) -> Self { + Self { + node, + where_clauses, + as_name, + fix, + } + } +} + +/// Canonical normalised formula for a single search rule. +/// +/// Both legacy Semgrep operators and v2 `match` syntax lower into this +/// representation. The tree is validated for semantic constraints after +/// construction. +/// +/// # Example +/// +/// ``` +/// use sempai_core::formula::{Atom, Decorated, Formula}; +/// +/// let conjunction = Formula::And(vec![ +/// Decorated::bare(Formula::Atom(Atom::Pattern(String::from("foo($X)")))), +/// Decorated::bare(Formula::Not(Box::new( +/// Decorated::bare(Formula::Atom(Atom::Pattern(String::from("bar($X)")))), +/// ))), +/// ]); +/// assert!(matches!(conjunction, Formula::And(_))); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Formula { + /// A leaf pattern or regex. + Atom(Atom), + /// Logical negation. + Not(Box>), + /// Scope restriction (`pattern-inside` / `inside`). + Inside(Box>), + /// Unrestricted scope (`semgrep-internal-pattern-anywhere` / `anywhere`). + Anywhere(Box>), + /// Logical conjunction (`patterns` / `all`). + And(Vec>), + /// Logical disjunction (`pattern-either` / `any`). + Or(Vec>), + /// Opaque constraint preserved for later evaluation. + Constraint(Value), +} + +impl Formula { + /// Returns `true` if this formula node is a positive match-producing + /// term. + /// + /// Positive terms are `Atom`, `And`, and `Or` nodes. `Not`, `Inside`, + /// `Anywhere`, and `Constraint` are not positive terms for the purpose + /// of the `MissingPositiveTermInAnd` semantic check. + #[must_use] + pub const fn is_positive_term(&self) -> bool { + matches!(self, Self::Atom(_) | Self::And(_) | Self::Or(_)) + } +} diff --git a/crates/sempai-core/src/lib.rs b/crates/sempai-core/src/lib.rs index ab8d4303..818c4486 100644 --- a/crates/sempai-core/src/lib.rs +++ b/crates/sempai-core/src/lib.rs @@ -14,6 +14,7 @@ //! - [`CaptureValue`] and [`CapturedNode`] — metavariable bindings //! - [`DiagnosticReport`] and [`Diagnostic`] — structured error reporting //! - [`EngineConfig`] and [`EngineLimits`] — performance and safety limits +//! - [`Formula`], [`Atom`], and [`Decorated`] — canonical normalised formula model //! //! # Example //! @@ -28,6 +29,7 @@ mod capture; mod config; mod diagnostic; +pub mod formula; mod language; mod match_result; mod span; @@ -35,6 +37,7 @@ mod span; pub use capture::{CaptureValue, CapturedNode}; pub use config::{EngineConfig, EngineLimits}; pub use diagnostic::{Diagnostic, DiagnosticCode, DiagnosticReport, SourceSpan}; +pub use formula::{Atom, Decorated, Formula}; pub use language::{Language, LanguageParseError}; pub use match_result::Match; pub use span::{LineCol, Span}; diff --git a/crates/sempai-core/src/tests/formula_tests.rs b/crates/sempai-core/src/tests/formula_tests.rs new file mode 100644 index 00000000..f074d25a --- /dev/null +++ b/crates/sempai-core/src/tests/formula_tests.rs @@ -0,0 +1,146 @@ +//! Tests for the canonical formula model types. + +use crate::formula::{Atom, Decorated, Formula}; + +// ----------------------------------------------------------------------- +// Atom construction +// ----------------------------------------------------------------------- + +#[test] +fn atom_pattern_stores_string() { + let atom = Atom::Pattern(String::from("foo($X)")); + assert_eq!(atom, Atom::Pattern(String::from("foo($X)"))); +} + +#[test] +fn atom_regex_stores_string() { + let atom = Atom::Regex(String::from("foo.*")); + assert_eq!(atom, Atom::Regex(String::from("foo.*"))); +} + +#[test] +fn atom_tree_sitter_query_stores_string() { + let atom = Atom::TreeSitterQuery(String::from("(call_expression)")); + assert_eq!( + atom, + Atom::TreeSitterQuery(String::from("(call_expression)")) + ); +} + +// ----------------------------------------------------------------------- +// Decorated wrapper +// ----------------------------------------------------------------------- + +#[test] +fn decorated_bare_has_empty_metadata() { + let d = Decorated::bare(Formula::Atom(Atom::Pattern(String::from("x")))); + assert!(d.where_clauses.is_empty()); + assert!(d.as_name.is_none()); + assert!(d.fix.is_none()); +} + +#[test] +fn decorated_with_metadata_preserves_fields() { + let d = Decorated::with_metadata( + Formula::Atom(Atom::Regex(String::from("r"))), + vec![serde_json::json!({"metavariable": "$X"})], + Some(String::from("alias")), + Some(String::from("fix: $X")), + ); + assert_eq!(d.where_clauses.len(), 1); + assert_eq!(d.as_name.as_deref(), Some("alias")); + assert_eq!(d.fix.as_deref(), Some("fix: $X")); +} + +// ----------------------------------------------------------------------- +// Formula construction and equality +// ----------------------------------------------------------------------- + +#[test] +fn formula_atom_equality() { + let a = Formula::Atom(Atom::Pattern(String::from("x"))); + let b = Formula::Atom(Atom::Pattern(String::from("x"))); + assert_eq!(a, b); +} + +#[test] +fn formula_not_wraps_inner() { + let inner = Decorated::bare(Formula::Atom(Atom::Pattern(String::from("x")))); + let formula = Formula::Not(Box::new(inner.clone())); + assert_eq!(formula, Formula::Not(Box::new(inner))); +} + +#[test] +fn formula_and_holds_children() { + let children = vec![ + Decorated::bare(Formula::Atom(Atom::Pattern(String::from("a")))), + Decorated::bare(Formula::Atom(Atom::Pattern(String::from("b")))), + ]; + let formula = Formula::And(children.clone()); + assert_eq!(formula, Formula::And(children)); +} + +#[test] +fn formula_or_holds_branches() { + let branches = vec![ + Decorated::bare(Formula::Atom(Atom::Pattern(String::from("a")))), + Decorated::bare(Formula::Atom(Atom::Pattern(String::from("b")))), + ]; + let formula = Formula::Or(branches.clone()); + assert_eq!(formula, Formula::Or(branches)); +} + +#[test] +fn formula_constraint_preserves_json() { + let val = serde_json::json!({"metavariable-regex": {"metavariable": "$X", "regex": "foo"}}); + let formula = Formula::Constraint(val.clone()); + assert_eq!(formula, Formula::Constraint(val)); +} + +// ----------------------------------------------------------------------- +// is_positive_term classification +// ----------------------------------------------------------------------- + +#[test] +fn atom_is_positive() { + assert!(Formula::Atom(Atom::Pattern(String::from("x"))).is_positive_term()); +} + +#[test] +fn and_is_positive() { + assert!(Formula::And(vec![]).is_positive_term()); +} + +#[test] +fn or_is_positive() { + assert!(Formula::Or(vec![]).is_positive_term()); +} + +#[test] +fn not_is_not_positive() { + let f = Formula::Not(Box::new(Decorated::bare(Formula::Atom(Atom::Pattern( + String::from("x"), + ))))); + assert!(!f.is_positive_term()); +} + +#[test] +fn inside_is_not_positive() { + let f = Formula::Inside(Box::new(Decorated::bare(Formula::Atom(Atom::Pattern( + String::from("x"), + ))))); + assert!(!f.is_positive_term()); +} + +#[test] +fn anywhere_is_not_positive() { + let f = Formula::Anywhere(Box::new(Decorated::bare(Formula::Atom(Atom::Pattern( + String::from("x"), + ))))); + assert!(!f.is_positive_term()); +} + +#[test] +fn constraint_is_not_positive() { + assert!(!Formula::Constraint(serde_json::json!({})).is_positive_term()); +} diff --git a/crates/sempai-core/src/tests/mod.rs b/crates/sempai-core/src/tests/mod.rs index c2a65bd1..34b67b84 100644 --- a/crates/sempai-core/src/tests/mod.rs +++ b/crates/sempai-core/src/tests/mod.rs @@ -4,6 +4,7 @@ mod capture_tests; mod config_tests; mod diagnostic_snapshot_tests; mod diagnostic_tests; +mod formula_tests; mod language_tests; mod match_tests; mod span_tests; diff --git a/crates/sempai/src/engine.rs b/crates/sempai/src/engine.rs index c8062bde..0ce9685b 100644 --- a/crates/sempai/src/engine.rs +++ b/crates/sempai/src/engine.rs @@ -5,10 +5,12 @@ //! Compilation and execution are separate phases, allowing a compiled //! [`QueryPlan`] to be reused across multiple source files. +use sempai_core::formula::Formula; use sempai_core::{DiagnosticReport, EngineConfig, Language, Match}; -use sempai_yaml::parse_rule_file; +use sempai_yaml::{RuleMode, RulePrincipal, parse_rule_file}; use crate::mode_validation::validate_supported_modes; +use crate::normalise::{normalise_search_principal, validate_formula_constraints}; /// A compiled query plan for one rule and one language. /// @@ -19,37 +21,40 @@ use crate::mode_validation::validate_supported_modes; /// # Example /// /// ``` -/// use sempai::{Engine, EngineConfig, Language}; +/// use sempai::{Engine, EngineConfig}; /// +/// let yaml = concat!( +/// "rules:\n", +/// " - id: demo.rule\n", +/// " message: detect foo\n", +/// " languages: [rust]\n", +/// " severity: ERROR\n", +/// " pattern: foo($X)\n", +/// ); /// let engine = Engine::new(EngineConfig::default()); -/// // compile_dsl currently returns an error (not yet implemented) -/// let result = engine.compile_dsl("rule-1", Language::Rust, "pattern(\"fn $F\")"); -/// assert!(result.is_err()); +/// let plans = engine.compile_yaml(yaml).expect("valid rule"); +/// assert_eq!(plans.len(), 1); +/// assert!(plans[0].formula().is_some()); /// ``` #[derive(Debug, Clone)] pub struct QueryPlan { rule_id: String, language: Language, - /// Placeholder for the internal plan representation. Will be replaced - /// by `sempai_core::PlanNode` once the normalization layer is built. - _plan: (), + formula: Option, } impl QueryPlan { - /// Creates a new query plan (crate-internal). - // FIXME: remove `#[cfg(test)]` when `compile_yaml` / `compile_dsl` produce - // real plans — https://github.com/leynos/weaver/issues/67 - #[cfg(test)] + /// Creates a new query plan. #[must_use] #[expect( clippy::missing_const_for_fn, reason = "heap types cannot be used in const contexts" )] - pub(crate) fn new(rule_id: String, language: Language) -> Self { + pub(crate) fn new(rule_id: String, language: Language, formula: Option) -> Self { Self { rule_id, language, - _plan: (), + formula, } } @@ -64,6 +69,15 @@ impl QueryPlan { pub const fn language(&self) -> Language { self.language } + + /// Returns the normalised formula, if this rule has formula semantics. + /// + /// `ProjectDependsOn` rules return `None` because they have no + /// formula representation. + #[must_use] + pub const fn formula(&self) -> Option<&Formula> { + self.formula.as_ref() + } } /// Compiles and executes Semgrep-compatible queries on Tree-sitter syntax @@ -80,9 +94,11 @@ impl QueryPlan { /// use sempai::{Engine, EngineConfig}; /// /// let engine = Engine::new(EngineConfig::default()); -/// let result = engine.compile_yaml("rules: []"); -/// // Malformed YAML and schema errors now surface parser diagnostics. -/// assert!(result.is_err()); +/// // Malformed YAML surfaces parser diagnostics. +/// assert!(engine.compile_yaml("not valid yaml: [").is_err()); +/// // A valid empty rule file produces an empty plan list. +/// let plans = engine.compile_yaml("rules: []").expect("valid"); +/// assert!(plans.is_empty()); /// ``` #[derive(Debug)] pub struct Engine { @@ -104,18 +120,35 @@ impl Engine { /// Compiles a YAML rule file into query plans. /// - /// # Errors + /// Each search-mode rule produces one [`QueryPlan`] per declared + /// language. The plan carries a normalised [`Formula`] (or `None` + /// for `ProjectDependsOn` rules). /// - /// Returns a diagnostic report if parsing or validation fails. + /// # Errors /// - /// Successful YAML parsing still stops at the post-parse placeholder until - /// rule normalization is implemented. + /// Returns a diagnostic report if parsing, mode validation, + /// normalization, or semantic constraint validation fails. pub fn compile_yaml(&self, yaml: &str) -> Result, DiagnosticReport> { let file = parse_rule_file(yaml, None)?; validate_supported_modes(&file)?; - Err(DiagnosticReport::not_implemented( - "compile_yaml query-plan normalization", - )) + + let mut plans = Vec::new(); + for rule in file.rules() { + if *rule.mode() != RuleMode::Search { + continue; + } + let RulePrincipal::Search(principal) = rule.principal() else { + continue; + }; + + let formula = normalise_search_principal(principal)?; + if let Some(ref f) = formula { + validate_formula_constraints(f)?; + } + + compile_rule_languages(rule.id(), rule.languages(), formula.as_ref(), &mut plans)?; + } + Ok(plans) } /// Compiles a one-liner query DSL expression into a query plan. @@ -148,3 +181,23 @@ impl Engine { Err(DiagnosticReport::not_implemented("execute")) } } + +/// Parses declared languages and appends query plans for a single rule. +fn compile_rule_languages( + rule_id: &str, + languages: &[String], + formula: Option<&Formula>, + plans: &mut Vec, +) -> Result<(), DiagnosticReport> { + for lang_str in languages { + let language: Language = lang_str + .parse() + .map_err(|_| DiagnosticReport::not_implemented(&format!("language `{lang_str}`")))?; + plans.push(QueryPlan::new( + String::from(rule_id), + language, + formula.cloned(), + )); + } + Ok(()) +} diff --git a/crates/sempai/src/lib.rs b/crates/sempai/src/lib.rs index e0f0b815..d4adadcc 100644 --- a/crates/sempai/src/lib.rs +++ b/crates/sempai/src/lib.rs @@ -19,6 +19,7 @@ //! - [`CaptureValue`] and [`CapturedNode`] — metavariable bindings //! - [`DiagnosticReport`] and [`Diagnostic`] — structured error reporting //! - [`EngineConfig`] and [`EngineLimits`] — performance and safety limits +//! - [`Formula`], [`Atom`], and [`Decorated`] — canonical normalised formula model //! - [`Engine`] — the query compilation and execution entrypoint //! - [`QueryPlan`] — a compiled query plan //! @@ -37,11 +38,13 @@ mod engine; mod mode_validation; +mod normalise; // Re-export all stable types from sempai_core. pub use sempai_core::{ - CaptureValue, CapturedNode, Diagnostic, DiagnosticCode, DiagnosticReport, EngineConfig, - EngineLimits, Language, LanguageParseError, LineCol, Match, SourceSpan, Span, + Atom, CaptureValue, CapturedNode, Decorated, Diagnostic, DiagnosticCode, DiagnosticReport, + EngineConfig, EngineLimits, Formula, Language, LanguageParseError, LineCol, Match, SourceSpan, + Span, }; pub use engine::{Engine, QueryPlan}; diff --git a/crates/sempai/src/normalise/constraints.rs b/crates/sempai/src/normalise/constraints.rs new file mode 100644 index 00000000..5824c785 --- /dev/null +++ b/crates/sempai/src/normalise/constraints.rs @@ -0,0 +1,96 @@ +//! Semantic constraint validation for the canonical [`Formula`] tree. +//! +//! After normalization, the formula tree must satisfy two invariants: +//! +//! 1. **No negation inside disjunction** — a `Not` node must not appear +//! as a direct child of an `Or` node. +//! 2. **Positive term in conjunction** — every `And` node must contain at +//! least one positive match-producing child. + +use sempai_core::formula::{Decorated, Formula}; +use sempai_core::{DiagnosticCode, DiagnosticReport}; + +/// Validates semantic constraints on a normalised [`Formula`] tree. +/// +/// # Errors +/// +/// Returns a [`DiagnosticReport`] containing the first constraint +/// violation found during a depth-first walk. +pub(crate) fn validate_formula_constraints(formula: &Formula) -> Result<(), DiagnosticReport> { + walk(formula) +} + +/// Depth-first constraint walk. +fn walk(formula: &Formula) -> Result<(), DiagnosticReport> { + match formula { + Formula::Or(branches) => { + check_no_not_in_or(branches)?; + for branch in branches { + walk(&branch.node)?; + } + } + Formula::And(terms) => { + check_positive_term_in_and(terms)?; + for term in terms { + walk(&term.node)?; + } + } + Formula::Not(inner) | Formula::Inside(inner) | Formula::Anywhere(inner) => { + walk(&inner.node)?; + } + Formula::Atom(_) | Formula::Constraint(_) => {} + } + Ok(()) +} + +/// Checks that no direct child of an `Or` node is a `Not`. +fn check_no_not_in_or(branches: &[Decorated]) -> Result<(), DiagnosticReport> { + for branch in branches { + if matches!(branch.node, Formula::Not(_)) { + return Err(DiagnosticReport::validation_error( + DiagnosticCode::ESempaiInvalidNotInOr, + String::from( + "negated formula inside disjunction \ + (`pattern-either` / `any`) is not permitted", + ), + None, + vec![String::from( + "move the negated term into a conjunction \ + (`patterns` / `all`) instead", + )], + )); + } + } + Ok(()) +} + +/// Checks that an `And` node has at least one positive term. +/// +/// Positive terms are `Atom`, `And`, and `Or` nodes. `Not`, `Inside`, +/// `Anywhere`, and `Constraint` are not positive for this check. +/// +/// **Exception:** if all terms are `Constraint` nodes (e.g. +/// `metavariable-pattern` contexts), the conjunction is accepted because +/// constraints may act as implicit positive terms in later evaluation. +fn check_positive_term_in_and(terms: &[Decorated]) -> Result<(), DiagnosticReport> { + let has_positive = terms.iter().any(|t| t.node.is_positive_term()); + let all_constraints = terms + .iter() + .all(|t| matches!(t.node, Formula::Constraint(_))); + + if !has_positive && !all_constraints { + return Err(DiagnosticReport::validation_error( + DiagnosticCode::ESempaiMissingPositiveTermInAnd, + String::from( + "conjunction (`patterns` / `all`) contains no positive \ + match-producing term", + ), + None, + vec![String::from( + "add at least one `pattern`, `regex`, or nested \ + conjunction/disjunction", + )], + )); + } + Ok(()) +} diff --git a/crates/sempai/src/normalise/legacy.rs b/crates/sempai/src/normalise/legacy.rs new file mode 100644 index 00000000..2189a2ff --- /dev/null +++ b/crates/sempai/src/normalise/legacy.rs @@ -0,0 +1,60 @@ +//! Legacy Semgrep operator normalization into canonical [`Formula`]. +//! +//! Each legacy operator variant maps deterministically to a [`Formula`] +//! node following the mapping table in the `ExecPlan` and the Semgrep +//! operator-precedence document. + +use sempai_core::formula::{Atom, Decorated, Formula}; +use sempai_yaml::{LegacyClause, LegacyFormula, LegacyValue}; + +/// Normalises a legacy formula tree into a canonical [`Formula`]. +pub(crate) fn normalise_legacy(formula: &LegacyFormula) -> Formula { + match formula { + LegacyFormula::Pattern(s) => Formula::Atom(Atom::Pattern(s.clone())), + + LegacyFormula::PatternRegex(s) => Formula::Atom(Atom::Regex(s.clone())), + + LegacyFormula::Patterns(clauses) => { + let children = clauses.iter().map(normalise_clause).collect(); + Formula::And(children) + } + + LegacyFormula::PatternEither(branches) => { + let children = branches + .iter() + .map(|b| Decorated::bare(normalise_legacy(b))) + .collect(); + Formula::Or(children) + } + + LegacyFormula::PatternNot(value) => Formula::Not(Box::new(normalise_value(value))), + + LegacyFormula::PatternInside(value) => Formula::Inside(Box::new(normalise_value(value))), + + LegacyFormula::PatternNotInside(value) => Formula::Not(Box::new(Decorated::bare( + Formula::Inside(Box::new(normalise_value(value))), + ))), + + LegacyFormula::PatternNotRegex(s) => Formula::Not(Box::new(Decorated::bare( + Formula::Atom(Atom::Regex(s.clone())), + ))), + + LegacyFormula::Anywhere(value) => Formula::Anywhere(Box::new(normalise_value(value))), + } +} + +/// Normalises a [`LegacyValue`] (string shorthand or nested formula). +fn normalise_value(value: &LegacyValue) -> Decorated { + match value { + LegacyValue::String(s) => Decorated::bare(Formula::Atom(Atom::Pattern(s.clone()))), + LegacyValue::Formula(f) => Decorated::bare(normalise_legacy(f)), + } +} + +/// Normalises a [`LegacyClause`] inside a `patterns` conjunction. +fn normalise_clause(clause: &LegacyClause) -> Decorated { + match clause { + LegacyClause::Formula(f) => Decorated::bare(normalise_legacy(f)), + LegacyClause::Constraint(v) => Decorated::bare(Formula::Constraint(v.clone())), + } +} diff --git a/crates/sempai/src/normalise/mod.rs b/crates/sempai/src/normalise/mod.rs new file mode 100644 index 00000000..9b7c8e3c --- /dev/null +++ b/crates/sempai/src/normalise/mod.rs @@ -0,0 +1,50 @@ +//! Normalization of parsed Semgrep queries into the canonical [`Formula`] +//! model. +//! +//! This module lowers both legacy Semgrep operators and v2 `match` syntax +//! into a single [`Formula`] tree defined in `sempai_core`. After +//! normalization, [`validate_formula_constraints`] enforces semantic +//! invariants on the tree. +//! +//! Normalization lives in the `sempai` facade crate rather than in +//! `sempai_core` because it depends on parser-level types from +//! `sempai_yaml`, and `sempai_yaml` already depends on `sempai_core`. +//! Placing it here avoids a circular dependency. + +mod constraints; +mod legacy; +mod v2; + +pub(crate) use constraints::validate_formula_constraints; + +use sempai_core::DiagnosticReport; +use sempai_core::formula::Formula; +use sempai_yaml::SearchQueryPrincipal; + +/// Normalises a parsed [`SearchQueryPrincipal`] into a canonical +/// [`Formula`]. +/// +/// Returns `Ok(Some(formula))` for legacy and v2 match principals, or +/// `Ok(None)` for `ProjectDependsOn` rules which have no formula +/// semantics. +/// +/// # Errors +/// +/// Returns a [`DiagnosticReport`] if the principal contains structurally +/// invalid content that cannot be lowered. In practice the YAML parser +/// catches structural issues, so this path is defensive. +#[expect( + clippy::unnecessary_wraps, + reason = "Result is needed once normalization can report lowering errors" +)] +pub(crate) fn normalise_search_principal( + principal: &SearchQueryPrincipal, +) -> Result, DiagnosticReport> { + match principal { + SearchQueryPrincipal::Legacy(legacy) => Ok(Some(legacy::normalise_legacy(legacy))), + SearchQueryPrincipal::Match(match_formula) => { + Ok(Some(v2::normalise_match(match_formula).node)) + } + SearchQueryPrincipal::ProjectDependsOn(_) => Ok(None), + } +} diff --git a/crates/sempai/src/normalise/v2.rs b/crates/sempai/src/normalise/v2.rs new file mode 100644 index 00000000..dde27f96 --- /dev/null +++ b/crates/sempai/src/normalise/v2.rs @@ -0,0 +1,57 @@ +//! v2 `match` formula normalization into canonical [`Formula`]. +//! +//! Each v2 match operator variant maps deterministically to a [`Formula`] +//! node, with [`Decorated`] metadata preserved through the lowering. + +use sempai_core::formula::{Atom, Decorated, Formula}; +use sempai_yaml::MatchFormula; + +/// Normalises a v2 match formula into a [`Decorated`]. +/// +/// The outer [`Decorated`] wrapper carries any `where`, `as`, or `fix` +/// metadata from a v2 `Decorated` variant. Non-decorated nodes produce +/// a bare wrapper. +pub(crate) fn normalise_match(formula: &MatchFormula) -> Decorated { + match formula { + MatchFormula::Pattern(s) | MatchFormula::PatternObject(s) => { + Decorated::bare(Formula::Atom(Atom::Pattern(s.clone()))) + } + + MatchFormula::Regex(s) => Decorated::bare(Formula::Atom(Atom::Regex(s.clone()))), + + MatchFormula::All(items) => { + let children = items.iter().map(normalise_match).collect(); + Decorated::bare(Formula::And(children)) + } + + MatchFormula::Any(items) => { + let children = items.iter().map(normalise_match).collect(); + Decorated::bare(Formula::Or(children)) + } + + MatchFormula::Not(inner) => Decorated::bare(Formula::Not(Box::new(normalise_match(inner)))), + + MatchFormula::Inside(inner) => { + Decorated::bare(Formula::Inside(Box::new(normalise_match(inner)))) + } + + MatchFormula::Anywhere(inner) => { + Decorated::bare(Formula::Anywhere(Box::new(normalise_match(inner)))) + } + + MatchFormula::Decorated { + formula: inner_formula, + where_clauses, + as_name, + fix, + } => { + let inner = normalise_match(inner_formula); + Decorated::with_metadata( + inner.node, + where_clauses.clone(), + as_name.clone(), + fix.clone(), + ) + } + } +} diff --git a/crates/sempai/src/tests/behaviour.rs b/crates/sempai/src/tests/behaviour.rs index 05386ce5..63a15c22 100644 --- a/crates/sempai/src/tests/behaviour.rs +++ b/crates/sempai/src/tests/behaviour.rs @@ -14,7 +14,7 @@ use crate::{DiagnosticReport, Engine, EngineConfig, Language}; #[derive(Default)] struct TestWorld { engine: Option, - compile_result: Option>, + compile_result: Option, DiagnosticReport>>, execute_result: Option>, } @@ -39,7 +39,8 @@ fn given_default_engine(world: &mut TestWorld) { #[when("YAML {yaml} is compiled")] fn when_compile_yaml(world: &mut TestWorld, yaml: QuotedString) { let engine = world.engine.as_ref().expect("engine should be set"); - world.compile_result = Some(engine.compile_yaml(yaml.as_str()).map(|_| ())); + let yaml_text = yaml.as_str().replace("\\n", "\n"); + world.compile_result = Some(engine.compile_yaml(&yaml_text)); } #[when("DSL {dsl} is compiled for language {lang}")] @@ -49,14 +50,14 @@ fn when_compile_dsl(world: &mut TestWorld, dsl: QuotedString, lang: QuotedString world.compile_result = Some( engine .compile_dsl("interactive", language, dsl.as_str()) - .map(|_| ()), + .map(|plan| vec![plan]), ); } #[when("a query plan is executed")] fn when_execute(world: &mut TestWorld) { let engine = world.engine.as_ref().expect("engine should be set"); - let plan = QueryPlan::new(String::from("test-rule"), Language::Rust); + let plan = QueryPlan::new(String::from("test-rule"), Language::Rust, None); world.execute_result = Some( engine .execute(&plan, "file:///t.rs", "fn main() {}") @@ -68,26 +69,17 @@ fn when_execute(world: &mut TestWorld) { // Helpers // --------------------------------------------------------------------------- -/// Asserts that a diagnostic result contains a specific error code. -fn assert_diagnostic_code( - result: Option<&Result<(), DiagnosticReport>>, - expected_code: &str, +/// Extracts the diagnostic report from an error result. +fn extract_report( + result: Option<&Result, DiagnosticReport>>, result_name: &str, failure_kind: &str, -) { +) -> DiagnosticReport { let inner = result.unwrap_or_else(|| panic!("{result_name} should be set")); - let report = inner + inner .as_ref() - .expect_err(&format!("expected {failure_kind}")); - let first = report - .diagnostics() - .first() - .expect("at least one diagnostic"); - let actual_code = format!("{}", first.code()); - assert_eq!( - actual_code, expected_code, - "expected code '{expected_code}', got '{actual_code}'" - ); + .expect_err(&format!("expected {failure_kind}")) + .clone() } // --------------------------------------------------------------------------- @@ -102,22 +94,40 @@ fn then_engine_max_matches(world: &mut TestWorld, count: usize) { #[then("compilation fails with code {code}")] fn then_compilation_fails(world: &mut TestWorld, code: QuotedString) { - assert_diagnostic_code( + let report = extract_report( world.compile_result.as_ref(), - code.as_str(), "compile result", "compilation failure", ); + let first = report + .diagnostics() + .first() + .expect("at least one diagnostic"); + let actual_code = format!("{}", first.code()); + assert_eq!( + actual_code, + code.as_str(), + "expected code '{}', got '{actual_code}'", + code.as_str(), + ); } -#[then("the first diagnostic message contains {snippet}")] -fn then_first_diagnostic_message_contains(world: &mut TestWorld, snippet: QuotedString) { - let report = world +#[then("compilation succeeds")] +fn then_compilation_succeeds(world: &mut TestWorld) { + let result = world .compile_result .as_ref() - .expect("compile result should be set") - .as_ref() - .expect_err("expected compilation failure"); + .expect("compile result should be set"); + assert!(result.is_ok(), "expected compilation to succeed"); +} + +#[then("the first diagnostic message contains {snippet}")] +fn then_first_diagnostic_message_contains(world: &mut TestWorld, snippet: QuotedString) { + let report = extract_report( + world.compile_result.as_ref(), + "compile result", + "compilation failure", + ); let first = report .diagnostics() .first() @@ -125,13 +135,70 @@ fn then_first_diagnostic_message_contains(world: &mut TestWorld, snippet: Quoted assert!(first.message().contains(snippet.as_str())); } +#[then("the result contains {count} query plan")] +fn then_result_contains_n_plans(world: &mut TestWorld, count: usize) { + let plans = world + .compile_result + .as_ref() + .expect("compile result should be set") + .as_ref() + .expect("expected compilation success"); + assert_eq!(plans.len(), count); +} + +#[then("the first plan has a formula")] +fn then_first_plan_has_formula(world: &mut TestWorld) { + let plans = world + .compile_result + .as_ref() + .expect("compile result should be set") + .as_ref() + .expect("expected compilation success"); + assert!( + plans + .first() + .expect("at least one plan") + .formula() + .is_some(), + "expected plan to have a formula", + ); +} + +#[then("the first plan has no formula")] +fn then_first_plan_has_no_formula(world: &mut TestWorld) { + let plans = world + .compile_result + .as_ref() + .expect("compile result should be set") + .as_ref() + .expect("expected compilation success"); + assert!( + plans + .first() + .expect("at least one plan") + .formula() + .is_none(), + "expected plan to have no formula", + ); +} + #[then("execution fails with code {code}")] fn then_execution_fails(world: &mut TestWorld, code: QuotedString) { - assert_diagnostic_code( - world.execute_result.as_ref(), + let inner = world + .execute_result + .as_ref() + .expect("execute result should be set"); + let report = inner.as_ref().expect_err("expected execution failure"); + let first = report + .diagnostics() + .first() + .expect("at least one diagnostic"); + let actual_code = format!("{}", first.code()); + assert_eq!( + actual_code, + code.as_str(), + "expected code '{}', got '{actual_code}'", code.as_str(), - "execute result", - "execution failure", ); } @@ -143,3 +210,8 @@ fn then_execution_fails(world: &mut TestWorld, code: QuotedString) { fn sempai_engine_behaviour(world: TestWorld) { let _ = world; } + +#[scenario(path = "tests/features/formula_normalization.feature")] +fn formula_normalization_behaviour(world: TestWorld) { + let _ = world; +} diff --git a/crates/sempai/src/tests/constraint_tests.rs b/crates/sempai/src/tests/constraint_tests.rs new file mode 100644 index 00000000..2b611e92 --- /dev/null +++ b/crates/sempai/src/tests/constraint_tests.rs @@ -0,0 +1,151 @@ +//! Unit tests for semantic constraint validation on formula trees. + +use sempai_core::DiagnosticCode; +use sempai_core::formula::{Atom, Decorated, Formula}; + +use crate::normalise::validate_formula_constraints; + +// ----------------------------------------------------------------------- +// Helpers +// ----------------------------------------------------------------------- + +fn pat(s: &str) -> Formula { + Formula::Atom(Atom::Pattern(String::from(s))) +} +fn bare(f: Formula) -> Decorated { + Decorated::bare(f) +} + +// ----------------------------------------------------------------------- +// InvalidNotInOr +// ----------------------------------------------------------------------- + +#[test] +fn or_with_not_child_is_rejected() { + let formula = Formula::Or(vec![ + bare(pat("a")), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ]); + let err = validate_formula_constraints(&formula).expect_err("should fail"); + let code = err.diagnostics().first().expect("at least one").code(); + assert_eq!(code, DiagnosticCode::ESempaiInvalidNotInOr); +} + +#[test] +fn or_without_not_children_passes() { + let formula = Formula::Or(vec![bare(pat("a")), bare(pat("b"))]); + assert!(validate_formula_constraints(&formula).is_ok()); +} + +#[test] +fn or_with_only_atoms_passes() { + let formula = Formula::Or(vec![ + bare(pat("a")), + bare(Formula::Atom(Atom::Regex(String::from("r")))), + ]); + assert!(validate_formula_constraints(&formula).is_ok()); +} + +// ----------------------------------------------------------------------- +// MissingPositiveTermInAnd +// ----------------------------------------------------------------------- + +#[test] +fn and_with_no_positive_terms_is_rejected() { + let formula = Formula::And(vec![ + bare(Formula::Not(Box::new(bare(pat("a"))))), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ]); + let err = validate_formula_constraints(&formula).expect_err("should fail"); + let code = err.diagnostics().first().expect("at least one").code(); + assert_eq!(code, DiagnosticCode::ESempaiMissingPositiveTermInAnd); +} + +#[test] +fn and_with_one_positive_and_one_negative_passes() { + let formula = Formula::And(vec![ + bare(pat("a")), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ]); + assert!(validate_formula_constraints(&formula).is_ok()); +} + +#[test] +fn and_with_only_inside_and_anywhere_is_rejected() { + let formula = Formula::And(vec![ + bare(Formula::Inside(Box::new(bare(pat("ctx"))))), + bare(Formula::Anywhere(Box::new(bare(pat("x"))))), + ]); + let err = validate_formula_constraints(&formula).expect_err("should fail"); + let code = err.diagnostics().first().expect("at least one").code(); + assert_eq!(code, DiagnosticCode::ESempaiMissingPositiveTermInAnd); +} + +#[test] +fn and_with_only_constraints_passes() { + // Metavariable-pattern exception: constraint-only conjunctions are + // accepted because they may be metavariable-pattern contexts. + let formula = Formula::And(vec![ + bare(Formula::Constraint( + serde_json::json!({"metavariable-regex": {}}), + )), + bare(Formula::Constraint( + serde_json::json!({"metavariable-pattern": {}}), + )), + ]); + assert!(validate_formula_constraints(&formula).is_ok()); +} + +#[test] +fn and_with_positive_term_and_constraint_passes() { + let formula = Formula::And(vec![ + bare(pat("a")), + bare(Formula::Constraint(serde_json::json!({}))), + ]); + assert!(validate_formula_constraints(&formula).is_ok()); +} + +// ----------------------------------------------------------------------- +// Nested constraint violations +// ----------------------------------------------------------------------- + +#[test] +fn nested_and_inside_or_with_no_positive_term_is_rejected() { + let inner_and = Formula::And(vec![bare(Formula::Not(Box::new(bare(pat("a")))))]); + let formula = Formula::Or(vec![bare(inner_and), bare(pat("b"))]); + let err = validate_formula_constraints(&formula).expect_err("should fail"); + let code = err.diagnostics().first().expect("at least one").code(); + assert_eq!(code, DiagnosticCode::ESempaiMissingPositiveTermInAnd); +} + +#[test] +fn nested_or_with_not_inside_and_is_rejected() { + let inner_or = Formula::Or(vec![ + bare(pat("a")), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ]); + let formula = Formula::And(vec![bare(pat("c")), bare(inner_or)]); + let err = validate_formula_constraints(&formula).expect_err("should fail"); + let code = err.diagnostics().first().expect("at least one").code(); + assert_eq!(code, DiagnosticCode::ESempaiInvalidNotInOr); +} + +// ----------------------------------------------------------------------- +// Happy-path atoms and simple formulas +// ----------------------------------------------------------------------- + +#[test] +fn bare_atom_passes() { + assert!(validate_formula_constraints(&pat("x")).is_ok()); +} + +#[test] +fn bare_constraint_passes() { + assert!(validate_formula_constraints(&Formula::Constraint(serde_json::json!({}))).is_ok()); +} + +#[test] +fn not_wrapping_atom_passes() { + let formula = Formula::Not(Box::new(bare(pat("x")))); + assert!(validate_formula_constraints(&formula).is_ok()); +} diff --git a/crates/sempai/src/tests/engine_tests.rs b/crates/sempai/src/tests/engine_tests.rs index e90d8511..a994ba2b 100644 --- a/crates/sempai/src/tests/engine_tests.rs +++ b/crates/sempai/src/tests/engine_tests.rs @@ -1,5 +1,7 @@ //! Tests for the `Engine` and `QueryPlan` types. +use sempai_core::formula::{Atom, Formula}; + use crate::engine::QueryPlan; use crate::{ Diagnostic, DiagnosticCode, DiagnosticReport, Engine, EngineConfig, EngineLimits, Language, @@ -52,32 +54,47 @@ fn compile_yaml_returns_schema_diagnostic_for_missing_rule_id() { } #[test] -fn compile_yaml_returns_not_implemented_after_successful_parse() { +fn compile_yaml_produces_plan_with_formula_for_valid_search_rule() { let engine = default_engine(); - let result = engine.compile_yaml( - "rules:\n - id: demo.rule\n message: oops\n languages: [rust]\n severity: ERROR\n pattern: foo($X)\n", + let plans = engine + .compile_yaml(concat!( + "rules:\n", + " - id: demo.rule\n", + " message: oops\n", + " languages: [rust]\n", + " severity: ERROR\n", + " pattern: foo($X)\n", + )) + .expect("valid search rule should produce plans"); + assert_eq!(plans.len(), 1); + let plan = plans.first().expect("one plan"); + assert_eq!(plan.rule_id(), "demo.rule"); + assert_eq!(plan.language(), Language::Rust); + assert_eq!( + plan.formula(), + Some(&Formula::Atom(Atom::Pattern(String::from("foo($X)")))), ); - let (code, diag) = first_diagnostic_of_err(result); - assert_eq!(code, DiagnosticCode::NotImplemented); - assert!(diag.message().contains("normalization")); } #[test] -fn compile_yaml_returns_not_implemented_for_project_depends_on_search_rule() { +fn compile_yaml_produces_plan_without_formula_for_project_depends_on() { let engine = default_engine(); - let result = engine.compile_yaml(concat!( - "rules:\n", - " - id: demo.depends\n", - " message: detect vulnerable dependency\n", - " languages: [python]\n", - " severity: WARNING\n", - " r2c-internal-project-depends-on:\n", - " namespace: pypi\n", - " package: requests\n", - )); - let (code, diag) = first_diagnostic_of_err(result); - assert_eq!(code, DiagnosticCode::NotImplemented); - assert!(diag.message().contains("normalization")); + let plans = engine + .compile_yaml(concat!( + "rules:\n", + " - id: demo.depends\n", + " message: detect vulnerable dependency\n", + " languages: [python]\n", + " severity: WARNING\n", + " r2c-internal-project-depends-on:\n", + " namespace: pypi\n", + " package: requests\n", + )) + .expect("project-depends-on rule should produce plans"); + assert_eq!(plans.len(), 1); + let plan = plans.first().expect("one plan"); + assert_eq!(plan.rule_id(), "demo.depends"); + assert!(plan.formula().is_none()); } fn assert_compile_yaml_unsupported_mode(yaml: &str, expected_mode_fragment: &str) { @@ -140,7 +157,7 @@ fn compile_dsl_returns_not_implemented() { #[test] fn execute_returns_not_implemented() { let engine = default_engine(); - let plan = QueryPlan::new(String::from("test-rule"), Language::Rust); + let plan = QueryPlan::new(String::from("test-rule"), Language::Rust, None); let result = engine.execute(&plan, "file:///test.rs", "fn main() {}"); let (code, diag) = first_diagnostic_of_err(result); assert_eq!(code, DiagnosticCode::NotImplemented); diff --git a/crates/sempai/src/tests/mod.rs b/crates/sempai/src/tests/mod.rs index d24e15ce..4a054ac8 100644 --- a/crates/sempai/src/tests/mod.rs +++ b/crates/sempai/src/tests/mod.rs @@ -1,6 +1,8 @@ //! Unit tests for the `sempai` facade crate. +mod constraint_tests; mod engine_tests; +mod normalise_tests; mod reexport_tests; mod behaviour; diff --git a/crates/sempai/src/tests/normalise_tests.rs b/crates/sempai/src/tests/normalise_tests.rs new file mode 100644 index 00000000..8e7ac5da --- /dev/null +++ b/crates/sempai/src/tests/normalise_tests.rs @@ -0,0 +1,325 @@ +//! Unit tests for legacy and v2 formula normalization. + +use rstest::rstest; +use sempai_core::formula::{Atom, Decorated, Formula}; +use sempai_yaml::{LegacyClause, LegacyFormula, LegacyValue, MatchFormula, SearchQueryPrincipal}; + +use crate::normalise::normalise_search_principal; + +// ----------------------------------------------------------------------- +// Helpers +// ----------------------------------------------------------------------- + +fn pat(s: &str) -> Formula { + Formula::Atom(Atom::Pattern(String::from(s))) +} +fn re(s: &str) -> Formula { + Formula::Atom(Atom::Regex(String::from(s))) +} +fn bare(f: Formula) -> Decorated { + Decorated::bare(f) +} + +// ----------------------------------------------------------------------- +// Legacy normalization +// ----------------------------------------------------------------------- + +#[test] +fn legacy_pattern_normalises_to_atom() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::Pattern(String::from("foo($X)"))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(pat("foo($X)"))); +} + +#[test] +fn legacy_pattern_regex_normalises_to_regex_atom() { + let principal = + SearchQueryPrincipal::Legacy(LegacyFormula::PatternRegex(String::from("foo.*"))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(re("foo.*"))); +} + +#[test] +fn legacy_patterns_normalises_to_and() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::Patterns(vec![ + LegacyClause::Formula(LegacyFormula::Pattern(String::from("a"))), + LegacyClause::Formula(LegacyFormula::PatternNot(Box::new(LegacyValue::String( + String::from("b"), + )))), + ])); + let result = normalise_search_principal(&principal).expect("ok"); + let expected = Formula::And(vec![ + bare(pat("a")), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ]); + assert_eq!(result, Some(expected)); +} + +#[test] +fn legacy_pattern_either_normalises_to_or() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::PatternEither(vec![ + LegacyFormula::Pattern(String::from("a")), + LegacyFormula::Pattern(String::from("b")), + ])); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!( + result, + Some(Formula::Or(vec![bare(pat("a")), bare(pat("b"))])) + ); +} + +#[test] +fn legacy_pattern_not_normalises_to_not() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::PatternNot(Box::new( + LegacyValue::String(String::from("x")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Not(Box::new(bare(pat("x")))))); +} + +#[test] +fn legacy_pattern_inside_normalises_to_inside() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::PatternInside(Box::new( + LegacyValue::String(String::from("ctx")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Inside(Box::new(bare(pat("ctx")))))); +} + +#[test] +fn legacy_pattern_not_inside_normalises_to_not_inside() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::PatternNotInside(Box::new( + LegacyValue::String(String::from("ctx")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + let expected = Formula::Not(Box::new(bare(Formula::Inside(Box::new(bare(pat("ctx"))))))); + assert_eq!(result, Some(expected)); +} + +#[test] +fn legacy_pattern_not_regex_normalises_to_not_regex() { + let principal = + SearchQueryPrincipal::Legacy(LegacyFormula::PatternNotRegex(String::from("bad.*"))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Not(Box::new(bare(re("bad.*")))))); +} + +#[test] +fn legacy_anywhere_normalises_to_anywhere() { + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::Anywhere(Box::new( + LegacyValue::String(String::from("x")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Anywhere(Box::new(bare(pat("x")))))); +} + +#[test] +fn legacy_value_formula_recurses() { + let inner = LegacyFormula::Pattern(String::from("inner")); + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::PatternNot(Box::new( + LegacyValue::Formula(inner), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Not(Box::new(bare(pat("inner")))))); +} + +#[test] +fn legacy_constraint_preserved_as_opaque() { + let constraint_json = serde_json::json!({ + "metavariable-regex": { + "metavariable": "$X", + "regex": "foo.*" + } + }); + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::Patterns(vec![ + LegacyClause::Formula(LegacyFormula::Pattern(String::from("a"))), + LegacyClause::Constraint(constraint_json.clone()), + ])); + let result = normalise_search_principal(&principal).expect("ok"); + let expected = Formula::And(vec![ + bare(pat("a")), + bare(Formula::Constraint(constraint_json)), + ]); + assert_eq!(result, Some(expected)); +} + +// ----------------------------------------------------------------------- +// v2 Match normalization +// ----------------------------------------------------------------------- + +#[test] +fn v2_pattern_string_normalises_to_atom() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Pattern(String::from("foo"))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(pat("foo"))); +} + +#[test] +fn v2_pattern_object_normalises_to_atom() { + let principal = SearchQueryPrincipal::Match(MatchFormula::PatternObject(String::from("foo"))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(pat("foo"))); +} + +#[test] +fn v2_regex_normalises_to_regex_atom() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Regex(String::from("r.*"))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(re("r.*"))); +} + +#[test] +fn v2_all_normalises_to_and() { + let principal = SearchQueryPrincipal::Match(MatchFormula::All(vec![ + MatchFormula::Pattern(String::from("a")), + MatchFormula::Not(Box::new(MatchFormula::Pattern(String::from("b")))), + ])); + let result = normalise_search_principal(&principal).expect("ok"); + let expected = Formula::And(vec![ + bare(pat("a")), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ]); + assert_eq!(result, Some(expected)); +} + +#[test] +fn v2_any_normalises_to_or() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Any(vec![ + MatchFormula::Pattern(String::from("a")), + MatchFormula::Pattern(String::from("b")), + ])); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!( + result, + Some(Formula::Or(vec![bare(pat("a")), bare(pat("b"))])) + ); +} + +#[test] +fn v2_not_normalises_to_not() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Not(Box::new( + MatchFormula::Pattern(String::from("x")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Not(Box::new(bare(pat("x")))))); +} + +#[test] +fn v2_inside_normalises_to_inside() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Inside(Box::new( + MatchFormula::Pattern(String::from("ctx")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Inside(Box::new(bare(pat("ctx")))))); +} + +#[test] +fn v2_anywhere_normalises_to_anywhere() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Anywhere(Box::new( + MatchFormula::Pattern(String::from("x")), + ))); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(Formula::Anywhere(Box::new(bare(pat("x")))))); +} + +#[test] +fn v2_decorated_carries_metadata() { + let principal = SearchQueryPrincipal::Match(MatchFormula::Decorated { + formula: Box::new(MatchFormula::Pattern(String::from("foo"))), + where_clauses: vec![serde_json::json!({"metavariable": "$X"})], + as_name: Some(String::from("alias")), + fix: Some(String::from("bar")), + }); + // The top-level normalise_search_principal returns the inner node + // stripped of its Decorated wrapper (since the top level is just a + // Formula, not Decorated). The metadata is lost at the top + // level — it is preserved when Decorated appears as a child within + // And/Or/Not etc. + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, Some(pat("foo"))); +} + +// ----------------------------------------------------------------------- +// ProjectDependsOn passthrough +// ----------------------------------------------------------------------- + +#[test] +fn project_depends_on_returns_none() { + use sempai_yaml::ProjectDependsOnPayload; + let payload: ProjectDependsOnPayload = serde_json::json!({ + "namespace": "pypi", + "package": "requests" + }) + .try_into() + .expect("valid payload"); + let principal = SearchQueryPrincipal::ProjectDependsOn(payload); + let result = normalise_search_principal(&principal).expect("ok"); + assert_eq!(result, None); +} + +// ----------------------------------------------------------------------- +// Paired equivalence tests +// ----------------------------------------------------------------------- + +#[rstest] +#[case::simple_pattern( + SearchQueryPrincipal::Legacy(LegacyFormula::Pattern(String::from("foo($X)"))), + SearchQueryPrincipal::Match(MatchFormula::Pattern(String::from("foo($X)"))) +)] +#[case::conjunction( + SearchQueryPrincipal::Legacy(LegacyFormula::Patterns(vec![ + LegacyClause::Formula(LegacyFormula::Pattern(String::from("a"))), + LegacyClause::Formula(LegacyFormula::PatternNot( + Box::new(LegacyValue::String(String::from("b"))), + )), + ])), + SearchQueryPrincipal::Match(MatchFormula::All(vec![ + MatchFormula::Pattern(String::from("a")), + MatchFormula::Not(Box::new(MatchFormula::Pattern(String::from("b")))), + ])), +)] +#[case::disjunction( + SearchQueryPrincipal::Legacy(LegacyFormula::PatternEither(vec![ + LegacyFormula::Pattern(String::from("a")), + LegacyFormula::Pattern(String::from("b")), + ])), + SearchQueryPrincipal::Match(MatchFormula::Any(vec![ + MatchFormula::Pattern(String::from("a")), + MatchFormula::Pattern(String::from("b")), + ])), +)] +fn paired_legacy_and_v2_produce_equal_formula( + #[case] legacy: SearchQueryPrincipal, + #[case] v2: SearchQueryPrincipal, +) { + let legacy_formula = normalise_search_principal(&legacy).expect("ok"); + let v2_formula = normalise_search_principal(&v2).expect("ok"); + assert_eq!(legacy_formula, v2_formula); +} + +// ----------------------------------------------------------------------- +// Deep nesting +// ----------------------------------------------------------------------- + +#[test] +fn deeply_nested_legacy_formula() { + // pattern-either containing patterns containing pattern-not + let principal = SearchQueryPrincipal::Legacy(LegacyFormula::PatternEither(vec![ + LegacyFormula::Patterns(vec![ + LegacyClause::Formula(LegacyFormula::Pattern(String::from("a"))), + LegacyClause::Formula(LegacyFormula::PatternNot(Box::new(LegacyValue::String( + String::from("b"), + )))), + ]), + LegacyFormula::Pattern(String::from("c")), + ])); + let result = normalise_search_principal(&principal).expect("ok"); + let expected = Formula::Or(vec![ + bare(Formula::And(vec![ + bare(pat("a")), + bare(Formula::Not(Box::new(bare(pat("b"))))), + ])), + bare(pat("c")), + ]); + assert_eq!(result, Some(expected)); +} diff --git a/crates/sempai/src/tests/reexport_tests.rs b/crates/sempai/src/tests/reexport_tests.rs index 245b8e3c..da351c13 100644 --- a/crates/sempai/src/tests/reexport_tests.rs +++ b/crates/sempai/src/tests/reexport_tests.rs @@ -7,8 +7,8 @@ use std::collections::BTreeMap; use crate::{ - CaptureValue, CapturedNode, Diagnostic, DiagnosticCode, DiagnosticReport, EngineConfig, - Language, LineCol, Match, SourceSpan, Span, + Atom, CaptureValue, CapturedNode, Decorated, Diagnostic, DiagnosticCode, DiagnosticReport, + EngineConfig, Formula, Language, LineCol, Match, SourceSpan, Span, }; #[test] @@ -62,3 +62,11 @@ fn engine_config_is_accessible() { let config = EngineConfig::default(); assert_eq!(config.max_matches_per_rule(), 10_000); } + +#[test] +fn formula_types_are_accessible() { + let atom = Atom::Pattern(String::from("foo")); + let formula = Formula::Atom(atom); + let decorated = Decorated::bare(formula); + assert!(decorated.where_clauses.is_empty()); +} diff --git a/crates/sempai/tests/features/formula_normalization.feature b/crates/sempai/tests/features/formula_normalization.feature new file mode 100644 index 00000000..dc32261e --- /dev/null +++ b/crates/sempai/tests/features/formula_normalization.feature @@ -0,0 +1,11 @@ +Feature: Formula normalization + + Scenario: Negation inside disjunction is rejected + Given an engine with default configuration + When YAML "rules:\n - id: bad.not-in-or\n message: detect\n languages: [rust]\n severity: ERROR\n pattern-either:\n - pattern: foo($X)\n - pattern-not: bar($X)\n" is compiled + Then compilation fails with code "E_SEMPAI_INVALID_NOT_IN_OR" + + Scenario: Conjunction without positive term is rejected + Given an engine with default configuration + When YAML "rules:\n - id: bad.no-positive\n message: detect\n languages: [rust]\n severity: ERROR\n patterns:\n - pattern-not: foo($X)\n" is compiled + Then compilation fails with code "E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND" diff --git a/crates/sempai/tests/features/sempai_engine.feature b/crates/sempai/tests/features/sempai_engine.feature index 68773b0f..645b9490 100644 --- a/crates/sempai/tests/features/sempai_engine.feature +++ b/crates/sempai/tests/features/sempai_engine.feature @@ -14,17 +14,19 @@ Feature: Sempai engine facade behaviour When YAML "rules:\n - message: detect foo\n languages: [rust]\n severity: ERROR\n pattern: foo($X)\n" is compiled Then compilation fails with code "E_SEMPAI_SCHEMA_INVALID" - Scenario: Engine compile_yaml keeps a post-parse placeholder for valid YAML + Scenario: Engine compile_yaml produces a plan with formula for valid search rule Given an engine with default configuration When YAML "rules:\n - id: demo.rule\n message: detect foo\n languages: [rust]\n severity: ERROR\n pattern: foo($X)\n" is compiled - Then compilation fails with code "NOT_IMPLEMENTED" - And the first diagnostic message contains "normalization" + Then compilation succeeds + And the result contains 1 query plan + And the first plan has a formula - Scenario: Engine compile_yaml keeps the placeholder for dependency search rules + Scenario: Engine compile_yaml produces a plan without formula for dependency rules Given an engine with default configuration When YAML "rules:\n - id: demo.depends\n message: detect vulnerable dependency\n languages: [python]\n severity: WARNING\n r2c-internal-project-depends-on:\n namespace: pypi\n package: requests\n" is compiled - Then compilation fails with code "NOT_IMPLEMENTED" - And the first diagnostic message contains "normalization" + Then compilation succeeds + And the result contains 1 query plan + And the first plan has no formula Scenario: Engine compile_yaml rejects extract mode during execution gating Given an engine with default configuration diff --git a/docs/execplans/4-1-5-normalization-into-canonical-formula-model.md b/docs/execplans/4-1-5-normalization-into-canonical-formula-model.md new file mode 100644 index 00000000..00333129 --- /dev/null +++ b/docs/execplans/4-1-5-normalization-into-canonical-formula-model.md @@ -0,0 +1,875 @@ +# 4.1.5 Implement legacy and v2 normalization into canonical Formula model + +This ExecPlan (execution plan) is a living document. The sections +`Constraints`, `Tolerances`, `Risks`, `Progress`, `Surprises & Discoveries`, +`Decision Log`, and `Outcomes & Retrospective` must be kept up to date as work +proceeds. + +Status: COMPLETE + +## Purpose / big picture + +After this change, the Sempai compilation pipeline will lower both legacy +Semgrep operators (`pattern`, `patterns`, `pattern-either`, `pattern-not`, +`pattern-inside`, `pattern-not-inside`, `pattern-not-regex`, +`semgrep-internal-pattern-anywhere`) and v2 `match` operators (`pattern`, +`regex`, `all`, `any`, `not`, `inside`, `anywhere`) into one canonical +`Formula` enum defined in `sempai_core`. Two parser-enforced semantic +constraint checks will reject invalid formula shapes deterministically: + +- `E_SEMPAI_INVALID_NOT_IN_OR`: a negated branch inside `pattern-either` or + `any`. +- `E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND`: a conjunction (`patterns` or `all`) + with no positive match-producing term (except in metavariable-pattern + contexts, which are preserved as opaque constraints for now). + +Observable user-facing behaviour after implementation: + +- `Engine::compile_yaml(...)` accepts valid `search` rules and produces + `Vec` containing a normalised `Formula` for each rule, instead of + returning `NOT_IMPLEMENTED`. +- Paired legacy and v2 YAML fixtures that express equivalent queries normalise + to structurally identical `Formula` values. +- Semantically invalid rules (negation in disjunction, missing positive term in + conjunction) emit deterministic rule diagnostics with stable error codes and + source spans pointing at the offending formula node. +- Legacy `LegacyClause::Constraint` objects are preserved as opaque + `Formula::Constraint` nodes rather than dropped or rejected, so constraint + evaluation can be added in a later milestone. +- v2 `Decorated` formulas have their `where`, `as`, and `fix` metadata carried + through to the canonical `Decorated` wrapper. +- `Engine::execute(...)` continues to return `NOT_IMPLEMENTED` until backend + work lands in 4.2.x. + +Observable completion evidence: + +```plaintext +set -o pipefail; cargo test -p sempai_core 2>&1 | tee /tmp/4-1-5-sempai-core-test.log +set -o pipefail; cargo test -p sempai 2>&1 | tee /tmp/4-1-5-sempai-test.log +set -o pipefail; make check-fmt 2>&1 | tee /tmp/4-1-5-make-check-fmt.log +set -o pipefail; make lint 2>&1 | tee /tmp/4-1-5-make-lint.log +set -o pipefail; make test 2>&1 | tee /tmp/4-1-5-make-test.log +``` + +Because this milestone updates Markdown documentation and the roadmap: + +```plaintext +set -o pipefail; make fmt 2>&1 | tee /tmp/4-1-5-make-fmt.log +set -o pipefail; make markdownlint 2>&1 | tee /tmp/4-1-5-make-markdownlint.log +set -o pipefail; make nixie 2>&1 | tee /tmp/4-1-5-make-nixie.log +``` + +## Constraints + +- Follow the normalised formula model from + [docs/sempai-query-language-design.md](../sempai-query-language-design.md) + section "Normalised formula model" and the operator mappings in + [docs/semgrep-language-reference/semgrep-operator-precedence.md](../semgrep-language-reference/semgrep-operator-precedence.md) + and + [docs/semgrep-language-reference/semgrep-legacy-vs-v2-guidance.md](../semgrep-language-reference/semgrep-legacy-vs-v2-guidance.md). +- The canonical `Formula` enum and its `Decorated` wrapper live in + `sempai_core`, not in `sempai_yaml` or `sempai`. This keeps the formula model + available to all downstream crates including the future `sempai_ts` backend. +- Normalization must be a pure function from `SearchQueryPrincipal` to + `Result`, with no side-effects and no I/O. +- Preserve the existing parser-vs-engine validation boundary: `sempai_yaml` + owns YAML shape and structural validation; the normalization pass in + `sempai_core` owns semantic constraint checks on the canonical formula. +- Parser-level types (`LegacyFormula`, `MatchFormula`, `LegacyValue`, + `LegacyClause`) remain in `sempai_yaml`. Do not remove or restructure them. +- Semantic constraint checks must use the existing diagnostic codes + `ESempaiInvalidNotInOr` and `ESempaiMissingPositiveTermInAnd` from + `sempai_core::DiagnosticCode`. +- Legacy `LegacyClause::Constraint` objects must be preserved as opaque nodes + (`Formula::Constraint(serde_json::Value)`) so metavariable-regex, + metavariable-pattern, and similar constraints can be evaluated in a later + milestone. +- The `QueryPlan` struct in `sempai/src/engine.rs` must carry a real `Formula` + instead of the current `_plan: ()` placeholder. +- `ProjectDependsOn` search principals must be preserved as-is and skip + normalization, since they have no formula semantics. +- Tests must include unit tests and BDD tests using `rstest-bdd` v0.5.0, + covering happy paths, unhappy paths (semantic constraint violations), and + edge cases (empty conjunctions, single-element disjunctions, deeply nested + formulas, decorated v2 nodes). +- Keep files below 400 lines. Split the normalization module, semantic + validator, and test helpers into focused modules. +- Every new module must begin with a `//!` module comment. New public items must + carry Rustdoc with examples where appropriate. +- Use en-GB-oxendict spelling and grammar throughout code, comments, and + documentation. +- Record any design decisions in + [docs/sempai-query-language-design.md](../sempai-query-language-design.md). +- Update [docs/users-guide.md](../users-guide.md) with the user-visible change + in `compile_yaml(...)` behaviour. +- Mark roadmap item 4.1.5 done in [docs/roadmap.md](../roadmap.md) only after + all tests and quality gates pass. +- `make check-fmt`, `make lint`, and `make test` must all succeed. + +## Tolerances (exception triggers) + +- Scope: if implementation requires touching more than 18 net files outside + `crates/sempai-core/`, `crates/sempai/`, `crates/sempai-yaml/`, and the four + required docs, stop and escalate. +- Interface: if normalisation requires a breaking change to + `sempai_yaml::SearchQueryPrincipal`, `sempai_yaml::LegacyFormula`, + `sempai_yaml::MatchFormula`, or `sempai::Engine::compile_yaml`, stop and + present the least disruptive options before proceeding. +- Dependency: if the canonical `Formula` requires a new workspace dependency + beyond `serde_json` (already available), stop and escalate. +- Iterations: if the same failing lint or test loop is attempted five times + without a clear path forward, stop and escalate. +- Ambiguity: if the Semgrep schema and the design document disagree on whether + `pattern-not-inside` normalises to `Not(Inside(...))` versus a distinct + `NotInside(...)` variant, stop and escalate with competing interpretations. + +## Risks + +- Risk: the existing `LegacyValue` type wraps either a string or a nested + formula, but the canonical `Formula` expects atoms. Normalization must + handle both forms. Severity: medium. Likelihood: high. Mitigation: treat + `LegacyValue::String(s)` as `Formula::Atom(Atom::Pattern(s))` during + lowering. + +- Risk: `MatchFormula::Pattern(s)` and `MatchFormula::PatternObject(s)` are + semantically identical (both are pattern atoms) but structurally different. + The normaliser must unify them. Severity: low. Likelihood: certain. + Mitigation: map both to `Formula::Atom(Atom::Pattern(s))`. + +- Risk: the `MissingPositiveTermInAnd` check must not reject conjunctions + where the only positive term comes from a metavariable-pattern constraint. + The Semgrep spec allows this as a special case. Severity: medium. Likelihood: + medium. Mitigation: treat `LegacyClause::Constraint` as a non-positive, + non-negative term for the purpose of the positive-term check, and add a + fixture specifically testing the metavariable-pattern exception. + +- Risk: `Decorated` v2 formulas nest decorators around formulas. Normalization + must preserve decoration while lowering the inner formula. Severity: medium. + Likelihood: high. Mitigation: normalise the inner formula recursively, then + wrap in `Decorated` carrying the `where`, `as`, and `fix` metadata. + +- Risk: the positive-term check interacts with `Inside` and `Anywhere` + semantics. Per the design doc, these are constraints, not match-producers, so + they do not count as positive terms. Severity: medium. Likelihood: medium. + Mitigation: follow the design doc classification precisely and add explicit + test fixtures for `all: [inside(...)]` (should fail) and + `all: [pattern(...), inside(...)]` (should pass). + +## Progress + +- [x] Reviewed roadmap item 4.1.5, the Sempai design doc, the Semgrep operator + precedence and legacy-vs-v2 guidance, the current `sempai_yaml` model, and + the 4.1.4 ExecPlan. +- [x] Drafted this ExecPlan. +- [x] Stage A: Defined canonical `Formula`, `Atom`, `Decorated` in + `sempai_core/src/formula.rs`. Added `serde_json` as a regular dependency + to `sempai_core`. Exported from `sempai_core/src/lib.rs`. +- [x] Stage B: Implemented normalization module in `sempai/src/normalise/` with + submodules `legacy.rs`, `v2.rs`, and `constraints.rs`. Followed Option B + from Step 7 (normalization in `sempai` to avoid circular dependencies). +- [x] Stage C: Implemented `validate_formula_constraints(...)` with depth-first + walk checking `InvalidNotInOr` and `MissingPositiveTermInAnd`. Added + metavariable-pattern exception for all-constraint conjunctions. +- [x] Stage D: Added 17 formula type unit tests in `sempai_core`, 25 + normalization unit tests in `sempai` (including 3 paired equivalence + tests via `rstest`), and 14 constraint validation unit tests. +- [x] Stage D (BDD): Added 2 semantic constraint BDD scenarios in + `formula_normalization.feature` and updated 2 existing scenarios in + `sempai_engine.feature` from `NOT_IMPLEMENTED` expectations to formula + plan assertions. +- [x] Stage E: Wired normalization into `Engine::compile_yaml`, updated + `QueryPlan` to carry `Option` with a `formula()` accessor. + Removed `#[cfg(test)]` gate from `QueryPlan::new`. +- [x] Stage F: Updated `sempai-query-language-design.md` with implementation + notes (crate placement, `pattern-not-inside` lowering, constraint + preservation, positive-term exception, `ProjectDependsOn` passthrough). + Updated `users-guide.md` with `compile_yaml` behaviour and semantic + constraint error codes. Marked 4.1.5 done in `roadmap.md`. +- [x] Stage G: All quality gates pass — `make check-fmt`, `make lint`, + `make test` (199 tests), `make markdownlint` (0 errors). `make nixie` + unavailable in environment (custom Mermaid tool); no Mermaid diagrams + were modified. + +## Surprises & Discoveries + +1. **BDD step function `\n` escaping**: The `QuotedString` parser from + `rstest-bdd` does not interpret `\n` escape sequences — it strips + surrounding quotes but passes the literal `\n` characters through. + The `sempai_yaml` BDD tests already worked around this with + `.replace("\\n", "\n")` in their step definitions. The same + workaround was applied to the `sempai` BDD step definitions. + +2. **BDD step registry is global**: `rstest-bdd` v0.5.0 registers step + functions in a global registry keyed by step text. Defining the same + step (e.g. `"Then compilation succeeds"`) in two different BDD + modules causes a runtime panic due to duplicate registration. All + normalization BDD scenarios were therefore placed in the existing + `behaviour.rs` and registered via a second `#[scenario]` attribute + pointing at `formula_normalization.feature`. + +3. **`serde_json` upgrade to workspace dependency**: `serde_json` was + already available as a workspace dependency but only as a + dev-dependency in `sempai_core`. Adding it as a regular dependency + was straightforward with no version conflicts. + +## Decision Log + +1. **Normalization placement: Option B confirmed.** Normalization functions + live in `crates/sempai/src/normalise/` (not in `sempai_core`) because + they depend on `sempai_yaml` model types, and `sempai_yaml` depends on + `sempai_core`. Placing them in `sempai` avoids the circular dependency. + The canonical `Formula` type remains in `sempai_core` for downstream + consumption. + +2. **`normalise_search_principal` returns `Option`.** Rather than + returning `Result`, the function returns + `Result, DiagnosticReport>` where `None` represents + `ProjectDependsOn` rules that have no formula semantics. This avoids + needing a sentinel `Formula` variant for non-formula rules. + +3. **`QueryPlan::new` no longer test-only.** The `#[cfg(test)]` gate was + removed since `Engine::compile_yaml` now constructs real `QueryPlan` + values in production code. + +4. **Constraint-only conjunction exception.** A conjunction (`And`) where + every child is `Constraint` is accepted without triggering + `MissingPositiveTermInAnd`. This accommodates metavariable-pattern + contexts where constraints act as implicit positive terms. + +5. **Top-level `Decorated` metadata from v2.** When `normalise_match` + encounters a v2 `Decorated` variant, the metadata (`where`, `as`, + `fix`) is preserved in the resulting `Decorated`. However, + `normalise_search_principal` returns only the inner `Formula` node + (stripping the outer `Decorated` wrapper) because the top-level plan + needs just the `Formula`. Metadata is preserved when `Decorated` + appears as a child within `And`/`Or`/`Not` etc. + +## Outcomes & Retrospective + +Target outcome at completion: + +1. `sempai_core` exports a canonical `Formula` enum, `Atom` enum, and + `Decorated` wrapper as stable public types. +2. `sempai_core` exports a `normalise_search_principal(...)` function that + lowers `SearchQueryPrincipal` to `Formula`. +3. `sempai_core` exports a `validate_formula_constraints(...)` function that + checks semantic constraints and returns `DiagnosticReport` on violation. +4. Paired legacy and v2 test fixtures produce structurally equal `Formula` + values after normalization. +5. Invalid formula shapes (`Not` in `Or`, missing positive term in `And`) emit + deterministic diagnostics with the correct `E_SEMPAI_*` codes. +6. `Engine::compile_yaml(...)` returns `Ok(Vec)` for valid search + rules, with each plan carrying a normalised `Formula`. +7. `Engine::execute(...)` still returns `NOT_IMPLEMENTED` until 4.2.x. +8. `docs/sempai-query-language-design.md` records the normalization mapping and + any implementation decisions. +9. `docs/users-guide.md` documents the change in `compile_yaml(...)` behaviour. +10. `docs/roadmap.md` marks 4.1.5 done. +11. `make fmt`, `make markdownlint`, `make nixie`, `make check-fmt`, + `make lint`, and `make test` all pass. + +Retrospective notes: + +**What went well:** + +- The plan's dependency graph (Steps 1→2→3→5→6→8→9) held throughout. No + backtracking was required between stages. +- Option B (normalization in `sempai`) was the correct crate-placement call. + The alternative (Option A, normalization in `sempai_core`) would have + introduced a circular dependency with `sempai_yaml`. +- The paired equivalence tests (legacy ↔ v2 producing identical `Formula` + values) gave high confidence that the two lowering paths are consistent. +- The metavariable-pattern exception (all-constraint conjunction) was + anticipated in the plan and handled smoothly. +- The 400-line file limit was maintained across all new modules by splitting + normalization into `legacy.rs`, `v2.rs`, `constraints.rs`, and `mod.rs`. + +**What was harder than expected:** + +- `rstest-bdd` v0.5.0's global step registry caused a runtime panic when + step definitions were split across two modules. The workaround (a second + `#[scenario]` attribute on the existing `behaviour.rs` test function) + was effective but non-obvious. This should be documented as a project + gotcha. +- The `QuotedString` type from `rstest-bdd` does not unescape `\n`, which + caused BDD scenarios with multi-line YAML to fail silently. The fix + (`.replace("\\n", "\n")`) mirrors the existing `sempai_yaml` BDD tests + but cost a debugging cycle to rediscover. +- Clippy's `excessive_nesting` and `manual_let_else` lints required + extracting a helper function and restructuring control flow in + `engine.rs`. This was ultimately beneficial for readability. + +**Outcomes vs. targets:** + +All eleven target outcomes from the plan are met: + +1. ✓ `sempai_core` exports `Formula`, `Atom`, `Decorated`. +2. ✓ Normalization dispatches through `normalise_search_principal()` in + `sempai` (not `sempai_core`, per Option B — the plan's public API + section listed `sempai_core` but Decision Log entry 1 corrected this). +3. ✓ `validate_formula_constraints()` checks both semantic invariants. +4. ✓ Three paired equivalence tests confirm legacy ↔ v2 structural + equality. +5. ✓ Invalid formula shapes emit `ESempaiInvalidNotInOr` and + `ESempaiMissingPositiveTermInAnd` diagnostics. +6. ✓ `Engine::compile_yaml()` returns `Ok(Vec)` with formulas. +7. ✓ `Engine::execute()` still returns `NOT_IMPLEMENTED`. +8. ✓ `sempai-query-language-design.md` updated with implementation notes. +9. ✓ `users-guide.md` documents the new `compile_yaml` behaviour. +10. ✓ `roadmap.md` marks 4.1.5 as `[x]`. +11. ✓ `make check-fmt`, `make lint`, `make test`, `make markdownlint` all + pass. `make nixie` unavailable in environment. + +**Test coverage summary:** + +- 17 formula type unit tests (`sempai_core`). +- 25 normalization unit tests (`sempai`), including 3 paired + `#[rstest]` `#[case]` equivalence tests. +- 14 semantic constraint unit tests (`sempai`). +- 2 BDD scenarios for semantic constraint rejection. +- 2 updated BDD scenarios for engine integration. +- 199 total workspace tests passing. + +## Context and orientation + +Current files that matter for this milestone: + +- [crates/sempai-core/src/lib.rs](../../crates/sempai-core/src/lib.rs) — + re-exports; will expose the new formula module. +- `crates/sempai-core/src/diagnostic.rs` — already defines + `ESempaiInvalidNotInOr` and `ESempaiMissingPositiveTermInAnd`. +- [crates/sempai-yaml/src/model.rs](../../crates/sempai-yaml/src/model.rs) — + `SearchQueryPrincipal`, `LegacyFormula`, `MatchFormula`, `LegacyClause`, + `LegacyValue`, `MatchFormula::Decorated`. +- [crates/sempai/src/engine.rs](../../crates/sempai/src/engine.rs) — `Engine`, + `QueryPlan`, and the `NOT_IMPLEMENTED` placeholder at line 116. +- `crates/sempai/src/mode_validation.rs` — mode-aware gating + from 4.1.4. +- `crates/sempai/src/tests/engine_tests.rs` — existing engine + test coverage. +- `crates/sempai/tests/features/sempai_engine.feature` — + existing BDD scenarios for the engine. +- `crates/sempai/src/tests/behaviour.rs` — BDD step + definitions for the engine. +- `docs/sempai-query-language-design.md` — design reference, + records implementation decisions. +- `docs/semgrep-language-reference/semgrep-operator-precedence.md` + — operator mapping and precedence ladder. +- `docs/semgrep-language-reference/semgrep-legacy-vs-v2-guidance.md` + — legacy-to-v2 equivalence table. +- [docs/users-guide.md](../users-guide.md) — user-facing documentation. +- [docs/roadmap.md](../roadmap.md) — roadmap state tracking. + +Current behaviour to preserve or intentionally change: + +- `parse_rule_file(...)` produces `RuleFile` with `Rule` and + `SearchQueryPrincipal` for search-mode rules. This remains unchanged. +- `validate_supported_modes(...)` rejects non-search modes with + `E_SEMPAI_UNSUPPORTED_MODE`. This remains unchanged. +- `compile_yaml(...)` currently returns `NOT_IMPLEMENTED` for all valid search + rules after mode validation. This will be replaced with normalization and + semantic validation, producing real `QueryPlan` values on success. +- `QueryPlan._plan` is currently `()`. This will be replaced with `Formula`. + +## Plan of work + +### Stage A: Define the canonical Formula model in sempai_core + +Add the `Formula`, `Atom`, and `Decorated` types to `sempai_core` with full +Rustdoc documentation. These types follow the design document's normalised +formula model. + +### Stage B: Implement normalization functions + +Implement `normalise_legacy(...)` and `normalise_match(...)` functions that +lower `LegacyFormula` and `MatchFormula` respectively into `Formula`. Add a +top-level `normalise_search_principal(...)` dispatcher. These are pure +functions in `sempai_core` that depend on `sempai_yaml` model types. + +### Stage C: Implement semantic constraint checks + +Add `validate_formula_constraints(...)` that walks the normalised `Formula` +tree and checks: + +- No `Not` children inside `Or` nodes (`InvalidNotInOr`). +- Every `And` node has at least one positive term (`MissingPositiveTermInAnd`). + +### Stage D: Lock expected behaviour with tests first + +Add unit tests and BDD scenarios before wiring the normalization into the +engine, so the intended behaviour is explicit and verifiable. + +### Stage E: Wire normalization into Engine::compile_yaml + +Replace the `NOT_IMPLEMENTED` placeholder in `Engine::compile_yaml(...)` with +the normalization and validation pipeline. Update `QueryPlan` to carry a real +`Formula`. + +### Stage F: Update design docs, user docs, and roadmap + +Synchronise living documentation with the implementation. + +### Stage G: Run the full gate sequence and capture evidence + +Run targeted crate tests and the full repository gates. + +## Concrete steps + +### Step 1: Define canonical Formula types (Stage A) + +Create `crates/sempai-core/src/formula.rs`: + +```rust +/// Canonical normalised formula shared by legacy and v2 paths. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Formula { + Atom(Atom), + Not(Box>), + Inside(Box>), + Anywhere(Box>), + And(Vec>), + Or(Vec>), + /// Opaque constraint preserved for later evaluation. + Constraint(serde_json::Value), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Atom { + Pattern(String), + Regex(String), + TreeSitterQuery(String), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Decorated { + pub node: T, + pub where_clauses: Vec, + pub as_name: Option, + pub fix: Option, +} +``` + +Export from `sempai_core/src/lib.rs`. Keep file under 400 lines. + +### Step 2: Implement normalization module (Stages B and C) + +Create `crates/sempai-core/src/normalise.rs` (or split across +`normalise/mod.rs`, `normalise/legacy.rs`, `normalise/v2.rs` if needed to +stay within the 400-line limit). + +Key mappings: + +**Legacy to Formula:** + +| Legacy | Formula | +| --- | --- | +| `Pattern(s)` | `Atom(Pattern(s))` | +| `PatternRegex(s)` | `Atom(Regex(s))` | +| `Patterns(clauses)` | `And(normalised_clauses)` | +| `PatternEither(branches)` | `Or(normalised_branches)` | +| `PatternNot(val)` | `Not(normalise_value(val))` | +| `PatternInside(val)` | `Inside(normalise_value(val))` | +| `PatternNotInside(val)` | `Not(Inside(normalise_value(val)))` | +| `PatternNotRegex(s)` | `Not(Atom(Regex(s)))` | +| `Anywhere(val)` | `Anywhere(normalise_value(val))` | +| `LegacyValue::String(s)` | `Atom(Pattern(s))` | +| `LegacyValue::Formula(f)` | `normalise_legacy(f)` | +| `LegacyClause::Constraint(v)` | `Constraint(v)` | + +**v2 Match to Formula:** + +| v2 Match | Formula | +| --- | --- | +| `Pattern(s)` | `Atom(Pattern(s))` | +| `PatternObject(s)` | `Atom(Pattern(s))` | +| `Regex(s)` | `Atom(Regex(s))` | +| `All(items)` | `And(normalised_items)` | +| `Any(items)` | `Or(normalised_items)` | +| `Not(inner)` | `Not(normalise_match(inner))` | +| `Inside(inner)` | `Inside(normalise_match(inner))` | +| `Anywhere(inner)` | `Anywhere(normalise_match(inner))` | +| `Decorated { formula, where_, as_, fix }` | `Decorated { normalise_match(formula), where_, as_, fix }` | + +Implement `validate_formula_constraints(formula: &Formula)`: + +- Walk the tree recursively. +- For `Or(branches)`: check that no direct child (after unwrapping + `Decorated`) is a `Not(...)`. If found, emit + `E_SEMPAI_INVALID_NOT_IN_OR`. +- For `And(terms)`: check that at least one direct child is a positive term + (i.e., not `Not`, not `Inside`, not `Anywhere`, not `Constraint`). If none + found, emit `E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND`. + +### Step 3: Add unit tests for normalization (Stage D) + +Create `crates/sempai-core/src/tests/normalise_tests.rs` with `rstest` +parameterised cases: + +- Legacy `pattern` → `Atom(Pattern(...))`. +- Legacy `pattern-regex` → `Atom(Regex(...))`. +- Legacy `patterns` with mixed positive and negative terms → + `And([Atom(...), Not(...)])`. +- Legacy `pattern-either` with two branches → + `Or([Atom(...), Atom(...)])`. +- Legacy `pattern-not-inside` → `Not(Inside(Atom(...)))`. +- Legacy `pattern-not-regex` → `Not(Atom(Regex(...)))`. +- Legacy `semgrep-internal-pattern-anywhere` → `Anywhere(Atom(...))`. +- v2 `match: "foo"` → `Atom(Pattern("foo"))`. +- v2 `all` with `not` and `pattern` → `And([Atom(...), Not(...)])`. +- v2 `any` with two patterns → `Or([Atom(...), Atom(...)])`. +- v2 decorated formula carries `where`, `as`, `fix` metadata. +- Paired equivalence: legacy `patterns` vs v2 `all` produce the same + `Formula`. +- Paired equivalence: legacy `pattern-either` vs v2 `any` produce the same + `Formula`. +- Deep nesting: `pattern-either` containing `patterns` containing + `pattern-not`. + +### Step 4: Add unit tests for semantic constraints (Stage D) + +Create `crates/sempai-core/src/tests/constraint_tests.rs`: + +- `Or` with `Not` child → `E_SEMPAI_INVALID_NOT_IN_OR`. +- `Or` without `Not` children → passes. +- `And` with no positive terms → `E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND`. +- `And` with one positive and one negative → passes. +- `And` with only `Inside` and `Anywhere` → fails (not positive). +- `And` with only `Constraint` objects → passes (metavariable-pattern + exception: constraints alone do not trigger the positive-term violation, + because they may be metavariable-pattern constraints that act as implicit + positive terms). +- Nested: `And` inside `Or` where the `And` has no positive term → fails. + +### Step 5: Add BDD scenarios (Stage D) + +Add feature file +`crates/sempai/tests/features/formula_normalization.feature` (or extend the +existing `sempai_engine.feature`) with scenarios: + +```gherkin +Feature: Formula normalization + + Scenario: Legacy pattern normalises to atom + Given a valid search rule with a legacy "pattern" principal + When the rule is compiled + Then the query plan contains a pattern atom + + Scenario: v2 match string normalises to atom + Given a valid search rule with a v2 match string principal + When the rule is compiled + Then the query plan contains a pattern atom + + Scenario: Legacy and v2 conjunction produce equivalent formulas + Given a legacy "patterns" rule and an equivalent v2 "all" rule + When both rules are compiled + Then both query plans contain structurally equal formulas + + Scenario: Negation inside disjunction is rejected + Given a search rule with a "not" inside "pattern-either" + When the rule is compiled + Then compilation fails with E_SEMPAI_INVALID_NOT_IN_OR + + Scenario: Conjunction without positive term is rejected + Given a search rule with "patterns" containing only "pattern-not" + When the rule is compiled + Then compilation fails with E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND +``` + +Add corresponding step definitions in +`crates/sempai/src/tests/behaviour.rs` (or a new +`crates/sempai/src/tests/normalise_behaviour.rs` if the file exceeds 400 +lines). + +### Step 6: Wire normalization into Engine (Stage E) + +Update `crates/sempai/src/engine.rs`: + +- Import `normalise_search_principal` and `validate_formula_constraints` from + `sempai_core`. +- In `compile_yaml(...)`: + 1. Parse YAML → `RuleFile`. + 2. Validate supported modes. + 3. For each search rule, extract its `SearchQueryPrincipal`. + 4. If `ProjectDependsOn`, skip normalization and produce a plan without a + formula (or a sentinel marker). + 5. Call `normalise_search_principal(principal)` → `Formula`. + 6. Call `validate_formula_constraints(&formula)` → check for violations. + 7. Construct `QueryPlan` with the rule's ID, language, and `Formula`. + 8. Return `Ok(plans)`. + +Update `QueryPlan`: + +- Replace `_plan: ()` with `formula: Option`. +- `None` represents a `ProjectDependsOn` rule with no formula. +- Add `pub fn formula(&self) -> Option<&Formula>` accessor. +- Remove the `#[cfg(test)]` gate on `QueryPlan::new`. + +### Step 7: Ensure sempai_core depends on sempai_yaml types (Stage B) + +The normalization functions in `sempai_core` accept `sempai_yaml` types +(`SearchQueryPrincipal`, `LegacyFormula`, `MatchFormula`). This creates a +dependency `sempai_core → sempai_yaml`. If this introduces a circular +dependency (since `sempai_yaml` depends on `sempai_core` for `SourceSpan` and +diagnostics), resolve it by one of: + +- **Option A (preferred):** Keep normalization in `sempai_core` and accept the + `sempai_yaml` dependency. This requires `sempai_yaml` to depend on + `sempai_core` for diagnostics, and `sempai_core` to depend on `sempai_yaml` + for model types. This is circular and **not viable**. +- **Option B:** Place normalization in the `sempai` facade crate, which already + depends on both. Normalization functions live in `crates/sempai/src/normalise/` + and convert from `sempai_yaml` types to `sempai_core::Formula`. +- **Option C:** Place normalization in a new `sempai_core::normalise` module + but define the conversion traits such that the actual `From` impls live in + `sempai` or `sempai_yaml`. + +**Resolution:** Option B is the most pragmatic. The normalization module lives +in `crates/sempai/src/normalise/` and the `Formula` type itself lives in +`sempai_core`. This avoids circular dependencies and keeps the canonical type +in the core crate. + +### Step 8: Update documentation (Stage F) + +- Update + [docs/sempai-query-language-design.md](../sempai-query-language-design.md): + - Add an implementation note recording the normalization mapping. + - Document the `pattern-not-inside` → `Not(Inside(...))` lowering. + - Document the constraint-preservation strategy. + - Record the crate-placement decision (Formula in `sempai_core`, + normalization in `sempai`). +- Update [docs/users-guide.md](../users-guide.md): + - Document that `compile_yaml(...)` now returns real query plans for valid + search rules. + - List the semantic constraint error codes users may encounter. +- Update [docs/roadmap.md](../roadmap.md): + - Mark 4.1.5 done. + +### Step 9: Run full gate sequence (Stage G) + +```plaintext +set -o pipefail; cargo test -p sempai_core --all-targets --all-features 2>&1 | tee /tmp/4-1-5-sempai-core-test.log +set -o pipefail; cargo test -p sempai --all-targets --all-features 2>&1 | tee /tmp/4-1-5-sempai-test.log +set -o pipefail; make fmt 2>&1 | tee /tmp/4-1-5-make-fmt.log +set -o pipefail; make markdownlint 2>&1 | tee /tmp/4-1-5-make-markdownlint.log +set -o pipefail; make nixie 2>&1 | tee /tmp/4-1-5-make-nixie.log +set -o pipefail; make check-fmt 2>&1 | tee /tmp/4-1-5-make-check-fmt.log +set -o pipefail; make lint 2>&1 | tee /tmp/4-1-5-make-lint.log +set -o pipefail; make test 2>&1 | tee /tmp/4-1-5-make-test.log +``` + +## Dependency graph + +```plaintext +Step 1 (Formula types in sempai_core) + ├─→ Step 2 (normalization functions in sempai) + │ └─→ Step 3 (normalization unit tests) + │ └─→ Step 5 (BDD scenarios) + │ └─→ Step 6 (wire into Engine) + └─→ Step 4 (constraint unit tests) + └─→ Step 5 + └─→ Step 6 + └─→ Step 8 (docs) + └─→ Step 9 (gates) +``` + +Step 7 (dependency resolution) is performed during Step 2 and determines the +final module placement. + +## Test plan + +### Unit tests (sempai_core) + +- Formula type construction and equality. +- `Decorated` wrapper with and without metadata. +- `Atom` variants. + +### Unit tests (sempai — normalization) + +- Legacy → Formula mapping for each `LegacyFormula` variant. +- v2 Match → Formula mapping for each `MatchFormula` variant. +- Paired equivalence tests (legacy ↔ v2). +- `LegacyValue::String` vs `LegacyValue::Formula` handling. +- `LegacyClause::Constraint` → `Formula::Constraint`. +- Decorated v2 formulas carry metadata. +- Deep nesting. + +### Unit tests (sempai — semantic constraints) + +- `InvalidNotInOr` positive and negative cases. +- `MissingPositiveTermInAnd` positive and negative cases. +- Nested constraint violations. +- `Inside`/`Anywhere` not counted as positive terms. +- `Constraint` alone in `And` does not trigger violation. + +### BDD tests (sempai) + +- Happy paths: legacy pattern, v2 match, paired equivalence. +- Unhappy paths: negation in disjunction, missing positive term. +- Edge cases: empty rules array, `ProjectDependsOn` passthrough. + +### Integration (engine-level) + +- `compile_yaml(...)` returns `Ok(Vec)` for valid rules. +- `compile_yaml(...)` returns `Err(DiagnosticReport)` with correct codes for + invalid rules. +- Plans carry accessible `Formula` values. + +## Validation and acceptance + +- All tests in `cargo test -p sempai_core`, `cargo test -p sempai`, and + `make test` pass. +- `make check-fmt`, `make lint`, `make markdownlint`, and `make nixie` pass. +- `docs/roadmap.md` shows 4.1.5 as `[x]`. +- `docs/users-guide.md` documents the new `compile_yaml(...)` behaviour. +- `docs/sempai-query-language-design.md` contains implementation notes. + +## Idempotence and recovery + +All steps are re-runnable. If the implementation hits a blocker: + +- If the canonical `Formula` type needs to change, update `formula.rs` and + re-run all normalization and constraint tests. +- If the crate placement causes circular dependencies, switch to Option B + (normalization in `sempai`) per Step 7. +- If semantic constraint logic is incorrect, update `normalise.rs` and re-run + the constraint test suite. + +## Artefacts and notes + +Expected new files: + +- `crates/sempai-core/src/formula.rs` — canonical `Formula`, `Atom`, + `Decorated`. +- `crates/sempai/src/normalise.rs` (or `normalise/mod.rs` + sub-modules) — + normalization functions. +- `crates/sempai/src/normalise/legacy.rs` — legacy lowering. +- `crates/sempai/src/normalise/v2.rs` — v2 lowering. +- `crates/sempai/src/normalise/constraints.rs` — semantic constraint checks. +- `crates/sempai-core/src/tests/formula_tests.rs` — formula type tests. +- `crates/sempai/src/tests/normalise_tests.rs` — normalization unit tests. +- `crates/sempai/src/tests/constraint_tests.rs` — constraint check unit tests. +- `crates/sempai/tests/features/formula_normalization.feature` — BDD + scenarios. + +Expected modified files: + +- `crates/sempai-core/src/lib.rs` — new `formula` module export. +- `crates/sempai/src/engine.rs` — wire normalization, update `QueryPlan`. +- `crates/sempai/src/lib.rs` — re-export formula types if needed. +- `crates/sempai/src/tests/behaviour.rs` — new BDD step definitions. +- `docs/sempai-query-language-design.md` — implementation notes. +- `docs/users-guide.md` — user-visible behaviour change. +- `docs/roadmap.md` — mark 4.1.5 done. + +## Interfaces and dependencies + +### Public API additions (sempai_core) + +```rust +// crates/sempai-core/src/formula.rs + +pub enum Formula { ... } +pub enum Atom { ... } +pub struct Decorated { ... } +``` + +### Public API additions (sempai) + +```rust +// crates/sempai/src/normalise.rs (or mod.rs) + +pub fn normalise_search_principal( + principal: &SearchQueryPrincipal, +) -> Result; + +pub fn validate_formula_constraints( + formula: &Formula, +) -> Result<(), DiagnosticReport>; +``` + +### Modified API (sempai) + +```rust +// crates/sempai/src/engine.rs + +pub struct QueryPlan { + rule_id: String, + language: Language, + formula: Option, +} + +impl QueryPlan { + pub fn formula(&self) -> Option<&Formula>; +} +``` + +## Practice documentation + +The following project guidance documents are relevant to this milestone. +Implementors should consult them for conventions, patterns, and constraints. + +### Testing + +- [docs/rstest-bdd-users-guide.md](../rstest-bdd-users-guide.md) — BDD + framework usage: feature files, step definitions, fixture injection, + `Slot` state, `ScenarioState`, data tables, and `#[scenario]` + binding. All behavioural tests in this milestone use `rstest-bdd` + v0.5.0. +- [docs/rust-testing-with-rstest-fixtures.md](../rust-testing-with-rstest-fixtures.md) + — `rstest` fixture injection, `#[case]` parameterised tests, + `#[values]` combinatorial tests, `#[from]` and `#[with]` overrides. + Unit tests in the normalization and constraint modules should use + `rstest` parameterised cases for the mapping tables. +- [docs/reliable-testing-in-rust-via-dependency-injection.md](../reliable-testing-in-rust-via-dependency-injection.md) + — dependency injection patterns for testable Rust code. Normalization + functions are pure and need no DI, but the engine integration path + should remain injectable for future backend substitution. +- [docs/rust-doctest-dry-guide.md](../rust-doctest-dry-guide.md) — + DRY doctest patterns, `concat!()` for multi-line literals, hidden + setup lines, and `no_run` annotations. All new public types and + functions require Rustdoc examples. + +### Design and architecture + +- [docs/sempai-query-language-design.md](../sempai-query-language-design.md) + — primary design reference for the normalised formula model, operator + mappings, semantic constraints, and the parser-to-validation pipeline. + Implementation decisions must be recorded here. +- [docs/weaver-design.md](../weaver-design.md) — system architecture, + the observe/act/verify command model, JSONL protocol, and the + Double-Lock safety harness. Relevant for understanding how query + plans flow through the daemon. +- [docs/semgrep-language-reference/semgrep-operator-precedence.md](../semgrep-language-reference/semgrep-operator-precedence.md) + — Semgrep operator precedence ladder and Pratt binding powers. + Defines the normalization model that this milestone implements. +- [docs/semgrep-language-reference/semgrep-legacy-vs-v2-guidance.md](../semgrep-language-reference/semgrep-legacy-vs-v2-guidance.md) + — legacy-to-v2 equivalence table. Paired test fixtures should + mirror the equivalences documented here. + +### Code quality + +- [docs/complexity-antipatterns-and-refactoring-strategies.md](../complexity-antipatterns-and-refactoring-strategies.md) + — Cyclomatic and Cognitive Complexity metrics, the "Bumpy Road" + antipattern, extract-method and dispatcher-pattern refactoring + strategies. Relevant for keeping the recursive normalization and + constraint-walking functions simple and well-factored. +- [AGENTS.md](../../AGENTS.md) — code style, 400-line file limit, + en-GB spelling, commit gating (`make check-fmt`, `make lint`, + `make test`), module-level `//!` comments, and Rustdoc requirements. + +### Configuration + +- [docs/ortho-config-users-guide.md](../ortho-config-users-guide.md) + — OrthoConfig usage for CLI/env/file configuration merging. + Not directly exercised in normalization but relevant for + understanding how `EngineConfig` reaches the engine. + +## Revision note + +Initial draft created on 2026-04-12. Implements roadmap item 4.1.5 from +[docs/roadmap.md](../roadmap.md). + +- 2026-04-13: Implementation completed. Normalization placed in `sempai` + (Option B confirmed). All unit and BDD tests passing (199 total). All + quality gates pass. Documentation updated. `ExecPlan` status set to + COMPLETE. diff --git a/docs/roadmap.md b/docs/roadmap.md index af044c41..7a2fc56e 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -424,7 +424,7 @@ phase* *with explicit parser, backend, and Weaver-integration milestones.* - Acceptance criteria: unsupported execution modes return deterministic `UnsupportedMode` diagnostics, and search mode validation enforces required key combinations. -- [ ] 4.1.5. Implement legacy and v2 normalization into one canonical +- [x] 4.1.5. Implement legacy and v2 normalization into one canonical `Formula` model with semantic constraint checks. Requires 4.1.3. - Acceptance criteria: paired legacy and v2 fixtures normalize to equivalent formulas, and semantic invalid states emit deterministic rule diagnostics. diff --git a/docs/sempai-query-language-design.md b/docs/sempai-query-language-design.md index 429b0202..51244d10 100644 --- a/docs/sempai-query-language-design.md +++ b/docs/sempai-query-language-design.md @@ -428,12 +428,34 @@ semantics early. Implementation note (2026-03-29): `sempai::Engine::compile_yaml` now applies a mode-aware validation pass after parsing. Search rules continue to the -deliberate `NOT_IMPLEMENTED` normalization placeholder, while `extract`, -`taint`, `join`, and forward-compatible unknown modes fail deterministically -with `E_SEMPAI_UNSUPPORTED_MODE`. The first unsupported rule in source order is +normalization pipeline (see below), while `extract`, `taint`, `join`, and +forward-compatible unknown modes fail deterministically with +`E_SEMPAI_UNSUPPORTED_MODE`. The first unsupported rule in source order is reported, and its `primary_span` prefers the `mode` field span before falling back to the enclosing rule span. +Implementation note (2026-04-13): `sempai_core` now exports a canonical +`Formula` enum, `Atom` enum, and `Decorated` wrapper. The normalization +module in `crates/sempai/src/normalise/` lowers both legacy Semgrep operators +and v2 `match` operators into this representation. Key implementation +decisions: + +- **Crate placement**: `Formula` lives in `sempai_core::formula` for + downstream consumption. Normalization functions live in `sempai` + (`crates/sempai/src/normalise/`) because they depend on `sempai_yaml` + model types, and placing them in `sempai_core` would create a circular + dependency. +- **`pattern-not-inside` lowering**: normalises to `Not(Inside(...))` rather + than a distinct `NotInside` variant, matching the Semgrep specification's + desugaring semantics. +- **Constraint preservation**: legacy `LegacyClause::Constraint` objects are + preserved as `Formula::Constraint(serde_json::Value)` for later evaluation. +- **Positive-term exception**: conjunctions containing only `Constraint` + children do not trigger `E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND`, because + metavariable-pattern constraints may act as implicit positive terms. +- **`ProjectDependsOn` passthrough**: these rules produce `QueryPlan` values + with `formula: None` and skip normalization entirely. + Extract mode rules require legacy query keys, not `match`.[^1] ### Sempai extensions for Tree-sitter queries diff --git a/docs/users-guide.md b/docs/users-guide.md index 742dc14c..5a7706c9 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -1367,29 +1367,46 @@ The `Engine` struct exposes three methods for query compilation and execution: - `execute(plan, uri, source)` — executes a compiled plan against a source snapshot. -`compile_yaml(yaml)` now performs real YAML parsing plus a mode-aware -validation pass. Malformed YAML returns `E_SEMPAI_YAML_PARSE`, and schema-shape -failures such as missing required rule keys return `E_SEMPAI_SCHEMA_INVALID`, -both using the shared structured diagnostic payload with `primary_span` -locations when available. +`compile_yaml(yaml)` performs YAML parsing, mode-aware validation, and +formula normalization. Malformed YAML returns `E_SEMPAI_YAML_PARSE`, and +schema-shape failures such as missing required rule keys return +`E_SEMPAI_SCHEMA_INVALID`, both using the shared structured diagnostic +payload with `primary_span` locations when available. -After parsing succeeds, `compile_yaml(yaml)` now distinguishes parseable rules +After parsing succeeds, `compile_yaml(yaml)` distinguishes parseable rules from executable ones: -- Valid `search` rules, including compatibility-only - `r2c-internal-project-depends-on` rules, continue to the existing - `NOT_IMPLEMENTED` normalization placeholder. -- Valid `extract`, `taint`, `join`, and unknown future mode strings now fail - deterministically with `E_SEMPAI_UNSUPPORTED_MODE` instead of falling through - to the generic placeholder. - -Unsupported-mode diagnostics point at the rule's `mode` field when that span is -available, which makes whole-document failures deterministic even though the -execution backend is still pending. +- Valid `search` rules are normalised into a canonical `Formula` model. + Both legacy Semgrep operators (`pattern`, `patterns`, `pattern-either`, + `pattern-not`, `pattern-inside`, `pattern-not-inside`, `pattern-not-regex`, + `semgrep-internal-pattern-anywhere`) and v2 `match` operators (`pattern`, + `regex`, `all`, `any`, `not`, `inside`, `anywhere`) lower into the same + `Formula` tree. Paired legacy and v2 rules that express equivalent + queries produce structurally identical formulas. +- Compatibility-only `r2c-internal-project-depends-on` rules produce query + plans with no formula (`formula()` returns `None`), since they have no + formula semantics. +- Valid `extract`, `taint`, `join`, and unknown future mode strings fail + deterministically with `E_SEMPAI_UNSUPPORTED_MODE`. + +After normalization, semantic constraint checks validate the formula tree: + +- `E_SEMPAI_INVALID_NOT_IN_OR`: a negated formula inside a disjunction + (`pattern-either` / `any`) is not permitted. +- `E_SEMPAI_MISSING_POSITIVE_TERM_IN_AND`: a conjunction (`patterns` / + `all`) must contain at least one positive match-producing term. + +On success, `compile_yaml(yaml)` returns `Vec` with one plan +per rule per declared language. Each plan carries the normalised `Formula` +accessible via `plan.formula()`. + +Unsupported-mode diagnostics point at the rule's `mode` field when that +span is available, which makes whole-document failures deterministic even +though the execution backend is still pending. `compile_dsl(...)` and `execute(...)` still return "not implemented" -diagnostics. They will be wired to the DSL parser and Tree-sitter backend as -those components are delivered in subsequent roadmap phases. +diagnostics. They will be wired to the DSL parser and Tree-sitter backend +as those components are delivered in subsequent roadmap phases. All error conditions are reported through `DiagnosticReport`, which carries stable diagnostic codes suitable for programmatic consumption. Stub methods