From 8825b60d451a2cc090836a5557f5aa299bc8aba4 Mon Sep 17 00:00:00 2001 From: devjow Date: Tue, 17 Feb 2026 15:25:53 +0000 Subject: [PATCH 1/2] feat: gts validator resolves #58 Signed-off-by: devjow --- Cargo.lock | 118 ++---- Cargo.toml | 7 +- gts-validator/Cargo.toml | 41 ++ gts-validator/README.md | 49 +++ gts-validator/src/config.rs | 58 +++ gts-validator/src/error.rs | 112 +++++ gts-validator/src/format/json.rs | 336 +++++++++++++++ gts-validator/src/format/markdown.rs | 503 +++++++++++++++++++++++ gts-validator/src/format/mod.rs | 10 + gts-validator/src/format/yaml.rs | 231 +++++++++++ gts-validator/src/lib.rs | 125 ++++++ gts-validator/src/normalize.rs | 136 ++++++ gts-validator/src/output.rs | 125 ++++++ gts-validator/src/report.rs | 19 + gts-validator/src/strategy/fs.rs | 139 +++++++ gts-validator/src/strategy/mod.rs | 23 ++ gts-validator/src/validator.rs | 308 ++++++++++++++ gts-validator/tests/validate_fs_tests.rs | 336 +++++++++++++++ 18 files changed, 2592 insertions(+), 84 deletions(-) create mode 100644 gts-validator/Cargo.toml create mode 100644 gts-validator/README.md create mode 100644 gts-validator/src/config.rs create mode 100644 gts-validator/src/error.rs create mode 100644 gts-validator/src/format/json.rs create mode 100644 gts-validator/src/format/markdown.rs create mode 100644 gts-validator/src/format/mod.rs create mode 100644 gts-validator/src/format/yaml.rs create mode 100644 gts-validator/src/lib.rs create mode 100644 gts-validator/src/normalize.rs create mode 100644 gts-validator/src/output.rs create mode 100644 gts-validator/src/report.rs create mode 100644 gts-validator/src/strategy/fs.rs create mode 100644 gts-validator/src/strategy/mod.rs create mode 100644 gts-validator/src/validator.rs create mode 100644 gts-validator/tests/validate_fs_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 3a644da..c933e45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -303,6 +303,15 @@ version = "1.0.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "colored" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -609,6 +618,22 @@ dependencies = [ "uuid", ] +[[package]] +name = "gts-validator" +version = "0.7.8" +dependencies = [ + "anyhow", + "colored", + "glob", + "gts", + "regex", + "serde", + "serde-saphyr", + "serde_json", + "tempfile", + "walkdir", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1294,7 +1319,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2029,22 +2054,13 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.6", -] - [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.5", + "windows-targets", ] [[package]] @@ -2056,22 +2072,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" -dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", 
-] - [[package]] name = "windows-targets" version = "0.53.5" @@ -2079,106 +2079,58 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", - "windows_aarch64_gnullvm 0.53.1", - "windows_aarch64_msvc 0.53.1", - "windows_i686_gnu 0.53.1", - "windows_i686_gnullvm 0.53.1", - "windows_i686_msvc 0.53.1", - "windows_x86_64_gnu 0.53.1", - "windows_x86_64_gnullvm 0.53.1", - "windows_x86_64_msvc 0.53.1", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" - [[package]] name = "windows_aarch64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" - [[package]] name = "windows_aarch64_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" -[[package]] -name = "windows_i686_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" - [[package]] name = "windows_i686_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" -[[package]] -name = 
"windows_i686_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" - [[package]] name = "windows_i686_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" -[[package]] -name = "windows_i686_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" - [[package]] name = "windows_i686_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" - [[package]] name = "windows_x86_64_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" - [[package]] name = "windows_x86_64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" - [[package]] name = "windows_x86_64_msvc" version = "0.53.1" diff --git a/Cargo.toml b/Cargo.toml index 3c64dac..a651daf 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -9,7 +9,7 @@ categories = ["development-tools::build-utils"] readme = "README.md" [workspace] -members = ["gts", "gts-cli", "gts-id", "gts-macros", "gts-macros-cli"] +members = ["gts", "gts-cli", "gts-id", "gts-macros", "gts-macros-cli", "gts-validator"] resolver = "2" [workspace.lints.rust] @@ -145,6 +145,7 @@ gts-cli = { version = "0.8.1", path = "gts-cli" } gts-id = { version = "0.8.1", path = "gts-id" } gts-macros = { version = "0.8.1", path = "gts-macros" } gts-macros-cli = { version = "0.8.1", path = "gts-macros-cli" } +gts-validator = { version = "0.7.8", path = "gts-validator" } # Core dependencies serde = { version = "1.0", features = ["derive"] } @@ -175,6 +176,10 @@ schemars = { version = "1.2", features = ["uuid1"] } # File system walkdir = "2.5" +glob = "0.3" + +# CLI and terminal output +colored = "3.0" # Format parsing serde-saphyr = "0.0.10" diff --git a/gts-validator/Cargo.toml b/gts-validator/Cargo.toml new file mode 100644 index 0000000..dc473f3 --- /dev/null +++ b/gts-validator/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "gts-validator" +version = "0.7.8" +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true +description = "GTS identifier validator for documentation and configuration files" +keywords = ["gts", "validator", "documentation", "linting"] +categories.workspace = true +readme = "README.md" +publish = true + +[lints] +workspace = true + +[dependencies] +# GTS library for ID validation +gts.workspace = true + +# File system traversal +walkdir.workspace = true +glob.workspace = true + +# Regex for pattern matching +regex.workspace = true + +# Serialization +serde.workspace = true +serde_json.workspace = true +serde-saphyr.workspace = true + +# Error handling +anyhow.workspace = true + +# For colored terminal output +colored.workspace = true + +[dev-dependencies] +tempfile = "3.8" diff --git a/gts-validator/README.md 
b/gts-validator/README.md new file mode 100644 index 0000000..eae20be --- /dev/null +++ b/gts-validator/README.md @@ -0,0 +1,49 @@ +# gts-validator + +GTS identifier validator for documentation and configuration files (.md, .json, .yaml). + +## Overview + +`gts-validator` provides a library for validating GTS (Global Type System) identifiers found in documentation and configuration files. + +The crate provides a clean separation between: +- **Core validation engine** (input-agnostic): normalize → validate → report +- **Input strategies** (starting with filesystem scanning) + +## Usage + +```rust +use std::path::PathBuf; +use gts_validator::{validate_fs, FsSourceConfig, ValidationConfig}; + +let mut fs_config = FsSourceConfig::default(); +fs_config.paths = vec![PathBuf::from("docs"), PathBuf::from("modules")]; +fs_config.exclude = vec!["target/*".to_owned()]; + +let mut validation_config = ValidationConfig::default(); +validation_config.vendor = Some("x".to_owned()); + +let report = validate_fs(&fs_config, &validation_config).unwrap(); +println!("Files scanned: {}", report.files_scanned); +println!("Errors: {}", report.errors_count); +println!("OK: {}", report.ok); +``` + +## Output Formatting + +The crate includes output formatters for rendering validation reports: + +```rust +use gts_validator::output; + +// JSON output +let mut stdout = std::io::stdout(); +output::write_json(&report, &mut stdout).unwrap(); + +// Human-readable output (with color support) +output::write_human(&report, &mut stdout, true).unwrap(); +``` + +## License + +Apache-2.0 diff --git a/gts-validator/src/config.rs b/gts-validator/src/config.rs new file mode 100644 index 0000000..b2c296c --- /dev/null +++ b/gts-validator/src/config.rs @@ -0,0 +1,58 @@ +//! Configuration types for GTS validation. +//! +//! Split into core validation config (universal) and source-specific config +//! (how content is discovered). This ensures the core API does not leak +//! filesystem concerns. 
+ +use std::path::PathBuf; + +/// Core validation config — applies regardless of input source. +#[derive(Debug, Clone, Default)] +#[non_exhaustive] +pub struct ValidationConfig { + /// Expected vendor for all GTS IDs (e.g., "x"). + /// Example vendors (acme, globex, etc.) are always tolerated. + pub vendor: Option, + /// Scan JSON/YAML object keys for GTS identifiers (default: off). + pub scan_keys: bool, + /// Enable relaxed discovery (catches more candidates, including malformed ones). + /// + /// - `false` (default): only well-formed GTS patterns are discovered — fewer false positives. + /// - `true`: a permissive regex catches ALL gts.* strings, including malformed IDs, + /// so they can be reported as errors. Use this for strict CI enforcement. + pub strict: bool, + /// Additional skip tokens for markdown scanning. + /// If any of these strings appear before a GTS candidate on the same line, + /// validation is skipped for that candidate. Case-insensitive matching. + /// Example: `vec!["**given**".to_owned()]` to skip BDD-style bold formatting. + pub skip_tokens: Vec, +} + +/// Filesystem-specific source options. +/// +/// NOTE: `paths` is required and must be non-empty. Default scan roots +/// (e.g. `docs/modules/libs/examples`) are a CLI/wrapper concern, not +/// baked into the library — keeps `gts-validator` repo-layout-agnostic. +#[derive(Debug, Clone)] +#[non_exhaustive] +pub struct FsSourceConfig { + /// Paths to scan (files or directories). Required, must be non-empty. + pub paths: Vec, + /// Exclude patterns (glob format). + pub exclude: Vec, + /// Maximum file size in bytes (default: 10 MB). + pub max_file_size: u64, + /// Whether to follow symbolic links (default: true — preserves current behavior). 
+ pub follow_links: bool, +} + +impl Default for FsSourceConfig { + fn default() -> Self { + Self { + paths: Vec::new(), + exclude: Vec::new(), + max_file_size: 10_485_760, + follow_links: true, + } + } +} diff --git a/gts-validator/src/error.rs b/gts-validator/src/error.rs new file mode 100644 index 0000000..109525c --- /dev/null +++ b/gts-validator/src/error.rs @@ -0,0 +1,112 @@ +//! Error types for GTS validation. + +use std::path::PathBuf; + +use serde::Serialize; + +/// A single validation error found in a documentation/config file. +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[non_exhaustive] +pub struct ValidationError { + /// File path where the error was found + pub file: PathBuf, + /// Line number (1-indexed) — for .md files; 0 for structured files + pub line: usize, + /// Column number (1-indexed) — for .md files; 0 for structured files + pub column: usize, + /// JSON path (e.g., "$.properties.type.x-gts-ref") — for .json/.yaml files; empty for .md + pub json_path: String, + /// The original raw string that was found + pub raw_value: String, + /// The normalized GTS identifier (after stripping gts://, etc.) + pub normalized_id: String, + /// Human-readable error description + pub error: String, + /// Surrounding context (for .md: the line content; for .json/.yaml: the parent key) + pub context: String, +} + +impl ValidationError { + /// Format the error for human-readable output. 
+ /// + /// For markdown errors: `{file}:{line}:{column}: {error} [{raw_value}]` + /// For JSON/YAML errors: `{file}: {error} [{raw_value}] (at {json_path})` + #[must_use] + pub fn format_human_readable(&self) -> String { + if self.line > 0 && self.column > 0 { + // Markdown error with line/column + format!( + "{}:{}:{}: {} [{}]", + self.file.display(), + self.line, + self.column, + self.error, + self.raw_value + ) + } else if !self.json_path.is_empty() { + // JSON/YAML error with json_path + format!( + "{}: {} [{}] (at {})", + self.file.display(), + self.error, + self.raw_value, + self.json_path + ) + } else { + // Fallback: just file and error + format!( + "{}: {} [{}]", + self.file.display(), + self.error, + self.raw_value + ) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn test_format_markdown_error() { + let err = ValidationError { + file: PathBuf::from("docs/test.md"), + line: 42, + column: 10, + json_path: String::new(), + raw_value: "gts.invalid".to_owned(), + normalized_id: "gts.invalid".to_owned(), + error: "Invalid GTS ID".to_owned(), + context: "Some context".to_owned(), + }; + + let formatted = err.format_human_readable(); + assert!(formatted.contains("docs/test.md:42:10")); + assert!(formatted.contains("Invalid GTS ID")); + assert!(formatted.contains("[gts.invalid]")); + assert!(!formatted.contains("(at")); + } + + #[test] + fn test_format_json_error() { + let err = ValidationError { + file: PathBuf::from("config/test.json"), + line: 0, + column: 0, + json_path: "$.properties.type.x-gts-ref".to_owned(), + raw_value: "gts.invalid".to_owned(), + normalized_id: "gts.invalid".to_owned(), + error: "Invalid GTS ID".to_owned(), + context: "x-gts-ref".to_owned(), + }; + + let formatted = err.format_human_readable(); + assert!(formatted.contains("config/test.json")); + assert!(formatted.contains("Invalid GTS ID")); + assert!(formatted.contains("[gts.invalid]")); + assert!(formatted.contains("(at 
$.properties.type.x-gts-ref)")); + assert!(!formatted.contains(":0:0")); + } +} diff --git a/gts-validator/src/format/json.rs b/gts-validator/src/format/json.rs new file mode 100644 index 0000000..61701ee --- /dev/null +++ b/gts-validator/src/format/json.rs @@ -0,0 +1,336 @@ +//! JSON file scanner for GTS identifiers. +//! +//! Uses tree-walking to scan string values (not keys by default). + +use std::path::Path; + +use serde_json::Value; + +use crate::error::ValidationError; +use crate::normalize::normalize_candidate; +use crate::validator::validate_candidate; + +/// Scan JSON content for GTS identifiers. +pub fn scan_json_content( + content: &str, + path: &Path, + vendor: Option<&str>, + scan_keys: bool, +) -> Vec { + let value: Value = match serde_json::from_str(content) { + Ok(v) => v, + Err(_e) => return vec![], + }; + + let mut errors = Vec::new(); + walk_json_value(&value, path, vendor, &mut errors, "$", scan_keys); + errors +} + +/// Scan a JSON file for GTS identifiers (file-based convenience wrapper). +#[cfg(test)] +pub fn scan_json_file( + path: &Path, + vendor: Option<&str>, + max_file_size: u64, + scan_keys: bool, +) -> Vec { + // Check file size + if let Ok(metadata) = std::fs::metadata(path) + && metadata.len() > max_file_size + { + return vec![]; + } + + // Read as UTF-8; skip file on encoding error + let content = match std::fs::read_to_string(path) { + Ok(c) => c, + Err(_e) => return vec![], + }; + + scan_json_content(&content, path, vendor, scan_keys) +} + +/// Walk a JSON value tree and validate GTS identifiers in string values. +/// This is shared by both JSON and YAML scanners. +pub fn walk_json_value( + value: &Value, + path: &Path, + vendor: Option<&str>, + errors: &mut Vec, + json_path: &str, + scan_keys: bool, +) { + match value { + Value::String(s) => { + let candidate_str = s.as_str(); + let is_xgts_ref = json_path.ends_with(".x-gts-ref"); + + // PRE-FILTER: x-gts-ref special values that are NOT GTS identifiers. 
+ // These must be checked BEFORE normalization to avoid misleading errors. + // Spec section 9.6 defines allowed x-gts-ref values: + // - GTS identifier (gts.vendor.pkg...) + // - Wildcard pattern (gts.*) + // - Bare wildcard (*) + // - Relative JSON pointer (/$id, /properties/id, etc.) + if is_xgts_ref && (candidate_str.starts_with('/') || candidate_str == "*") { + return; // valid x-gts-ref value, not a GTS ID to validate + } + + // Only consider strings that look like GTS identifiers + // Skip filenames that contain GTS IDs (e.g., "gts.x.core.type.v1~.schema.json") + // A string is likely a filename if it contains a tilde followed by a dot and extension + let looks_like_filename = candidate_str.contains("~.") + && candidate_str + .rfind('.') + .is_some_and(|pos| pos > candidate_str.rfind('~').unwrap_or(0)); + + if (candidate_str.starts_with("gts://gts.") || candidate_str.starts_with("gts.")) + && !looks_like_filename + { + match normalize_candidate(candidate_str) { + Ok(candidate) => { + let allow_wildcards = is_xgts_ref; + let validation_errors = + validate_candidate(&candidate, vendor, allow_wildcards); + for err in validation_errors { + errors.push(ValidationError { + file: path.to_owned(), + line: 0, + column: 0, + json_path: json_path.to_owned(), + raw_value: candidate.original.clone(), + normalized_id: candidate.gts_id.clone(), + error: err, + context: json_path.to_owned(), + }); + } + } + Err(e) => { + errors.push(ValidationError { + file: path.to_owned(), + line: 0, + column: 0, + json_path: json_path.to_owned(), + raw_value: candidate_str.to_owned(), + normalized_id: String::new(), + error: e, + context: json_path.to_owned(), + }); + } + } + } + } + Value::Object(map) => { + for (key, val) in map { + // Optionally scan keys + if scan_keys && (key.starts_with("gts://") || key.starts_with("gts.")) { + match normalize_candidate(key) { + Ok(candidate) => { + let validation_errors = validate_candidate(&candidate, vendor, false); + for err in 
validation_errors { + errors.push(ValidationError { + file: path.to_owned(), + line: 0, + column: 0, + json_path: format!("{json_path}.{key}"), + raw_value: candidate.original.clone(), + normalized_id: candidate.gts_id.clone(), + error: err, + context: format!("key: {key}"), + }); + } + } + Err(e) => { + errors.push(ValidationError { + file: path.to_owned(), + line: 0, + column: 0, + json_path: format!("{json_path}.{key}"), + raw_value: key.clone(), + normalized_id: String::new(), + error: e, + context: format!("key: {key}"), + }); + } + } + } + walk_json_value( + val, + path, + vendor, + errors, + &format!("{json_path}.{key}"), + scan_keys, + ); + } + } + Value::Array(arr) => { + for (i, val) in arr.iter().enumerate() { + walk_json_value( + val, + path, + vendor, + errors, + &format!("{json_path}[{i}]"), + scan_keys, + ); + } + } + _ => {} + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + fn create_temp_json(content: &str) -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + file.write_all(content.as_bytes()).unwrap(); + file + } + + #[test] + fn test_scan_json_valid_id() { + let content = r#"{"$id": "gts://gts.x.core.events.type.v1~"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_scan_json_invalid_id() { + let content = r#"{"$id": "gts.invalid"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!(!errors.is_empty()); + } + + #[test] + fn test_scan_json_xgts_ref_wildcard() { + let content = r#"{"x-gts-ref": "gts.x.core.*"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Wildcards in x-gts-ref should be allowed" + ); + } + + #[test] + fn 
test_scan_json_xgts_ref_bare_wildcard() { + let content = r#"{"x-gts-ref": "*"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Bare wildcard in x-gts-ref should be skipped" + ); + } + + #[test] + fn test_scan_json_xgts_ref_relative_pointer() { + let content = r#"{"x-gts-ref": "/$id"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Relative pointers in x-gts-ref should be skipped" + ); + } + + #[test] + fn test_scan_json_nested_values() { + let content = r#"{ + "properties": { + "type": { + "x-gts-ref": "gts.x.core.events.type.v1~" + } + } + }"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Nested values should be found and validated" + ); + } + + #[test] + fn test_scan_json_array_values() { + let content = r#"{ + "capabilities": [ + "gts.x.core.events.type.v1~", + "gts.x.core.events.topic.v1~" + ] + }"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Array values should be found and validated" + ); + } + + #[test] + fn test_scan_json_invalid_json() { + let content = r#"{"invalid": json}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Invalid JSON should be skipped with warning" + ); + } + + #[test] + fn test_scan_json_error_includes_json_path() { + let content = r#"{"properties": {"type": {"x-gts-ref": "gts.invalid"}}}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!(!errors.is_empty()); + assert!(errors[0].json_path.contains("properties.type.x-gts-ref")); + } + + #[test] + fn test_scan_json_vendor_mismatch() 
{ + let content = r#"{"$id": "gts://gts.hx.core.events.type.v1~"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), Some("x"), 10_485_760, false); + assert!(!errors.is_empty()); + assert!(errors[0].error.contains("Vendor mismatch")); + } + + #[test] + fn test_scan_json_keys_not_scanned_by_default() { + let content = r#"{"gts.x.core.type.v1~": "value"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Keys should not be scanned by default"); + } + + #[test] + fn test_scan_json_keys_scanned_when_enabled() { + let content = r#"{"gts.x.core.events.type.v1~": "value"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, true); + assert!( + errors.is_empty(), + "Valid GTS ID keys should pass validation" + ); + } + + #[test] + fn test_scan_json_invalid_key_when_scanning_enabled() { + let content = r#"{"gts.invalid": "value"}"#; + let file = create_temp_json(content); + let errors = scan_json_file(file.path(), None, 10_485_760, true); + assert!( + !errors.is_empty(), + "Invalid GTS ID keys should be caught when key scanning is enabled" + ); + } +} diff --git a/gts-validator/src/format/markdown.rs b/gts-validator/src/format/markdown.rs new file mode 100644 index 0000000..a808e27 --- /dev/null +++ b/gts-validator/src/format/markdown.rs @@ -0,0 +1,503 @@ +//! Markdown file scanner for GTS identifiers. +//! +//! Uses a two-stage approach: +//! 1. Discovery regex finds candidates +//! 2. 
`normalize_candidate()` → `validate_candidate()` validates them + +use std::collections::HashSet; +use std::path::Path; +use std::sync::LazyLock; + +use regex::Regex; + +use crate::error::ValidationError; +use crate::normalize::normalize_candidate; +use crate::validator::{is_bad_example_context, is_wildcard_context, validate_candidate}; + +/// Markdown parsing state for code block tracking +#[derive(Debug, Clone, PartialEq, Eq)] +enum MarkdownState { + Prose, + FencedBlock { + skip: bool, + fence_char: char, + opening_fence_len: usize, + }, +} + +fn parse_fence(trimmed_line: &str) -> Option<(char, usize)> { + let fence_char = match trimmed_line.as_bytes().first() { + Some(b'`') => '`', + Some(b'~') => '~', + _ => return None, + }; + + let fence_len = trimmed_line + .chars() + .take_while(|&c| c == fence_char) + .count(); + if fence_len >= 3 { + Some((fence_char, fence_len)) + } else { + None + } +} + +/// Discovery regex (relaxed): finds strings that LOOK like GTS identifiers. +/// This is intentionally broader than the spec — validation is done by `GtsID::new()`. +/// +/// Strategy: Match gts. followed by 4+ dot-separated segments where at least one +/// segment looks like a version (starts with 'v' followed by digit). +/// This catches both valid and malformed IDs for validation (more errors reported). +/// Stops at tilde followed by non-alphanumeric to avoid matching filenames like "id.v1~.schema.json" +static GTS_DISCOVERY_PATTERN_RELAXED: LazyLock = LazyLock::new(|| { + match Regex::new(concat!( + r"(?:gts://)?", // optional URI prefix + r"\bgts\.", // mandatory gts. prefix (word boundary prevents xgts. match) + r"(?:[a-z_*][a-z0-9_*.-]*\.){3,}", // at least 3 segments (permissive: allows -, .) 
+ r"[a-z_*][a-z0-9_*.-]*", // final segment before version + r"\.v[0-9]+", // version segment (required anchor) + r"(?:\.[0-9]+)?", // optional minor version + r"(?:~[a-z_][a-z0-9_.-]*)*", // optional chained segments (permissive) + r"~?", // optional trailing tilde (but not if followed by .) + )) { + Ok(regex) => regex, + Err(err) => panic!("Invalid discovery regex: {err}"), + } +}); + +/// Discovery regex (well-formed): only matches well-formed GTS identifiers. +/// Requires exactly 5 segments with proper structure (fewer errors reported). +static GTS_DISCOVERY_PATTERN_WELL_FORMED: LazyLock = LazyLock::new(|| { + match Regex::new(concat!( + r"(?:gts://)?", // optional URI prefix + r"\bgts\.", // mandatory gts. prefix (word boundary prevents xgts. match) + r"[a-z_*][a-z0-9_*]*\.", // vendor + r"[a-z_*][a-z0-9_*]*\.", // package + r"[a-z_*][a-z0-9_*]*\.", // namespace + r"[a-z_*][a-z0-9_*]*\.", // type + r"v[0-9]+", // major version (required) + r"(?:\.[0-9]+)?", // optional minor version + r"(?:~[a-z_][a-z0-9_]*\.[a-z_][a-z0-9_]*\.[a-z_][a-z0-9_]*\.[a-z_][a-z0-9_]*\.v[0-9]+(?:\.[0-9]+)?)*", // chained segments + r"~?", // optional trailing tilde + )) { + Ok(regex) => regex, + Err(err) => panic!("Invalid discovery regex: {err}"), + } +}); + +/// Scan markdown content for GTS identifiers. 
+pub fn scan_markdown_content( + content: &str, + path: &Path, + vendor: Option<&str>, + strict: bool, + skip_tokens: &[String], +) -> Vec { + let pattern = if strict { + &*GTS_DISCOVERY_PATTERN_RELAXED + } else { + &*GTS_DISCOVERY_PATTERN_WELL_FORMED + }; + let mut errors = Vec::new(); + let mut state = MarkdownState::Prose; + let mut seen_candidates: HashSet<(usize, String)> = HashSet::new(); + + for (line_num, line) in content.lines().enumerate() { + let line_number = line_num + 1; // 1-indexed + + // Update markdown state for code blocks (``` and ~~~ per CommonMark spec) + let trimmed_line = line.trim_start(); + if let Some((fence_char, fence_len)) = parse_fence(trimmed_line) { + match &state { + MarkdownState::Prose => { + // Entering a fenced block + let language = trimmed_line[fence_len..].trim().to_lowercase(); + + // Skip grammar/pattern definition blocks + let skip = matches!( + language.as_str(), + "ebnf" | "regex" | "bnf" | "abnf" | "grammar" + ); + + state = MarkdownState::FencedBlock { + skip, + fence_char, + opening_fence_len: fence_len, + }; + continue; + } + MarkdownState::FencedBlock { + fence_char: open_fence_char, + opening_fence_len, + .. + } => { + // Exiting a fenced block requires matching delimiter with sufficient length. + if fence_char == *open_fence_char && fence_len >= *opening_fence_len { + state = MarkdownState::Prose; + continue; + } + } + } + } + + // Skip lines inside skip blocks + if let MarkdownState::FencedBlock { skip: true, .. 
} = state { + continue; + } + + // Find all GTS candidates on this line + for mat in pattern.find_iter(line) { + let candidate_str = mat.as_str(); + let match_start = mat.start(); + + // Deduplicate: skip if we've seen this candidate on this line + if !seen_candidates.insert((line_number, candidate_str.to_owned())) { + continue; + } + + // Skip validation if this is a "bad example" context + if is_bad_example_context(line, mat.start()) { + continue; + } + + // Check consumer-provided skip tokens + if !skip_tokens.is_empty() + && let Some(before) = line.get(..mat.start()) + { + let before_lower = before.to_lowercase(); + if skip_tokens + .iter() + .any(|token| before_lower.contains(&token.to_lowercase())) + { + continue; + } + } + + // Normalize the candidate + let candidate = match normalize_candidate(candidate_str) { + Ok(c) => c, + Err(e) => { + errors.push(ValidationError { + file: path.to_owned(), + line: line_number, + column: match_start + 1, // 1-indexed + json_path: String::new(), + raw_value: candidate_str.to_owned(), + normalized_id: String::new(), + error: e, + context: line.to_owned(), + }); + continue; + } + }; + + // Check if wildcards are allowed in this context + let allow_wildcards = is_wildcard_context(line, match_start); + + // Validate the candidate + let validation_errors = validate_candidate(&candidate, vendor, allow_wildcards); + for err in validation_errors { + errors.push(ValidationError { + file: path.to_owned(), + line: line_number, + column: match_start + 1, // 1-indexed + json_path: String::new(), + raw_value: candidate.original.clone(), + normalized_id: candidate.gts_id.clone(), + error: err, + context: line.to_owned(), + }); + } + } + } + + errors +} + +/// Scan a markdown file for GTS identifiers (file-based convenience wrapper). 
+#[cfg(test)] +pub fn scan_markdown_file( + path: &Path, + vendor: Option<&str>, + max_file_size: u64, + strict: bool, +) -> Vec { + // Check file size + if let Ok(metadata) = std::fs::metadata(path) + && metadata.len() > max_file_size + { + return vec![]; + } + + // Read as UTF-8; skip file on encoding error + let content = match std::fs::read_to_string(path) { + Ok(c) => c, + Err(_e) => return vec![], + }; + + scan_markdown_content(&content, path, vendor, strict, &[]) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + fn create_temp_md(content: &str) -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + file.write_all(content.as_bytes()).unwrap(); + file + } + + #[test] + fn test_scan_markdown_valid_id() { + let file = create_temp_md("The type is gts.x.core.events.type.v1~"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_scan_markdown_invalid_id() { + let file = create_temp_md("The type is gts.x.core.events.type.v1"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!( + !errors.is_empty(), + "Single-segment instance ID should be rejected" + ); + } + + #[test] + fn test_scan_markdown_skip_ebnf_block() { + let content = r" +```ebnf +gts.invalid.pattern.here.v1~ +``` +"; + let file = create_temp_md(content); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "EBNF blocks should be skipped"); + } + + #[test] + fn test_scan_markdown_validate_json_block() { + let content = r#" +```json +{"$id": "gts://gts.x.core.events.type.v1~"} +``` +"#; + let file = create_temp_md(content); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "JSON blocks should be validated"); + } + + #[test] + fn test_scan_markdown_skip_invalid_context() { + let file = 
create_temp_md("\u{274c} gts.invalid.id.here.v1"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Invalid examples should be skipped"); + } + + #[test] + fn test_scan_markdown_wildcard_in_pattern_context() { + let file = create_temp_md("pattern: gts.x.core.events.type.v1~"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Valid IDs in pattern context should be allowed" + ); + } + + #[test] + fn test_scan_markdown_wildcard_not_in_pattern_context() { + let file = create_temp_md("The type is gts.x.core.events.type.v1~"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Valid IDs should pass"); + } + + #[test] + fn test_scan_markdown_gts_uri() { + let file = create_temp_md(r#"Use "$id": "gts://gts.x.core.events.type.v1~""#); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "gts:// URIs should be normalized and validated" + ); + } + + #[test] + fn test_scan_markdown_vendor_mismatch() { + let file = create_temp_md("The type is gts.hx.core.events.type.v1~"); + let errors = scan_markdown_file(file.path(), Some("x"), 10_485_760, false); + assert!(!errors.is_empty()); + assert!(errors[0].error.contains("Vendor mismatch")); + } + + #[test] + fn test_scan_markdown_example_vendor_tolerated() { + let file = create_temp_md("Example: gts.acme.core.events.type.v1~"); + let errors = scan_markdown_file(file.path(), Some("x"), 10_485_760, false); + assert!(errors.is_empty(), "Example vendors should be tolerated"); + } + + #[test] + fn test_scan_markdown_deduplication() { + // Use an invalid ID (wrong vendor) twice on the same line — dedup should produce exactly 1 error + let file = create_temp_md( + "gts.wrongvendor.core.events.type.v1~ and gts.wrongvendor.core.events.type.v1~ again", + ); + let errors = scan_markdown_file(file.path(), Some("x"), 10_485_760, 
false); + assert_eq!( + errors.len(), + 1, + "Duplicate invalid ID on same line should produce exactly 1 error, got: {errors:?}" + ); + } + + #[test] + fn test_scan_markdown_error_after_gts_id() { + let file = create_temp_md("gts.x.core.events.type.v1~ handles error cases"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Valid ID should not be suppressed by 'error' appearing after it" + ); + } + + #[test] + fn test_scan_markdown_invalid_before_gts_id() { + let file = create_temp_md("invalid: gts.bad.format.here.v1"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Invalid examples should be skipped"); + } + + #[test] + fn test_scan_markdown_strict_mode_catches_malformed() { + let file = create_temp_md("The type is gts.my-vendor.core.events.type.v1~"); + let errors_strict = scan_markdown_file(file.path(), None, 10_485_760, true); + let errors_normal = scan_markdown_file(file.path(), None, 10_485_760, false); + + assert!( + !errors_strict.is_empty(), + "Strict mode should catch malformed ID with hyphens" + ); + assert!( + errors_normal.is_empty(), + "Normal mode won't match malformed pattern" + ); + } + + #[test] + fn test_scan_markdown_strict_mode_catches_extra_dots() { + let file = create_temp_md("The type is gts.x.core.events.type.name.v1~"); + let errors_strict = scan_markdown_file(file.path(), None, 10_485_760, true); + + assert!( + !errors_strict.is_empty(), + "Strict mode should catch ID with extra segments" + ); + } + + #[test] + fn test_scan_markdown_normal_mode_well_formed_only() { + let file = create_temp_md("Valid: gts.x.core.events.type.v1~ and malformed: gts.bad-id.v1"); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + + assert!( + errors.is_empty(), + "Normal mode should only validate well-formed patterns" + ); + } + + #[test] + fn test_scan_markdown_skip_tokens() { + // skip_tokens should suppress validation 
when the token appears before the candidate + let content = "**given** gts.bad.format.here.v1~"; + let errors = scan_markdown_content( + content, + Path::new("test.md"), + None, + true, // strict to ensure the relaxed regex would catch it + &["**given**".to_owned()], + ); + assert!( + errors.is_empty(), + "skip_tokens should suppress validation: {errors:?}" + ); + + // Same content without skip_tokens should produce errors (vendor mismatch) + let content_mismatch = "**given** gts.y.core.pkg.mytype.v1~ is registered"; + let errors_no_skip = scan_markdown_content( + content_mismatch, + Path::new("test.md"), + Some("x"), + false, + &[], + ); + assert!( + !errors_no_skip.is_empty(), + "Without skip_tokens, vendor mismatch should be reported" + ); + + // With skip_tokens, the same content should be suppressed + let errors_with_skip = scan_markdown_content( + content_mismatch, + Path::new("test.md"), + Some("x"), + false, + &["**given**".to_owned()], + ); + assert!( + errors_with_skip.is_empty(), + "With skip_tokens, vendor mismatch should be suppressed: {errors_with_skip:?}" + ); + } + + #[test] + fn test_scan_markdown_tilde_fence() { + // ~~~ fences should be handled the same as ``` fences + let content = "~~~ebnf\ngts.invalid.pattern.here.v1~\n~~~\n"; + let file = create_temp_md(content); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "~~~ EBNF blocks should be skipped: {errors:?}" + ); + } + + #[test] + fn test_scan_markdown_tilde_fence_json_validated() { + // ~~~json blocks should be validated (same as ```json) + let content = "~~~json\n{\"$id\": \"gts://gts.x.core.events.type.v1~\"}\n~~~\n"; + let file = create_temp_md(content); + let errors = scan_markdown_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "~~~json blocks should be validated and pass: {errors:?}" + ); + } + + #[test] + fn test_scan_markdown_mismatched_fence_does_not_close_block() { + // A ~~~ line must not close 
a ``` block. + let content = "```ebnf\n~~~\ngts.bad.format.here.v1~\n```\n"; + let file = create_temp_md(content); + let errors = scan_markdown_file(file.path(), None, 10_485_760, true); + assert!( + errors.is_empty(), + "Mismatched fence should not close block; content inside ebnf block must be skipped: {errors:?}" + ); + } + + #[test] + fn test_scan_markdown_word_boundary() { + // Regex should NOT match "xgts.x.core.events.type.v1~" (no word boundary) + let content = "The identifier xgts.x.core.events.type.v1~ is wrong"; + let errors = scan_markdown_content(content, Path::new("test.md"), None, false, &[]); + assert!( + errors.is_empty(), + "Word boundary should prevent matching xgts.*: {errors:?}" + ); + } +} diff --git a/gts-validator/src/format/mod.rs b/gts-validator/src/format/mod.rs new file mode 100644 index 0000000..48b38fe --- /dev/null +++ b/gts-validator/src/format/mod.rs @@ -0,0 +1,10 @@ +//! Format-specific scanners for GTS identifier discovery and validation. +//! +//! Each sub-module handles a specific file format: +//! - `markdown` — Markdown files with code-block state machine +//! - `json` — JSON tree-walker +//! - `yaml` — YAML scanner (delegates to JSON walker via `serde_json::Value`) + +pub mod json; +pub mod markdown; +pub mod yaml; diff --git a/gts-validator/src/format/yaml.rs b/gts-validator/src/format/yaml.rs new file mode 100644 index 0000000..383b01d --- /dev/null +++ b/gts-validator/src/format/yaml.rs @@ -0,0 +1,231 @@ +//! YAML file scanner for GTS identifiers. +//! +//! Uses tree-walking to scan string values (not keys by default). 
+ +use std::path::Path; + +use serde_json::Value; + +use crate::error::ValidationError; +use crate::format::json::walk_json_value; + +fn split_yaml_documents(content: &str) -> Vec { + let mut documents = Vec::new(); + let mut current_doc: Vec<&str> = Vec::new(); + + for line in content.lines() { + if line.trim() == "---" { + let doc = current_doc.join("\n"); + if !doc.trim().is_empty() { + documents.push(doc); + } + current_doc.clear(); + continue; + } + current_doc.push(line); + } + + let doc = current_doc.join("\n"); + if !doc.trim().is_empty() { + documents.push(doc); + } + + documents +} + +/// Scan YAML content for GTS identifiers. +pub fn scan_yaml_content( + content: &str, + path: &Path, + vendor: Option<&str>, + scan_keys: bool, +) -> Vec { + let mut errors = Vec::new(); + + // Parse all documents with the YAML stream parser first. + // If this fails (e.g., one malformed document in the stream), fall back to per-document + // parsing so valid sibling documents are still validated. + let documents: Vec = match serde_saphyr::from_multiple(content) { + Ok(docs) => docs, + Err(_e) => { + for segment in split_yaml_documents(content) { + let value: Value = match serde_saphyr::from_str(&segment) { + Ok(doc) => doc, + Err(_segment_err) => continue, + }; + + walk_json_value(&value, path, vendor, &mut errors, "$", scan_keys); + } + + return errors; + } + }; + + for value in documents { + // Reuse the JSON walker since both operate on serde_json::Value + walk_json_value(&value, path, vendor, &mut errors, "$", scan_keys); + } + + errors +} + +/// Scan a YAML file for GTS identifiers (file-based convenience wrapper). 
+#[cfg(test)] +pub fn scan_yaml_file( + path: &Path, + vendor: Option<&str>, + max_file_size: u64, + scan_keys: bool, +) -> Vec { + // Check file size + if let Ok(metadata) = std::fs::metadata(path) + && metadata.len() > max_file_size + { + return vec![]; + } + + // Read as UTF-8; skip file on encoding error + let content = match std::fs::read_to_string(path) { + Ok(c) => c, + Err(_e) => return vec![], + }; + + scan_yaml_content(&content, path, vendor, scan_keys) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + fn create_temp_yaml(content: &str) -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + file.write_all(content.as_bytes()).unwrap(); + file + } + + #[test] + fn test_scan_yaml_valid_id() { + let content = r" +$id: gts://gts.x.core.events.type.v1~ +"; + let file = create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_scan_yaml_invalid_id() { + let content = r" +$id: gts.invalid +"; + let file = create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!(!errors.is_empty()); + } + + #[test] + fn test_scan_yaml_xgts_ref_wildcard() { + let content = r" +x-gts-ref: gts.x.core.* +"; + let file = create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Wildcards in x-gts-ref should be allowed" + ); + } + + #[test] + fn test_scan_yaml_xgts_ref_bare_wildcard() { + let content = r#" +x-gts-ref: "*" +"#; + let file = create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Bare wildcard in x-gts-ref should be skipped" + ); + } + + #[test] + fn test_scan_yaml_nested_values() { + let content = r" +properties: + type: + x-gts-ref: gts.x.core.events.type.v1~ +"; + let file = 
create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Nested values should be found and validated" + ); + } + + #[test] + fn test_scan_yaml_array_values() { + let content = r" +capabilities: + - gts.x.core.events.type.v1~ + - gts.x.core.events.topic.v1~ +"; + let file = create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Array values should be found and validated" + ); + } + + #[test] + fn test_scan_yaml_invalid_yaml() { + let content = r" +invalid: yaml: syntax: +"; + let file = create_temp_yaml(content); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + assert!( + errors.is_empty(), + "Invalid YAML should be skipped with warning" + ); + } + + #[test] + fn test_scan_yaml_multi_document_all_validated() { + // All documents in a multi-document stream must be validated. + let content = "\ +$id: gts.x.core.events.type.v1~ +--- +$id: gts.invalid +"; + let errors = scan_yaml_content(content, Path::new("multi.yaml"), None, false); + // Both documents are parsed — gts.invalid in doc 2 must produce an error + assert!( + !errors.is_empty(), + "Multi-document YAML: second document with invalid ID should be caught, got no errors" + ); + } + + #[test] + fn test_scan_yaml_multi_document_malformed_doc_does_not_suppress_valid_doc() { + // A malformed document must be skipped, but valid documents around it must still be validated. + let content = "\ +$id: gts.y.core.pkg.mytype.v1~ +--- +invalid: yaml: syntax: +--- +$id: gts.y.core.pkg.mytype.v1~ +"; + // With vendor "x", both valid docs should produce vendor-mismatch errors. + // If the malformed middle doc caused an early return, errors would be empty. 
+ let errors = scan_yaml_content(content, Path::new("multi.yaml"), Some("x"), false); + assert!( + !errors.is_empty(), + "Valid documents must be validated even when a sibling document is malformed, got no errors" + ); + } +} diff --git a/gts-validator/src/lib.rs b/gts-validator/src/lib.rs new file mode 100644 index 0000000..83bff09 --- /dev/null +++ b/gts-validator/src/lib.rs @@ -0,0 +1,125 @@ +//! # gts-validator +//! +//! GTS identifier validator for documentation and configuration files. +//! +//! This crate provides a clean separation between the **core validation engine** +//! (input-agnostic) and **input strategies** (starting with filesystem scanning). +//! +//! ## Quick Start +//! +//! ```rust,no_run +//! use std::path::PathBuf; +//! use gts_validator::{validate_fs, FsSourceConfig, ValidationConfig}; +//! +//! let mut fs_config = FsSourceConfig::default(); +//! fs_config.paths = vec![PathBuf::from("docs"), PathBuf::from("modules")]; +//! fs_config.exclude = vec!["target/*".to_owned()]; +//! +//! let mut validation_config = ValidationConfig::default(); +//! validation_config.vendor = Some("x".to_owned()); +//! +//! let report = validate_fs(&fs_config, &validation_config).unwrap(); +//! println!("Files scanned: {}", report.files_scanned); +//! println!("Errors: {}", report.errors_count); +//! println!("OK: {}", report.ok); +//! ``` + +mod config; +mod error; +mod format; +mod normalize; +pub mod output; +mod report; +mod strategy; +mod validator; + +pub use config::{FsSourceConfig, ValidationConfig}; +pub use error::ValidationError; +pub use report::ValidationReport; + +use strategy::ContentFormat; +use strategy::fs::{content_format_for, find_files, read_validation_item}; + +/// Validate GTS identifiers in files on disk. +/// +/// This is the primary public API for Phase 1. +/// +/// # Arguments +/// +/// * `fs_config` - Filesystem-specific source options (paths, exclude, max file size, etc.) 
+/// * `validation_config` - Core validation config (vendor, `scan_keys`, strict) +/// +/// # Errors +/// +/// Returns an error if `fs_config.paths` is empty or if any provided path does not exist. +/// Returns `Ok` with `files_scanned: 0` if paths exist but contain no scannable files. +pub fn validate_fs( + fs_config: &FsSourceConfig, + validation_config: &ValidationConfig, +) -> anyhow::Result { + if fs_config.paths.is_empty() { + anyhow::bail!("No paths provided for validation"); + } + + // Validate explicitly provided paths exist + for path in &fs_config.paths { + if !path.exists() { + anyhow::bail!("Path does not exist: {}", path.display()); + } + } + + let files = find_files(fs_config); + + if files.is_empty() { + return Ok(ValidationReport { + files_scanned: 0, + errors_count: 0, + ok: true, + errors: vec![], + }); + } + + let mut errors = Vec::new(); + let vendor = validation_config.vendor.as_deref(); + let mut files_scanned: usize = 0; + + for file_path in &files { + let Some(item) = read_validation_item(file_path, fs_config.max_file_size) else { + continue; // skip unreadable/oversized files — don't count as scanned + }; + + let file_errors = match content_format_for(file_path) { + Some(ContentFormat::Markdown) => format::markdown::scan_markdown_content( + &item.content, + file_path, + vendor, + validation_config.strict, + &validation_config.skip_tokens, + ), + Some(ContentFormat::Json) => format::json::scan_json_content( + &item.content, + file_path, + vendor, + validation_config.scan_keys, + ), + Some(ContentFormat::Yaml) => format::yaml::scan_yaml_content( + &item.content, + file_path, + vendor, + validation_config.scan_keys, + ), + None => continue, + }; + + files_scanned += 1; + errors.extend(file_errors); + } + + let errors_count = errors.len(); + Ok(ValidationReport { + files_scanned, + errors_count, + ok: errors.is_empty(), + errors, + }) +} diff --git a/gts-validator/src/normalize.rs b/gts-validator/src/normalize.rs new file mode 100644 index 
0000000..0071cfa --- /dev/null +++ b/gts-validator/src/normalize.rs @@ -0,0 +1,136 @@ +//! Normalization of GTS identifier candidates. +//! +//! This module provides a single normalization function that ALL scanners must call +//! before passing candidates to the validator. It handles: +//! - Trimming whitespace +//! - Stripping surrounding quotes +//! - Stripping `gts://` URI prefix +//! - Rejecting URI fragments (#) and query strings (?) +//! - Verifying the `gts.` prefix + +/// Result of normalizing a raw candidate string. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct NormalizedCandidate { + /// The canonical GTS identifier string (ready for `GtsID::new()`) + pub gts_id: String, + /// The original raw string (for error reporting) + pub original: String, +} + +/// Normalize a raw candidate string into a form suitable for `GtsID::new()`. +/// +/// Steps: +/// 1. Trim whitespace +/// 2. Strip surrounding quotes (" or ') +/// 3. Strip `gts://` prefix if present +/// 4. Reject if URI fragment (#) or query (?) is present after gts:// +/// 5. Verify starts with `gts.` +/// +/// # Errors +/// +/// Returns an error if: +/// - The string contains URI fragments (#) or query strings (?) 
after `gts://` +/// - The string does not start with `gts.` after normalization +pub fn normalize_candidate(raw: &str) -> Result { + let mut trimmed = raw.trim(); + // Strip a single layer of surrounding quotes (not greedy trim_matches which + // would strip multiple layers and trailing apostrophes like 's) + if (trimmed.starts_with('"') && trimmed.ends_with('"')) + || (trimmed.starts_with('\'') && trimmed.ends_with('\'')) + { + trimmed = &trimmed[1..trimmed.len() - 1]; + } + + let gts_id = if let Some(stripped) = trimmed.strip_prefix("gts://") { + // Reject URI fragments and query strings — spec section 9.1 says + // remainder must be a plain GTS identifier + if stripped.contains('#') || stripped.contains('?') { + return Err(format!( + "gts:// URI must not contain fragments (#) or query strings (?): '{raw}'" + )); + } + stripped.to_owned() + } else { + trimmed.to_owned() + }; + + if !gts_id.starts_with("gts.") { + return Err(format!("Does not start with 'gts.': '{raw}'")); + } + + Ok(NormalizedCandidate { + gts_id, + original: raw.to_owned(), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_normalize_gts_uri() { + let result = normalize_candidate("gts://gts.x.core.type.v1~").unwrap(); + assert_eq!(result.gts_id, "gts.x.core.type.v1~"); + assert_eq!(result.original, "gts://gts.x.core.type.v1~"); + } + + #[test] + fn test_normalize_plain_gts_id() { + let result = normalize_candidate("gts.x.core.type.v1~").unwrap(); + assert_eq!(result.gts_id, "gts.x.core.type.v1~"); + assert_eq!(result.original, "gts.x.core.type.v1~"); + } + + #[test] + fn test_normalize_with_whitespace() { + let result = normalize_candidate(" gts.x.core.type.v1~ ").unwrap(); + assert_eq!(result.gts_id, "gts.x.core.type.v1~"); + } + + #[test] + fn test_normalize_with_quotes() { + let result = normalize_candidate("\"gts.x.core.type.v1~\"").unwrap(); + assert_eq!(result.gts_id, "gts.x.core.type.v1~"); + + let result = normalize_candidate("'gts.x.core.type.v1~'").unwrap(); + 
assert_eq!(result.gts_id, "gts.x.core.type.v1~"); + } + + #[test] + fn test_reject_fragment() { + let result = normalize_candidate("gts://gts.x.core.type.v1~#foo"); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("fragments (#)")); + } + + #[test] + fn test_reject_query_string() { + let result = normalize_candidate("gts://gts.x.core.type.v1~?bar=1"); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("query strings (?)")); + } + + #[test] + fn test_reject_no_gts_prefix() { + let result = normalize_candidate("x.core.type.v1~"); + assert!(result.is_err()); + assert!(result.unwrap_err().contains("Does not start with 'gts.'")); + } + + #[test] + fn test_normalize_chained_id() { + let result = + normalize_candidate("gts.x.core.events.type.v1~ven.app._.custom_event.v1~").unwrap(); + assert_eq!( + result.gts_id, + "gts.x.core.events.type.v1~ven.app._.custom_event.v1~" + ); + } + + #[test] + fn test_normalize_with_wildcard() { + let result = normalize_candidate("gts.x.core.*").unwrap(); + assert_eq!(result.gts_id, "gts.x.core.*"); + } +} diff --git a/gts-validator/src/output.rs b/gts-validator/src/output.rs new file mode 100644 index 0000000..72709fc --- /dev/null +++ b/gts-validator/src/output.rs @@ -0,0 +1,125 @@ +//! Shared output formatting for validation reports. +//! +//! Provides JSON and human-readable formatters for `ValidationReport`. + +use std::io::Write; + +use colored::Colorize; + +use crate::report::ValidationReport; + +/// Format a `ValidationReport` as JSON to a writer. +/// +/// # Errors +/// +/// Returns an error if serialization or writing fails. +pub fn write_json(report: &ValidationReport, writer: &mut dyn Write) -> anyhow::Result<()> { + let json = serde_json::to_string_pretty(report)?; + writeln!(writer, "{json}")?; + Ok(()) +} + +/// Format a `ValidationReport` as human-readable text to a writer. +/// +/// # Errors +/// +/// Returns an error if writing fails. 
+pub fn write_human( + report: &ValidationReport, + writer: &mut dyn Write, + use_color: bool, +) -> anyhow::Result<()> { + writeln!(writer)?; + writeln!(writer, "{}", "=".repeat(80))?; + if use_color { + writeln!(writer, " {}", "GTS DOCUMENTATION VALIDATOR".bold())?; + } else { + writeln!(writer, " GTS DOCUMENTATION VALIDATOR")?; + } + writeln!(writer, "{}", "=".repeat(80))?; + writeln!(writer)?; + writeln!(writer, " Files scanned: {}", report.files_scanned)?; + writeln!(writer, " Errors found: {}", report.errors_count)?; + writeln!(writer)?; + + if !report.errors.is_empty() { + writeln!(writer, "{}", "-".repeat(80))?; + if use_color { + writeln!(writer, " {}", "ERRORS".red().bold())?; + } else { + writeln!(writer, " ERRORS")?; + } + writeln!(writer, "{}", "-".repeat(80))?; + + // Print errors + for error in &report.errors { + let formatted = error.format_human_readable(); + if use_color { + writeln!(writer, "{}", formatted.red())?; + } else { + writeln!(writer, "{formatted}")?; + } + } + writeln!(writer)?; + } + + writeln!(writer, "{}", "=".repeat(80))?; + if report.ok { + let msg = format!( + "\u{2713} All {} files passed validation", + report.files_scanned + ); + if use_color { + writeln!(writer, "{}", msg.green())?; + } else { + writeln!(writer, "{msg}")?; + } + } else { + let msg = format!( + "\u{2717} {} invalid GTS identifiers found", + report.errors_count + ); + if use_color { + writeln!(writer, "{}", msg.red())?; + } else { + writeln!(writer, "{msg}")?; + } + writeln!(writer)?; + writeln!(writer, " To fix:")?; + + // Only show hints relevant to the actual errors found + let has_vendor_mismatch = report + .errors + .iter() + .any(|e| e.error.contains("Vendor mismatch")); + let has_wildcard_error = report.errors.iter().any(|e| e.error.contains("Wildcard")); + let has_parse_error = report + .errors + .iter() + .any(|e| !e.error.contains("Vendor mismatch") && !e.error.contains("Wildcard")); + + if has_parse_error { + writeln!( + writer, + " - Schema IDs must 
end with ~ (e.g., gts.x.core.type.v1~)" + )?; + writeln!( + writer, + " - Each segment needs 5 parts: vendor.package.namespace.type.version" + )?; + writeln!(writer, " - No hyphens allowed, use underscores")?; + } + if has_wildcard_error { + writeln!( + writer, + " - Wildcards (*) only in filter/pattern contexts" + )?; + } + if has_vendor_mismatch { + writeln!(writer, " - Ensure all GTS IDs use the expected vendor")?; + } + } + writeln!(writer, "{}", "=".repeat(80))?; + + Ok(()) +} diff --git a/gts-validator/src/report.rs b/gts-validator/src/report.rs new file mode 100644 index 0000000..ca78365 --- /dev/null +++ b/gts-validator/src/report.rs @@ -0,0 +1,19 @@ +//! Validation report types. + +use serde::Serialize; + +use crate::error::ValidationError; + +/// Result of a validation run. +#[derive(Debug, Clone, Serialize)] +#[non_exhaustive] +pub struct ValidationReport { + /// Number of files scanned. + pub files_scanned: usize, + /// Number of errors found. + pub errors_count: usize, + /// Whether all files passed validation. + pub ok: bool, + /// Individual validation errors. + pub errors: Vec, +} diff --git a/gts-validator/src/strategy/fs.rs b/gts-validator/src/strategy/fs.rs new file mode 100644 index 0000000..1069870 --- /dev/null +++ b/gts-validator/src/strategy/fs.rs @@ -0,0 +1,139 @@ +//! Filesystem validation source. +//! +//! Discovers files on disk and yields `ValidationItem`s for the validation pipeline. + +use std::path::{Path, PathBuf}; + +use glob::Pattern; +use walkdir::WalkDir; + +use crate::config::FsSourceConfig; +use crate::strategy::{ContentFormat, ValidationItem}; + +/// Directories to skip +pub const SKIP_DIRS: &[&str] = &["target", "node_modules", ".git", "vendor", ".gts-spec"]; + +/// Files to skip (path suffixes). +/// NOTE: Repo-specific paths should be passed via `FsSourceConfig.exclude` instead. +/// This list is reserved for files that are universally irrelevant across GTS repos. 
+pub const SKIP_FILES: &[&str] = &[]; + +/// Check if a path matches any of the exclude patterns +fn matches_exclude(path: &Path, exclude_patterns: &[Pattern]) -> bool { + let path_str = path.to_string_lossy(); + for pattern in exclude_patterns { + if pattern.matches(&path_str) + || path + .file_name() + .is_some_and(|name| pattern.matches(&name.to_string_lossy())) + { + return true; + } + } + false +} + +/// Check if a directory entry is a skip directory (for `WalkDir::filter_entry`). +/// Returns `true` if the entry should be **included** (i.e., is NOT a skip dir). +fn is_not_skip_dir(entry: &walkdir::DirEntry) -> bool { + if entry.file_type().is_dir() + && let Some(name) = entry.file_name().to_str() + { + return !SKIP_DIRS.contains(&name); + } + true +} + +/// Check if file has a supported extension. +fn matches_file_pattern(path: &Path) -> bool { + matches!( + path.extension().and_then(|e| e.to_str()), + Some("md" | "json" | "yaml" | "yml") + ) +} + +/// Find all files to scan in the given paths. 
+#[must_use] +pub fn find_files(config: &FsSourceConfig) -> Vec { + let mut files = Vec::new(); + + // Parse exclude patterns + let exclude_patterns: Vec = config + .exclude + .iter() + .filter_map(|p| Pattern::new(p).ok()) + .collect(); + + for path in &config.paths { + if path.is_file() { + if matches_file_pattern(path) && !matches_exclude(path, &exclude_patterns) { + files.push(path.clone()); + } + } else if path.is_dir() { + for entry in WalkDir::new(path) + .follow_links(config.follow_links) + .into_iter() + .filter_entry(is_not_skip_dir) + .filter_map(Result::ok) + { + let file_path = entry.path(); + + // Only process files + if !file_path.is_file() { + continue; + } + + // Check file pattern + if !matches_file_pattern(file_path) { + continue; + } + + // Check exclude patterns + if matches_exclude(file_path, &exclude_patterns) { + continue; + } + + // Check against skip files (suffix match, not substring) + let rel_path = file_path.to_string_lossy(); + if SKIP_FILES.iter().any(|skip| rel_path.ends_with(skip)) { + continue; + } + + files.push(file_path.to_path_buf()); + } + } + } + + files.sort(); + files.dedup(); + files +} + +/// Determine the content format from a file extension. +pub fn content_format_for(path: &Path) -> Option { + match path.extension().and_then(|e| e.to_str()) { + Some("md") => Some(ContentFormat::Markdown), + Some("json") => Some(ContentFormat::Json), + Some("yaml" | "yml") => Some(ContentFormat::Yaml), + _ => None, + } +} + +/// Read a file into a `ValidationItem`, respecting `max_file_size`. +/// +/// Returns `None` if the file should be skipped (too large, read error, unsupported format). 
+pub fn read_validation_item(path: &Path, max_file_size: u64) -> Option { + // Check file size + if let Ok(metadata) = std::fs::metadata(path) + && metadata.len() > max_file_size + { + return None; + } + + // Verify the file has a supported format before reading + content_format_for(path)?; + + let content = std::fs::read_to_string(path).ok()?; + + Some(ValidationItem { content }) +} diff --git a/gts-validator/src/strategy/mod.rs b/gts-validator/src/strategy/mod.rs new file mode 100644 index 0000000..8cd4447 --- /dev/null +++ b/gts-validator/src/strategy/mod.rs @@ -0,0 +1,23 @@ +//! Validation source strategies. +//! +//! Phase 1 provides only the filesystem strategy (`fs` module) with a concrete +//! `validate_fs()` public API. A `ValidationSource` trait may be introduced in +//! a future phase when a second concrete strategy demands it — until then, the +//! design stays concrete to avoid speculative abstraction. + +pub mod fs; + +/// A single item to validate. +#[derive(Debug, Clone)] +pub struct ValidationItem { + /// The textual content to scan. + pub content: String, +} + +/// Content format for dispatching to the correct scanner. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ContentFormat { + Markdown, + Json, + Yaml, +} diff --git a/gts-validator/src/validator.rs b/gts-validator/src/validator.rs new file mode 100644 index 0000000..d4bf8dd --- /dev/null +++ b/gts-validator/src/validator.rs @@ -0,0 +1,308 @@ +//! GTS ID validation logic. +//! +//! This module provides validation of GTS identifiers by delegating to the +//! authoritative `gts` crate. It does NOT re-implement GTS parsing. 
+ +use crate::normalize::NormalizedCandidate; + +/// Contexts where wildcards are allowed (in documentation) +pub const WILDCARD_ALLOWED_CONTEXTS: &[&str] = &[ + "pattern", + "filter", + "query", + "$filter", + "starts_with", + "with_pattern", + "resource_pattern", + "discovery", + "match", + "wildcard", + "differs from", + "get", + "list", +]; + +/// Contexts that indicate "bad example" or intentionally invalid identifiers. +/// Tightened from original: removed overly generic tokens like "error", "fail", "bad". +/// These must appear on the same line as the candidate, before it (proximity constraint). +pub const SKIP_VALIDATION_CONTEXTS: &[&str] = &[ + "\u{274c}", + "\u{2717}", + "invalid:", // colon required to avoid matching "invalid" in prose + "wrong:", + "bad:", + "// invalid", // code comment prefix + "not allowed:", +]; + +/// Example vendors used in documentation that are tolerated during vendor validation. +/// These are placeholder/example vendors commonly used in docs and tutorials. +pub const EXAMPLE_VENDORS: &[&str] = &[ + "acme", // Classic example company name + "globex", // Another example company name + "example", // Generic example + "demo", // Demo purposes + "test", // Test purposes + "sample", // Sample code + "tutorial", // Tutorial examples +]; + +/// Check if a vendor is an example/placeholder vendor that should be tolerated +#[must_use] +pub fn is_example_vendor(vendor: &str) -> bool { + EXAMPLE_VENDORS.contains(&vendor) +} + +/// Check if the GTS identifier is in a context where wildcards are allowed. +/// Checks the text before the match on the same line. 
+#[must_use] +pub fn is_wildcard_context(line: &str, match_start: usize) -> bool { + // Use get() to safely handle potential mid-codepoint byte offsets + let before = match line.get(..match_start) { + Some(s) => s.to_lowercase(), + None => return false, // Invalid byte offset, assume not wildcard context + }; + + for ctx in WILDCARD_ALLOWED_CONTEXTS { + if before.contains(ctx) { + return true; + } + } + + false +} + +/// Check if the GTS identifier is in a "bad example" context. +/// Tightened: same-line only (no 3-line lookback), with proximity constraint. +/// The skip token must appear BEFORE the candidate on the same line. +#[must_use] +pub fn is_bad_example_context(line: &str, match_start: usize) -> bool { + // Check current line only, before the match position + let before = match line.get(..match_start) { + Some(s) => s.to_lowercase(), + None => return false, + }; + + for ctx in SKIP_VALIDATION_CONTEXTS { + if before.contains(ctx) { + return true; + } + } + + false +} + +/// Validate a GTS identifier candidate. +/// +/// This function delegates all validation to `gts::GtsID::new()` and `gts::GtsWildcard::new()`. +/// It does NOT re-implement GTS parsing. +/// +/// # Arguments +/// +/// * `candidate` - A normalized candidate (after stripping gts://, quotes, etc.) +/// * `expected_vendor` - Optional vendor to check against (with `EXAMPLE_VENDORS` tolerance) +/// * `allow_wildcards` - Whether wildcard patterns are allowed in this context +/// +/// # Returns +/// +/// A vector of error messages. Empty if valid. 
+pub fn validate_candidate( + candidate: &NormalizedCandidate, + expected_vendor: Option<&str>, + allow_wildcards: bool, +) -> Vec { + let mut errors = Vec::new(); + let gts_id = &candidate.gts_id; + + // Handle wildcards + if gts_id.contains('*') { + if !allow_wildcards { + return vec![format!( + "Wildcards not allowed outside pattern contexts: '{}'", + candidate.original + )]; + } + // GtsWildcard::new() delegates to GtsID::new() internally, + // so all spec rules are enforced. Single parse — vendor check + // only runs on success to avoid duplicate/misleading errors. + match gts::GtsWildcard::new(gts_id) { + Ok(parsed) => { + if let Some(expected) = expected_vendor + && let Some(first_seg) = parsed.gts_id_segments.first() + && !first_seg.vendor.contains('*') + && first_seg.vendor != expected + && !is_example_vendor(&first_seg.vendor) + { + errors.push(format!( + "Vendor mismatch: expected '{expected}', found '{}'", + first_seg.vendor + )); + } + } + Err(e) => { + errors.push(format!("{e}")); + } + } + } else { + // Delegate to gts crate — the single source of truth + match gts::GtsID::new(gts_id) { + Ok(parsed) => { + // Vendor check + if let Some(expected) = expected_vendor + && let Some(first_seg) = parsed.gts_id_segments.first() + && first_seg.vendor != expected + && !is_example_vendor(&first_seg.vendor) + { + errors.push(format!( + "Vendor mismatch: expected '{expected}', found '{}'", + first_seg.vendor + )); + } + } + Err(e) => { + errors.push(format!("{e}")); + } + } + } + + errors +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::normalize::normalize_candidate; + + #[test] + fn test_validate_candidate_valid_type() { + let candidate = normalize_candidate("gts.x.idp.users.user.v1.0~").unwrap(); + let errors = validate_candidate(&candidate, None, false); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_validate_candidate_valid_chained() { + let candidate = + 
normalize_candidate("gts.x.core.events.type.v1~ven.app._.custom_event.v1~").unwrap(); + let errors = validate_candidate(&candidate, None, false); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_validate_candidate_vendor_match() { + let candidate = normalize_candidate("gts.x.core.modkit.plugin.v1~").unwrap(); + let errors = validate_candidate(&candidate, Some("x"), false); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_validate_candidate_vendor_mismatch() { + let candidate = normalize_candidate("gts.hx.core.modkit.plugin.v1~").unwrap(); + let errors = validate_candidate(&candidate, Some("x"), false); + assert!(!errors.is_empty()); + assert!(errors[0].contains("Vendor mismatch")); + } + + #[test] + fn test_validate_candidate_example_vendor_tolerated() { + let candidate = normalize_candidate("gts.acme.core.events.user_created.v1~").unwrap(); + let errors = validate_candidate(&candidate, Some("x"), false); + assert!( + errors.is_empty(), + "Example vendor 'acme' should be tolerated: {errors:?}" + ); + + let candidate = normalize_candidate("gts.globex.core.events.order.v1~").unwrap(); + let errors = validate_candidate(&candidate, Some("x"), false); + assert!( + errors.is_empty(), + "Example vendor 'globex' should be tolerated: {errors:?}" + ); + } + + #[test] + fn test_validate_candidate_invalid_hyphen() { + let candidate = normalize_candidate("gts.my-vendor.core.events.type.v1~").unwrap(); + let errors = validate_candidate(&candidate, None, false); + assert!(!errors.is_empty()); + } + + #[test] + fn test_validate_candidate_invalid_uppercase() { + let candidate = normalize_candidate("gts.X.core.events.type.v1~").unwrap(); + let errors = validate_candidate(&candidate, None, false); + assert!(!errors.is_empty()); + } + + #[test] + fn test_validate_candidate_invalid_digit_start() { + let candidate = normalize_candidate("gts.1vendor.core.events.type.v1~").unwrap(); + let errors = 
validate_candidate(&candidate, None, false); + assert!(!errors.is_empty()); + } + + #[test] + fn test_validate_candidate_wildcard_allowed() { + let candidate = normalize_candidate("gts.x.*").unwrap(); + let errors = validate_candidate(&candidate, None, true); + assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); + } + + #[test] + fn test_validate_candidate_wildcard_not_allowed() { + let candidate = normalize_candidate("gts.x.*").unwrap(); + let errors = validate_candidate(&candidate, None, false); + assert!(!errors.is_empty()); + assert!(errors[0].contains("Wildcards")); + } + + #[test] + fn test_is_example_vendor() { + assert!(is_example_vendor("acme")); + assert!(is_example_vendor("globex")); + assert!(is_example_vendor("example")); + assert!(is_example_vendor("demo")); + assert!(is_example_vendor("test")); + assert!(!is_example_vendor("x")); + assert!(!is_example_vendor("hx")); + assert!(!is_example_vendor("cf")); + } + + #[test] + fn test_is_wildcard_context() { + assert!(is_wildcard_context( + "$filter=type_id eq 'gts.x.*'", + "$filter=type_id eq '".len() + )); + assert!(is_wildcard_context( + "Use this pattern: gts.x.core.*", + "Use this pattern: ".len() + )); + assert!(!is_wildcard_context( + "The type gts.x.core.type.v1~", + "The type ".len() + )); + } + + #[test] + fn test_is_bad_example_context_same_line_only() { + // Skip token before the match + assert!(is_bad_example_context( + "invalid: gts.bad.id", + "invalid: ".len() + )); + assert!(is_bad_example_context( + "\u{274c} gts.x.y.z.a.v1~", + "\u{274c} ".len() + )); + + // Skip token after the match should NOT skip + assert!(!is_bad_example_context("gts.x.core.type.v1~ is invalid", 0)); + + // Generic "error" in unrelated context should NOT skip (removed from list) + assert!(!is_bad_example_context( + "The error handling uses gts.x.core.type.v1~", + "The error handling uses ".len() + )); + } +} diff --git a/gts-validator/tests/validate_fs_tests.rs b/gts-validator/tests/validate_fs_tests.rs new 
// (patch metadata: new file mode 100644, index 0000000..8ecb178,
//  b/gts-validator/tests/validate_fs_tests.rs, @@ -0,0 +1,336 @@)

//! Integration tests for `gts_validator::validate_fs`.

use std::fs;
use std::path::PathBuf;

use gts_validator::{FsSourceConfig, ValidationConfig, validate_fs};
use tempfile::TempDir;

// Shorthand for the default core validation settings.
fn default_validation_config() -> ValidationConfig {
    ValidationConfig::default()
}

// Build an FsSourceConfig rooted at `paths`. Field-by-field mutation is used
// instead of a struct literal — presumably because the type is marked
// #[non_exhaustive], which forbids literals outside the defining crate
// (NOTE(review): confirm against gts-validator/src/config.rs).
fn default_fs_config(paths: Vec<PathBuf>) -> FsSourceConfig {
    let mut cfg = FsSourceConfig::default();
    cfg.paths = paths;
    cfg
}

#[test]
fn test_validate_fs_empty_paths_errors() {
    let fs_config = default_fs_config(vec![]);
    let result = validate_fs(&fs_config, &default_validation_config());
    assert!(result.is_err());
    let msg = result.unwrap_err().to_string();
    assert!(msg.contains("No paths provided"), "got: {msg}");
}

#[test]
fn test_validate_fs_nonexistent_path_errors() {
    let fs_config = default_fs_config(vec![PathBuf::from("/nonexistent/path/abc123")]);
    let result = validate_fs(&fs_config, &default_validation_config());
    assert!(result.is_err());
    let msg = result.unwrap_err().to_string();
    assert!(msg.contains("does not exist"), "got: {msg}");
}

#[test]
fn test_validate_fs_valid_markdown() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("test.md");
    fs::write(&md, "# Title\n\nUses `gts.x.core.pkg.mytype.v1~` schema.\n").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();

    assert_eq!(report.files_scanned, 1);
    assert!(report.ok, "expected ok, got errors: {:?}", report.errors);
    assert_eq!(report.errors_count, 0);
}

// NOTE(review): this test and test_validate_fs_vendor_mismatch below exercise
// the same fixture and config; the only difference is which report fields are
// asserted. Consider merging them to cut duplication.
#[test]
fn test_validate_fs_invalid_markdown_vendor_mismatch() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("test.md");
    // Valid structure but wrong vendor — triggers vendor mismatch error
    fs::write(&md, "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let mut config = ValidationConfig::default();
    config.vendor = Some("x".to_owned());
    let report = validate_fs(&fs_config, &config).unwrap();

    assert_eq!(report.files_scanned, 1);
    assert!(!report.ok);
    assert!(report.errors_count > 0);
}

#[test]
fn test_validate_fs_vendor_mismatch() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("test.md");
    fs::write(&md, "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let mut config = ValidationConfig::default();
    config.vendor = Some("x".to_owned());
    let report = validate_fs(&fs_config, &config).unwrap();

    assert!(!report.ok);
    assert!(
        report
            .errors
            .iter()
            .any(|e| e.error.contains("Vendor mismatch")),
        "expected vendor mismatch error, got: {:?}",
        report.errors
    );
}

#[test]
fn test_validate_fs_valid_json() {
    let tmp = TempDir::new().unwrap();
    let json_file = tmp.path().join("test.json");
    fs::write(
        &json_file,
        r#"{"schema": "gts.x.core.pkg.mytype.v1~", "name": "test"}"#,
    )
    .unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();

    assert_eq!(report.files_scanned, 1);
    assert!(report.ok, "expected ok, got errors: {:?}", report.errors);
}

#[test]
fn test_validate_fs_valid_yaml() {
    let tmp = TempDir::new().unwrap();
    let yaml_file = tmp.path().join("test.yaml");
    fs::write(
        &yaml_file,
        "schema: gts.x.core.pkg.mytype.v1~\nname: test\n",
    )
    .unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();

    assert_eq!(report.files_scanned, 1);
    assert!(report.ok, "expected ok, got errors: {:?}", report.errors);
}

// Pins the machine-readable JSON report shape consumed by CI tooling.
#[test]
fn test_validate_fs_json_output_contract() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("test.md");
    fs::write(&md, "# Title\n\nUses `gts.x.core.pkg.mytype.v1~` schema.\n").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();

    // Verify JSON serialization contract
    let mut buf = Vec::new();
    gts_validator::output::write_json(&report, &mut buf).unwrap();
    let json: serde_json::Value = serde_json::from_slice(&buf).unwrap();

    assert!(json.get("files_scanned").is_some());
    assert!(json.get("errors_count").is_some());
    assert!(json.get("ok").is_some());
    assert!(json.get("errors").is_some());
    assert!(json["ok"].as_bool().unwrap());
}

#[test]
fn test_validate_fs_exclude_pattern() {
    let tmp = TempDir::new().unwrap();

    // Create an included file (valid) and an excluded file (would fail vendor check)
    let included = tmp.path().join("included.md");
    fs::write(
        &included,
        "# Title\n\nUses `gts.x.core.pkg.mytype.v1~` schema.\n",
    )
    .unwrap();

    let excluded_dir = tmp.path().join("excluded");
    fs::create_dir(&excluded_dir).unwrap();
    let excluded_md = excluded_dir.join("test.md");
    fs::write(
        &excluded_md,
        "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n",
    )
    .unwrap();

    let mut config = ValidationConfig::default();
    config.vendor = Some("x".to_owned());

    // Without exclude: should find vendor mismatch in excluded/test.md
    let mut fs_config_no_exclude = FsSourceConfig::default();
    fs_config_no_exclude.paths = vec![tmp.path().to_path_buf()];
    let report_no_exclude = validate_fs(&fs_config_no_exclude, &config).unwrap();
    assert_eq!(report_no_exclude.files_scanned, 2);
    assert!(
        !report_no_exclude.ok,
        "should find vendor mismatch without exclude"
    );

    // With exclude: excluded dir should be skipped, only included.md scanned
    let mut fs_config_with_exclude = FsSourceConfig::default();
    fs_config_with_exclude.paths = vec![tmp.path().to_path_buf()];
    fs_config_with_exclude.exclude = vec!["test.md".to_owned()];
    let report_with_exclude = validate_fs(&fs_config_with_exclude, &config).unwrap();
    assert_eq!(
        report_with_exclude.files_scanned, 1,
        "exclude should reduce file count"
    );
    assert!(
        report_with_exclude.ok,
        "only included.md (valid vendor) should remain"
    );
}

// Checks the human-readable report for a clean run (header, counts, no ERRORS
// section). The third argument to write_human is presumably a color toggle —
// NOTE(review): confirm against src/output.rs.
#[test]
fn test_write_human_success_output() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("test.md");
    fs::write(&md, "# Title\n\nUses `gts.x.core.pkg.mytype.v1~` schema.\n").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();

    let mut buf = Vec::new();
    gts_validator::output::write_human(&report, &mut buf, false).unwrap();
    let output = String::from_utf8(buf).unwrap();

    assert!(
        output.contains("GTS DOCUMENTATION VALIDATOR"),
        "missing header, got: {output}"
    );
    assert!(output.contains("Files scanned: 1"), "missing file count");
    assert!(output.contains("Errors found: 0"), "missing error count");
    assert!(
        output.contains("All 1 files passed"),
        "missing success message"
    );
    assert!(
        !output.contains("ERRORS"),
        "should not contain ERRORS section"
    );
}

#[test]
fn test_write_human_failure_output() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("test.md");
    fs::write(&md, "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let mut config = ValidationConfig::default();
    config.vendor = Some("x".to_owned());
    let report = validate_fs(&fs_config, &config).unwrap();

    let mut buf = Vec::new();
    gts_validator::output::write_human(&report, &mut buf, false).unwrap();
    let output = String::from_utf8(buf).unwrap();

    assert!(output.contains("ERRORS"), "missing ERRORS section");
    assert!(
        output.contains("Vendor mismatch"),
        "missing vendor mismatch hint"
    );
    assert!(
        output.contains("invalid GTS identifiers found"),
        "missing failure summary"
    );
    assert!(
        output.contains("Ensure all GTS IDs use the expected vendor"),
        "missing vendor hint"
    );
}

#[test]
fn test_validate_fs_no_matching_files_returns_ok() {
    let tmp = TempDir::new().unwrap();
    // Create a directory with no .md/.json/.yaml files
    let txt = tmp.path().join("readme.txt");
    fs::write(&txt, "This is not a markdown file").unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();
    assert_eq!(report.files_scanned, 0);
    assert!(report.ok, "empty scan should be ok, not an error");
}

#[test]
fn test_validate_fs_max_file_size_skips_large_files() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("big.md");
    // Write a file with a vendor-mismatch ID that would normally fail
    fs::write(&md, "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n").unwrap();

    // Set max_file_size to 10 bytes — file is larger, so it should be skipped
    let mut fs_config = FsSourceConfig::default();
    fs_config.paths = vec![tmp.path().to_path_buf()];
    fs_config.max_file_size = 10;

    let mut config = ValidationConfig::default();
    config.vendor = Some("x".to_owned());
    let report = validate_fs(&fs_config, &config).unwrap();

    assert_eq!(
        report.files_scanned, 0,
        "Oversized file should be skipped, not scanned"
    );
    assert!(report.ok, "No files scanned means ok");
}

#[test]
fn test_validate_fs_non_utf8_file_skipped() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("binary.md");
    // Write invalid UTF-8 bytes
    fs::write(&md, [0xFF, 0xFE, 0x00, 0x01, 0x80, 0x81]).unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);
    let report = validate_fs(&fs_config, &default_validation_config()).unwrap();

    // Non-UTF-8 file should be silently skipped (read_to_string fails)
    assert_eq!(report.files_scanned, 0, "Non-UTF-8 file should be skipped");
    assert!(report.ok);
}

#[test]
fn test_validate_fs_skip_tokens_integration() {
    let tmp = TempDir::new().unwrap();
    let md = tmp.path().join("bdd.md");
    // BDD-style content where **given** precedes a GTS ID with wrong vendor
    fs::write(
        &md,
        "# BDD\n\n**given** gts.y.core.pkg.mytype.v1~ is registered\n",
    )
    .unwrap();

    let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]);

    // Without skip_tokens: should report vendor mismatch
    let mut config_no_skip = ValidationConfig::default();
    config_no_skip.vendor = Some("x".to_owned());
    let report_no_skip = validate_fs(&fs_config, &config_no_skip).unwrap();
    assert!(
        !report_no_skip.ok,
        "Without skip_tokens, vendor mismatch should be reported"
    );

    // With skip_tokens: should suppress the error
    let mut config_skip = ValidationConfig::default();
    config_skip.vendor = Some("x".to_owned());
    config_skip.skip_tokens = vec!["**given**".to_owned()];
    let report_skip = validate_fs(&fs_config, &config_skip).unwrap();
    assert!(
        report_skip.ok,
        "With skip_tokens, the error should be suppressed: {:?}",
        report_skip.errors
    );
}

// ---- end of [PATCH 1/2] ----
// From 6cc2e363f9fdd0802cfe81c10b9c8be443f87ad1 Mon Sep 17 00:00:00 2001
// From: devjow
// Date: Thu, 19 Feb 2026 18:42:38 +0000
// Subject: [PATCH 2/2] feat: improve safety, correctness and architecture
//   (typo "archicture" corrected in subject)
// Signed-off-by: devjow
//
// Diffstat: Cargo.lock | 10 -, gts-validator/Cargo.toml | 5 +-,
//   gts-validator/src/config.rs | 60 ++++-, gts-validator/src/error.rs | 48 ++,
//   gts-validator/src/format/json.rs | 89 ++--, src/format/markdown.rs | 26 +-,
//   src/format/yaml.rs | 141 ++++--, src/lib.rs | 203 ++++++--,
//   src/output.rs | 143 ++++---, src/report.rs | 37 ++-,
//   src/strategy/fs.rs | 232 +++++++---, src/strategy/mod.rs | 7 -,
gts-validator/tests/validate_fs_tests.rs | 166 ++++++++++------ 13 files changed, 802 insertions(+), 365 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c933e45..716275c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -303,15 +303,6 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" -[[package]] -name = "colored" -version = "3.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -623,7 +614,6 @@ name = "gts-validator" version = "0.7.8" dependencies = [ "anyhow", - "colored", "glob", "gts", "regex", diff --git a/gts-validator/Cargo.toml b/gts-validator/Cargo.toml index dc473f3..623bb49 100644 --- a/gts-validator/Cargo.toml +++ b/gts-validator/Cargo.toml @@ -34,8 +34,5 @@ serde-saphyr.workspace = true # Error handling anyhow.workspace = true -# For colored terminal output -colored.workspace = true - [dev-dependencies] -tempfile = "3.8" +tempfile = "3.15" diff --git a/gts-validator/src/config.rs b/gts-validator/src/config.rs index b2c296c..3f11450 100644 --- a/gts-validator/src/config.rs +++ b/gts-validator/src/config.rs @@ -6,21 +6,45 @@ use std::path::PathBuf; +/// Vendor matching policy for GTS ID validation. +#[derive(Debug, Clone, Default)] +#[non_exhaustive] +pub enum VendorPolicy { + /// Accept any vendor (no vendor enforcement). + #[default] + Any, + /// All GTS IDs must match this exact vendor (example vendors are always tolerated). + MustMatch(String), + /// All GTS IDs must match one of the listed vendors (example vendors are always tolerated). + AllowList(Vec), +} + +/// Controls how GTS identifier candidates are discovered in markdown files. 
+#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[non_exhaustive] +pub enum DiscoveryMode { + /// Only match well-formed GTS patterns — fewer false positives (default). + #[default] + StrictSpecOnly, + /// Permissive regex catches ALL gts.* strings including malformed IDs. + /// Use for strict CI enforcement where every malformed ID must be reported. + Heuristic, +} + /// Core validation config — applies regardless of input source. #[derive(Debug, Clone, Default)] #[non_exhaustive] pub struct ValidationConfig { - /// Expected vendor for all GTS IDs (e.g., "x"). - /// Example vendors (acme, globex, etc.) are always tolerated. - pub vendor: Option, + /// Vendor matching policy for all GTS IDs. + /// Example vendors (acme, globex, etc.) are always tolerated regardless of policy. + pub vendor_policy: VendorPolicy, /// Scan JSON/YAML object keys for GTS identifiers (default: off). pub scan_keys: bool, - /// Enable relaxed discovery (catches more candidates, including malformed ones). + /// Discovery mode for markdown scanning. /// - /// - `false` (default): only well-formed GTS patterns are discovered — fewer false positives. - /// - `true`: a permissive regex catches ALL gts.* strings, including malformed IDs, - /// so they can be reported as errors. Use this for strict CI enforcement. - pub strict: bool, + /// - `StrictSpecOnly` (default): only well-formed GTS patterns are discovered. + /// - `Heuristic`: a permissive regex catches ALL gts.* strings, including malformed IDs. + pub discovery_mode: DiscoveryMode, /// Additional skip tokens for markdown scanning. /// If any of these strings appear before a GTS candidate on the same line, /// validation is skipped for that candidate. Case-insensitive matching. @@ -42,8 +66,21 @@ pub struct FsSourceConfig { pub exclude: Vec, /// Maximum file size in bytes (default: 10 MB). pub max_file_size: u64, - /// Whether to follow symbolic links (default: true — preserves current behavior). + /// Whether to follow symbolic links. 
+ /// + /// **Defaults to `false`** — following symlinks allows escaping the repository + /// root, traversing system directories, and reading secrets in CI environments. + /// Only enable if you explicitly trust all symlinks in the repository. pub follow_links: bool, + /// Maximum directory traversal depth (default: 64). + /// Prevents infinite recursion via deeply nested symlinks or directories. + pub max_depth: usize, + /// Maximum total number of files to scan (default: `100_000`). + /// Prevents memory exhaustion on pathological repositories. + pub max_files: usize, + /// Maximum total bytes to read across all files (default: 512 MB). + /// Prevents memory exhaustion when many large files are present. + pub max_total_bytes: u64, } impl Default for FsSourceConfig { @@ -52,7 +89,10 @@ impl Default for FsSourceConfig { paths: Vec::new(), exclude: Vec::new(), max_file_size: 10_485_760, - follow_links: true, + follow_links: false, + max_depth: 64, + max_files: 100_000, + max_total_bytes: 536_870_912, } } } diff --git a/gts-validator/src/error.rs b/gts-validator/src/error.rs index 109525c..ba46a2a 100644 --- a/gts-validator/src/error.rs +++ b/gts-validator/src/error.rs @@ -4,6 +4,54 @@ use std::path::PathBuf; use serde::Serialize; +/// The kind of scan-level failure that prevented a file from being validated. +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[non_exhaustive] +pub enum ScanErrorKind { + /// An I/O error occurred while reading the file. + IoError, + /// The file exceeded the configured maximum size limit. + FileTooLarge, + /// The file content could not be parsed as valid JSON. + JsonParseError, + /// The file content could not be parsed as valid YAML. + YamlParseError, + /// The file content is not valid UTF-8. + InvalidEncoding, + /// The resolved path is outside the repository root (symlink escape). + OutsideRepository, + /// A resource limit (`max_files` or `max_total_bytes`) was reached, truncating the scan. 
+ LimitExceeded, + /// A directory traversal error (permission denied, loop detected, etc.). + WalkError, + /// An exclude glob pattern could not be parsed. + InvalidExcludePattern, +} + +/// A scan-level error: a file that could not be validated at all. +/// +/// These are distinct from `ValidationError` (which represents a GTS ID that +/// was found and failed validation). A `ScanError` means the file could not +/// even be read or parsed — CI must treat these as failures. +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[non_exhaustive] +pub struct ScanError { + /// The file path that could not be scanned. + pub file: PathBuf, + /// The kind of failure. + pub kind: ScanErrorKind, + /// Human-readable description of the failure. + pub message: String, +} + +impl ScanError { + /// Format the error for human-readable output. + #[must_use] + pub fn format_human_readable(&self) -> String { + format!("{}: [scan error] {}", self.file.display(), self.message) + } +} + /// A single validation error found in a documentation/config file. #[derive(Debug, Clone, Serialize, PartialEq, Eq)] #[non_exhaustive] diff --git a/gts-validator/src/format/json.rs b/gts-validator/src/format/json.rs index 61701ee..2f684ba 100644 --- a/gts-validator/src/format/json.rs +++ b/gts-validator/src/format/json.rs @@ -6,53 +6,56 @@ use std::path::Path; use serde_json::Value; -use crate::error::ValidationError; +use crate::error::{ScanError, ScanErrorKind, ValidationError}; use crate::normalize::normalize_candidate; use crate::validator::validate_candidate; /// Scan JSON content for GTS identifiers. +/// +/// # Errors +/// +/// Returns a `ScanError` if the content is not valid JSON. +/// Invalid JSON must be reported as a scan failure — never silently ignored. 
pub fn scan_json_content( content: &str, path: &Path, vendor: Option<&str>, scan_keys: bool, -) -> Vec { - let value: Value = match serde_json::from_str(content) { - Ok(v) => v, - Err(_e) => return vec![], - }; +) -> Result, ScanError> { + let value: Value = serde_json::from_str(content).map_err(|e| ScanError { + file: path.to_owned(), + kind: ScanErrorKind::JsonParseError, + message: format!("JSON parse error: {e}"), + })?; let mut errors = Vec::new(); walk_json_value(&value, path, vendor, &mut errors, "$", scan_keys); - errors + Ok(errors) } -/// Scan a JSON file for GTS identifiers (file-based convenience wrapper). +/// Scan a JSON file for GTS identifiers (file-based convenience wrapper for tests). #[cfg(test)] pub fn scan_json_file( path: &Path, vendor: Option<&str>, max_file_size: u64, scan_keys: bool, -) -> Vec { - // Check file size - if let Ok(metadata) = std::fs::metadata(path) - && metadata.len() > max_file_size - { - return vec![]; - } +) -> Result, ScanError> { + use crate::strategy::fs::{ScanResult, read_file_bounded}; - // Read as UTF-8; skip file on encoding error - let content = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_e) => return vec![], + let content = match read_file_bounded(path, max_file_size) { + ScanResult::Ok(c) => c, + ScanResult::Err(e) => return Err(e), }; scan_json_content(&content, path, vendor, scan_keys) } /// Walk a JSON value tree and validate GTS identifiers in string values. -/// This is shared by both JSON and YAML scanners. +/// This is shared by both JSON and YAML scanners (YAML documents are +/// deserialized to `serde_json::Value` and validated through this same path). +/// Markdown scanning uses regex-based discovery instead, where the pattern +/// itself stops at tilde-followed-by-dot to naturally exclude filenames. 
pub fn walk_json_value( value: &Value, path: &Path, @@ -80,14 +83,18 @@ pub fn walk_json_value( // Only consider strings that look like GTS identifiers // Skip filenames that contain GTS IDs (e.g., "gts.x.core.type.v1~.schema.json") // A string is likely a filename if it contains a tilde followed by a dot and extension - let looks_like_filename = candidate_str.contains("~.") + let looks_like_filename = !candidate_str.starts_with("gts://") + && candidate_str.contains("~.") && candidate_str .rfind('.') .is_some_and(|pos| pos > candidate_str.rfind('~').unwrap_or(0)); - if (candidate_str.starts_with("gts://gts.") || candidate_str.starts_with("gts.")) - && !looks_like_filename - { + // For plain gts. strings, skip if it looks like a filename (e.g., "gts.x.type.v1~.schema.json") + if looks_like_filename { + return; + } + + if candidate_str.starts_with("gts://gts.") || candidate_str.starts_with("gts.") { match normalize_candidate(candidate_str) { Ok(candidate) => { let allow_wildcards = is_xgts_ref; @@ -197,7 +204,7 @@ mod tests { fn test_scan_json_valid_id() { let content = r#"{"$id": "gts://gts.x.core.events.type.v1~"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); } @@ -205,7 +212,7 @@ mod tests { fn test_scan_json_invalid_id() { let content = r#"{"$id": "gts.invalid"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!(!errors.is_empty()); } @@ -213,7 +220,7 @@ mod tests { fn test_scan_json_xgts_ref_wildcard() { let content = r#"{"x-gts-ref": "gts.x.core.*"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, 
false).unwrap(); assert!( errors.is_empty(), "Wildcards in x-gts-ref should be allowed" @@ -224,7 +231,7 @@ mod tests { fn test_scan_json_xgts_ref_bare_wildcard() { let content = r#"{"x-gts-ref": "*"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Bare wildcard in x-gts-ref should be skipped" @@ -235,7 +242,7 @@ mod tests { fn test_scan_json_xgts_ref_relative_pointer() { let content = r#"{"x-gts-ref": "/$id"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Relative pointers in x-gts-ref should be skipped" @@ -252,7 +259,7 @@ mod tests { } }"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Nested values should be found and validated" @@ -268,7 +275,7 @@ mod tests { ] }"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Array values should be found and validated" @@ -276,21 +283,23 @@ mod tests { } #[test] - fn test_scan_json_invalid_json() { + fn test_scan_json_invalid_json_is_scan_error() { let content = r#"{"invalid": json}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let result = scan_json_file(file.path(), None, 10_485_760, false); assert!( - errors.is_empty(), - "Invalid JSON should be skipped with warning" + result.is_err(), + "Invalid JSON must produce a ScanError, not silent success" ); + let err = result.unwrap_err(); + 
assert_eq!(err.kind, crate::error::ScanErrorKind::JsonParseError); } #[test] fn test_scan_json_error_includes_json_path() { let content = r#"{"properties": {"type": {"x-gts-ref": "gts.invalid"}}}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!(!errors.is_empty()); assert!(errors[0].json_path.contains("properties.type.x-gts-ref")); } @@ -299,7 +308,7 @@ mod tests { fn test_scan_json_vendor_mismatch() { let content = r#"{"$id": "gts://gts.hx.core.events.type.v1~"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), Some("x"), 10_485_760, false); + let errors = scan_json_file(file.path(), Some("x"), 10_485_760, false).unwrap(); assert!(!errors.is_empty()); assert!(errors[0].error.contains("Vendor mismatch")); } @@ -308,7 +317,7 @@ mod tests { fn test_scan_json_keys_not_scanned_by_default() { let content = r#"{"gts.x.core.type.v1~": "value"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, false); + let errors = scan_json_file(file.path(), None, 10_485_760, false).unwrap(); assert!(errors.is_empty(), "Keys should not be scanned by default"); } @@ -316,7 +325,7 @@ mod tests { fn test_scan_json_keys_scanned_when_enabled() { let content = r#"{"gts.x.core.events.type.v1~": "value"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, true); + let errors = scan_json_file(file.path(), None, 10_485_760, true).unwrap(); assert!( errors.is_empty(), "Valid GTS ID keys should pass validation" @@ -327,7 +336,7 @@ mod tests { fn test_scan_json_invalid_key_when_scanning_enabled() { let content = r#"{"gts.invalid": "value"}"#; let file = create_temp_json(content); - let errors = scan_json_file(file.path(), None, 10_485_760, true); + let errors = scan_json_file(file.path(), None, 10_485_760, true).unwrap(); 
assert!( !errors.is_empty(), "Invalid GTS ID keys should be caught when key scanning is enabled" diff --git a/gts-validator/src/format/markdown.rs b/gts-validator/src/format/markdown.rs index a808e27..cc3b2c9 100644 --- a/gts-validator/src/format/markdown.rs +++ b/gts-validator/src/format/markdown.rs @@ -91,10 +91,10 @@ pub fn scan_markdown_content( content: &str, path: &Path, vendor: Option<&str>, - strict: bool, + heuristic: bool, skip_tokens: &[String], ) -> Vec { - let pattern = if strict { + let pattern = if heuristic { &*GTS_DISCOVERY_PATTERN_RELAXED } else { &*GTS_DISCOVERY_PATTERN_WELL_FORMED @@ -221,7 +221,7 @@ pub fn scan_markdown_file( path: &Path, vendor: Option<&str>, max_file_size: u64, - strict: bool, + heuristic: bool, ) -> Vec { // Check file size if let Ok(metadata) = std::fs::metadata(path) @@ -236,7 +236,7 @@ pub fn scan_markdown_file( Err(_e) => return vec![], }; - scan_markdown_content(&content, path, vendor, strict, &[]) + scan_markdown_content(&content, path, vendor, heuristic, &[]) } #[cfg(test)] @@ -373,14 +373,14 @@ gts.invalid.pattern.here.v1~ } #[test] - fn test_scan_markdown_strict_mode_catches_malformed() { + fn test_scan_markdown_heuristic_mode_catches_malformed() { let file = create_temp_md("The type is gts.my-vendor.core.events.type.v1~"); - let errors_strict = scan_markdown_file(file.path(), None, 10_485_760, true); + let errors_heuristic = scan_markdown_file(file.path(), None, 10_485_760, true); let errors_normal = scan_markdown_file(file.path(), None, 10_485_760, false); assert!( - !errors_strict.is_empty(), - "Strict mode should catch malformed ID with hyphens" + !errors_heuristic.is_empty(), + "Heuristic mode should catch malformed ID with hyphens" ); assert!( errors_normal.is_empty(), @@ -389,13 +389,13 @@ gts.invalid.pattern.here.v1~ } #[test] - fn test_scan_markdown_strict_mode_catches_extra_dots() { + fn test_scan_markdown_heuristic_mode_catches_extra_dots() { let file = create_temp_md("The type is 
gts.x.core.events.type.name.v1~"); - let errors_strict = scan_markdown_file(file.path(), None, 10_485_760, true); + let errors_heuristic = scan_markdown_file(file.path(), None, 10_485_760, true); assert!( - !errors_strict.is_empty(), - "Strict mode should catch ID with extra segments" + !errors_heuristic.is_empty(), + "Heuristic mode should catch ID with extra segments" ); } @@ -418,7 +418,7 @@ gts.invalid.pattern.here.v1~ content, Path::new("test.md"), None, - true, // strict to ensure the relaxed regex would catch it + true, // heuristic mode to ensure the relaxed regex would catch it &["**given**".to_owned()], ); assert!( diff --git a/gts-validator/src/format/yaml.rs b/gts-validator/src/format/yaml.rs index 383b01d..d2ba70f 100644 --- a/gts-validator/src/format/yaml.rs +++ b/gts-validator/src/format/yaml.rs @@ -6,7 +6,7 @@ use std::path::Path; use serde_json::Value; -use crate::error::ValidationError; +use crate::error::{ScanError, ScanErrorKind, ValidationError}; use crate::format::json::walk_json_value; fn split_yaml_documents(content: &str) -> Vec { @@ -34,63 +34,96 @@ fn split_yaml_documents(content: &str) -> Vec { } /// Scan YAML content for GTS identifiers. +/// +/// Returns `(validation_errors, scan_errors)`: +/// - `validation_errors`: GTS ID validation failures found in successfully-parsed documents. +/// - `scan_errors`: per-document parse failures in multi-document streams, or a single +/// file-level parse failure if no document could be parsed at all. +/// +/// This separation ensures malformed YAML documents are counted in `failed_files` +/// and never silently mixed into the validation error layer. pub fn scan_yaml_content( content: &str, path: &Path, vendor: Option<&str>, scan_keys: bool, -) -> Vec { - let mut errors = Vec::new(); +) -> (Vec, Vec) { + let mut validation_errors = Vec::new(); + let mut scan_errors = Vec::new(); // Parse all documents with the YAML stream parser first. 
// If this fails (e.g., one malformed document in the stream), fall back to per-document // parsing so valid sibling documents are still validated. let documents: Vec = match serde_saphyr::from_multiple(content) { Ok(docs) => docs, - Err(_e) => { - for segment in split_yaml_documents(content) { - let value: Value = match serde_saphyr::from_str(&segment) { - Ok(doc) => doc, - Err(_segment_err) => continue, - }; - - walk_json_value(&value, path, vendor, &mut errors, "$", scan_keys); + Err(stream_err) => { + let segments = split_yaml_documents(content); + let mut any_parsed = false; + + for (idx, segment) in segments.iter().enumerate() { + match serde_saphyr::from_str::(segment) { + Ok(doc) => { + any_parsed = true; + walk_json_value(&doc, path, vendor, &mut validation_errors, "$", scan_keys); + } + Err(doc_err) => { + // Per-document parse failure → ScanError (not ValidationError) + scan_errors.push(ScanError { + file: path.to_owned(), + kind: ScanErrorKind::YamlParseError, + message: format!( + "YAML parse error in document {} of multi-document stream: {doc_err}", + idx + 1 + ), + }); + } + } + } + + if !any_parsed { + // No document parsed at all — replace per-doc errors with a single file-level error + scan_errors.clear(); + scan_errors.push(ScanError { + file: path.to_owned(), + kind: ScanErrorKind::YamlParseError, + message: format!("YAML parse error: {stream_err}"), + }); } - return errors; + return (validation_errors, scan_errors); } }; for value in documents { - // Reuse the JSON walker since both operate on serde_json::Value - walk_json_value(&value, path, vendor, &mut errors, "$", scan_keys); + walk_json_value(&value, path, vendor, &mut validation_errors, "$", scan_keys); } - errors + (validation_errors, scan_errors) } -/// Scan a YAML file for GTS identifiers (file-based convenience wrapper). +/// Scan a YAML file for GTS identifiers (file-based convenience wrapper for tests). 
+/// +/// Returns `Err` if the file cannot be read or if any scan-level error occurred. +/// Returns `Ok(validation_errors)` if the file was parsed (even partially for multi-doc streams). #[cfg(test)] pub fn scan_yaml_file( path: &Path, vendor: Option<&str>, max_file_size: u64, scan_keys: bool, -) -> Vec { - // Check file size - if let Ok(metadata) = std::fs::metadata(path) - && metadata.len() > max_file_size - { - return vec![]; - } +) -> Result, ScanError> { + use crate::strategy::fs::{ScanResult, read_file_bounded}; - // Read as UTF-8; skip file on encoding error - let content = match std::fs::read_to_string(path) { - Ok(c) => c, - Err(_e) => return vec![], + let content = match read_file_bounded(path, max_file_size) { + ScanResult::Ok(c) => c, + ScanResult::Err(e) => return Err(e), }; - scan_yaml_content(&content, path, vendor, scan_keys) + let (val_errs, scan_errs) = scan_yaml_content(&content, path, vendor, scan_keys); + if let Some(first_scan_err) = scan_errs.into_iter().next() { + return Err(first_scan_err); + } + Ok(val_errs) } #[cfg(test)] @@ -111,7 +144,7 @@ mod tests { $id: gts://gts.x.core.events.type.v1~ "; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false).unwrap(); assert!(errors.is_empty(), "Unexpected errors: {errors:?}"); } @@ -121,7 +154,7 @@ $id: gts://gts.x.core.events.type.v1~ $id: gts.invalid "; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false).unwrap(); assert!(!errors.is_empty()); } @@ -131,7 +164,7 @@ $id: gts.invalid x-gts-ref: gts.x.core.* "; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Wildcards in x-gts-ref should be 
allowed" @@ -144,7 +177,7 @@ x-gts-ref: gts.x.core.* x-gts-ref: "*" "#; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Bare wildcard in x-gts-ref should be skipped" @@ -159,7 +192,7 @@ properties: x-gts-ref: gts.x.core.events.type.v1~ "; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Nested values should be found and validated" @@ -174,7 +207,7 @@ capabilities: - gts.x.core.events.topic.v1~ "; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let errors = scan_yaml_file(file.path(), None, 10_485_760, false).unwrap(); assert!( errors.is_empty(), "Array values should be found and validated" @@ -182,16 +215,17 @@ capabilities: } #[test] - fn test_scan_yaml_invalid_yaml() { - let content = r" -invalid: yaml: syntax: -"; + fn test_scan_yaml_invalid_yaml_is_scan_error() { + // Completely invalid YAML (not parseable as any document) must be a ScanError + let content = ": : :\n - [unclosed\n"; let file = create_temp_yaml(content); - let errors = scan_yaml_file(file.path(), None, 10_485_760, false); + let result = scan_yaml_file(file.path(), None, 10_485_760, false); assert!( - errors.is_empty(), - "Invalid YAML should be skipped with warning" + result.is_err(), + "Completely invalid YAML must produce a ScanError, not silent success" ); + let err = result.unwrap_err(); + assert_eq!(err.kind, crate::error::ScanErrorKind::YamlParseError); } #[test] @@ -202,10 +236,15 @@ $id: gts.x.core.events.type.v1~ --- $id: gts.invalid "; - let errors = scan_yaml_content(content, Path::new("multi.yaml"), None, false); + let (val_errs, scan_errs) = + scan_yaml_content(content, Path::new("multi.yaml"), 
None, false); + assert!( + scan_errs.is_empty(), + "No scan errors expected for well-formed stream: {scan_errs:?}" + ); // Both documents are parsed — gts.invalid in doc 2 must produce an error assert!( - !errors.is_empty(), + !val_errs.is_empty(), "Multi-document YAML: second document with invalid ID should be caught, got no errors" ); } @@ -221,11 +260,21 @@ invalid: yaml: syntax: $id: gts.y.core.pkg.mytype.v1~ "; // With vendor "x", both valid docs should produce vendor-mismatch errors. - // If the malformed middle doc caused an early return, errors would be empty. - let errors = scan_yaml_content(content, Path::new("multi.yaml"), Some("x"), false); + // The malformed middle doc must produce a ScanError, not suppress valid docs. + let (val_errs, scan_errs) = + scan_yaml_content(content, Path::new("multi.yaml"), Some("x"), false); assert!( - !errors.is_empty(), + !val_errs.is_empty(), "Valid documents must be validated even when a sibling document is malformed, got no errors" ); + assert!( + !scan_errs.is_empty(), + "Malformed document must produce a ScanError, got none" + ); + assert_eq!( + scan_errs[0].kind, + crate::error::ScanErrorKind::YamlParseError, + "Malformed doc scan error must have YamlParseError kind" + ); } } diff --git a/gts-validator/src/lib.rs b/gts-validator/src/lib.rs index 83bff09..b25c7ab 100644 --- a/gts-validator/src/lib.rs +++ b/gts-validator/src/lib.rs @@ -9,18 +9,19 @@ //! //! ```rust,no_run //! use std::path::PathBuf; -//! use gts_validator::{validate_fs, FsSourceConfig, ValidationConfig}; +//! use gts_validator::{validate_fs, FsSourceConfig, ValidationConfig, VendorPolicy}; //! //! let mut fs_config = FsSourceConfig::default(); //! fs_config.paths = vec![PathBuf::from("docs"), PathBuf::from("modules")]; //! fs_config.exclude = vec!["target/*".to_owned()]; //! //! let mut validation_config = ValidationConfig::default(); -//! validation_config.vendor = Some("x".to_owned()); +//! 
validation_config.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); //! //! let report = validate_fs(&fs_config, &validation_config).unwrap(); -//! println!("Files scanned: {}", report.files_scanned); -//! println!("Errors: {}", report.errors_count); +//! println!("Files scanned: {}", report.scanned_files); +//! println!("Validation errors: {}", report.errors_count()); +//! println!("Scan errors: {}", report.scan_errors.len()); //! println!("OK: {}", report.ok); //! ``` @@ -33,26 +34,28 @@ mod report; mod strategy; mod validator; -pub use config::{FsSourceConfig, ValidationConfig}; -pub use error::ValidationError; +pub use config::{DiscoveryMode, FsSourceConfig, ValidationConfig, VendorPolicy}; +pub use error::{ScanError, ScanErrorKind, ValidationError}; pub use report::ValidationReport; use strategy::ContentFormat; -use strategy::fs::{content_format_for, find_files, read_validation_item}; +use strategy::fs::{ScanResult, content_format_for, find_files, read_file_bounded}; /// Validate GTS identifiers in files on disk. /// -/// This is the primary public API for Phase 1. +/// This is the primary public API. /// /// # Arguments /// -/// * `fs_config` - Filesystem-specific source options (paths, exclude, max file size, etc.) -/// * `validation_config` - Core validation config (vendor, `scan_keys`, strict) +/// * `fs_config` - Filesystem-specific source options (paths, exclude, max file size, limits) +/// * `validation_config` - Core validation config (vendor policy, `scan_keys`, discovery mode) /// /// # Errors /// /// Returns an error if `fs_config.paths` is empty or if any provided path does not exist. -/// Returns `Ok` with `files_scanned: 0` if paths exist but contain no scannable files. +/// Returns `Ok` with `scanned_files: 0` if paths exist but contain no scannable files. +/// Scan failures (unreadable files, parse errors, etc.) are reported in `report.scan_errors` +/// and never silently discarded. 
pub fn validate_fs( fs_config: &FsSourceConfig, validation_config: &ValidationConfig, @@ -61,65 +64,177 @@ pub fn validate_fs( anyhow::bail!("No paths provided for validation"); } - // Validate explicitly provided paths exist for path in &fs_config.paths { if !path.exists() { anyhow::bail!("Path does not exist: {}", path.display()); } } - let files = find_files(fs_config); + let (files, mut scan_errors) = find_files(fs_config); - if files.is_empty() { + if files.is_empty() && scan_errors.is_empty() { return Ok(ValidationReport { - files_scanned: 0, - errors_count: 0, + scanned_files: 0, + failed_files: 0, ok: true, - errors: vec![], + validation_errors: vec![], + scan_errors: vec![], }); } - let mut errors = Vec::new(); - let vendor = validation_config.vendor.as_deref(); - let mut files_scanned: usize = 0; + let heuristic = validation_config.discovery_mode == DiscoveryMode::Heuristic; + // For AllowList, pass a sentinel vendor that no real GTS ID can match. + // This causes validate_candidate to emit "Vendor mismatch" for every non-example + // vendor, and apply_allow_list_filter then removes the allowed ones — leaving only + // genuinely disallowed vendors as errors. + let effective_vendor = effective_vendor_for_scanning(&validation_config.vendor_policy); - for file_path in &files { - let Some(item) = read_validation_item(file_path, fs_config.max_file_size) else { - continue; // skip unreadable/oversized files — don't count as scanned + let mut validation_errors = Vec::new(); + let mut scanned_files: usize = 0; + // Discovery-stage failures (walk errors, boundary violations, canonicalization errors) + // are already in scan_errors from find_files. Count them as failed files upfront. 
+ let mut failed_files: usize = scan_errors.len(); + let mut total_bytes: u64 = 0; + + 'files: for file_path in &files { + if scanned_files + failed_files >= fs_config.max_files { + scan_errors.push(ScanError { + file: file_path.clone(), + kind: ScanErrorKind::LimitExceeded, + message: format!( + "Scan aborted: max_files limit ({}) reached; remaining files not scanned", + fs_config.max_files + ), + }); + failed_files += 1; + break; + } + + let content = match read_file_bounded(file_path, fs_config.max_file_size) { + ScanResult::Ok(c) => c, + ScanResult::Err(e) => { + scan_errors.push(e); + failed_files += 1; + continue; + } }; + let file_bytes = content.len() as u64; + if total_bytes.saturating_add(file_bytes) > fs_config.max_total_bytes { + scan_errors.push(ScanError { + file: file_path.clone(), + kind: ScanErrorKind::LimitExceeded, + message: format!( + "Scan aborted: max_total_bytes limit ({}) reached; remaining files not scanned", + fs_config.max_total_bytes + ), + }); + failed_files += 1; + break; + } + total_bytes = total_bytes.saturating_add(file_bytes); + + let vendor = effective_vendor.as_deref(); let file_errors = match content_format_for(file_path) { Some(ContentFormat::Markdown) => format::markdown::scan_markdown_content( - &item.content, + &content, file_path, vendor, - validation_config.strict, + heuristic, &validation_config.skip_tokens, ), - Some(ContentFormat::Json) => format::json::scan_json_content( - &item.content, - file_path, - vendor, - validation_config.scan_keys, - ), - Some(ContentFormat::Yaml) => format::yaml::scan_yaml_content( - &item.content, - file_path, - vendor, - validation_config.scan_keys, - ), + Some(ContentFormat::Json) => { + match format::json::scan_json_content( + &content, + file_path, + vendor, + validation_config.scan_keys, + ) { + Ok(errs) => errs, + Err(scan_err) => { + scan_errors.push(scan_err); + failed_files += 1; + continue 'files; + } + } + } + Some(ContentFormat::Yaml) => { + let (val_errs, yaml_scan_errs) = 
format::yaml::scan_yaml_content( + &content, + file_path, + vendor, + validation_config.scan_keys, + ); + if !yaml_scan_errs.is_empty() { + failed_files += 1; + scan_errors.extend(yaml_scan_errs); + } + val_errs + } None => continue, }; - files_scanned += 1; - errors.extend(file_errors); + scanned_files += 1; + + // For AllowList: filter out errors where the vendor IS in the allow list. + // The sentinel vendor caused mismatches for all vendors; remove the allowed ones. + let file_errors = apply_allow_list_filter(file_errors, &validation_config.vendor_policy); + validation_errors.extend(file_errors); } - let errors_count = errors.len(); + let ok = validation_errors.is_empty() && scan_errors.is_empty(); Ok(ValidationReport { - files_scanned, - errors_count, - ok: errors.is_empty(), - errors, + scanned_files, + failed_files, + ok, + validation_errors, + scan_errors, }) } + +/// Determine the effective vendor string to pass to scanners for a given policy. +/// +/// - `Any` → `None` (no vendor enforcement). +/// - `MustMatch(v)` → `Some(v)` (scanner enforces exact match directly). +/// - `AllowList(_)` → `Some("\x00")` (sentinel that no real GTS vendor can match). +/// GTS vendors must be lowercase alphanumeric, so `\x00` is guaranteed to never +/// equal any real vendor. This causes `validate_candidate` to emit "Vendor mismatch" +/// for every non-example vendor, and `apply_allow_list_filter` then removes the +/// vendors that are in the allow list — leaving only genuinely disallowed vendors. +fn effective_vendor_for_scanning(policy: &VendorPolicy) -> Option { + match policy { + VendorPolicy::Any => None, + VendorPolicy::MustMatch(v) => Some(v.clone()), + VendorPolicy::AllowList(_) => Some("\x00".to_owned()), + } +} + +/// For `VendorPolicy::AllowList`, remove validation errors whose vendor IS in the list. +/// +/// Scanners run with a sentinel vendor (`\x00`) that generates "Vendor mismatch" for +/// every non-example vendor. 
This function retains only errors where the vendor is NOT +/// in the allow list — i.e., genuinely disallowed vendors produce errors. +fn apply_allow_list_filter( + errors: Vec, + policy: &VendorPolicy, +) -> Vec { + let VendorPolicy::AllowList(allowed) = policy else { + return errors; + }; + + errors + .into_iter() + .filter(|e| { + // Keep the error only if it is NOT a vendor-mismatch for an allowed vendor. + // Vendor-mismatch errors contain "Vendor mismatch" in the message. + // Extract the actual vendor from normalized_id (first segment before '.'). + if !e.error.contains("Vendor mismatch") { + return true; // non-vendor errors always kept + } + // normalized_id format: "gts....." + // The vendor is the second dot-separated segment (index 1). + let id_vendor = e.normalized_id.split('.').nth(1).unwrap_or(""); + !allowed.iter().any(|a| a == id_vendor) + }) + .collect() +} diff --git a/gts-validator/src/output.rs b/gts-validator/src/output.rs index 72709fc..ac3d778 100644 --- a/gts-validator/src/output.rs +++ b/gts-validator/src/output.rs @@ -1,11 +1,11 @@ //! Shared output formatting for validation reports. //! -//! Provides JSON and human-readable formatters for `ValidationReport`. +//! Provides JSON and plain-text formatters for `ValidationReport`. +//! Color/terminal formatting is intentionally excluded from this core module — +//! that concern belongs to the CLI layer. use std::io::Write; -use colored::Colorize; - use crate::report::ValidationReport; /// Format a `ValidationReport` as JSON to a writer. @@ -19,104 +19,101 @@ pub fn write_json(report: &ValidationReport, writer: &mut dyn Write) -> anyhow:: Ok(()) } -/// Format a `ValidationReport` as human-readable text to a writer. +/// Format a `ValidationReport` as human-readable plain text to a writer. +/// +/// Color/ANSI formatting is the responsibility of the caller (CLI layer). /// /// # Errors /// /// Returns an error if writing fails. 
-pub fn write_human( - report: &ValidationReport, - writer: &mut dyn Write, - use_color: bool, -) -> anyhow::Result<()> { +pub fn write_human(report: &ValidationReport, writer: &mut dyn Write) -> anyhow::Result<()> { writeln!(writer)?; writeln!(writer, "{}", "=".repeat(80))?; - if use_color { - writeln!(writer, " {}", "GTS DOCUMENTATION VALIDATOR".bold())?; - } else { - writeln!(writer, " GTS DOCUMENTATION VALIDATOR")?; - } + writeln!(writer, " GTS DOCUMENTATION VALIDATOR")?; writeln!(writer, "{}", "=".repeat(80))?; writeln!(writer)?; - writeln!(writer, " Files scanned: {}", report.files_scanned)?; - writeln!(writer, " Errors found: {}", report.errors_count)?; + writeln!(writer, " Files scanned: {}", report.scanned_files)?; + writeln!(writer, " Files failed: {}", report.failed_files)?; + writeln!(writer, " Errors found: {}", report.errors_count())?; writeln!(writer)?; - if !report.errors.is_empty() { + if !report.scan_errors.is_empty() { writeln!(writer, "{}", "-".repeat(80))?; - if use_color { - writeln!(writer, " {}", "ERRORS".red().bold())?; - } else { - writeln!(writer, " ERRORS")?; - } + writeln!(writer, " SCAN ERRORS (files that could not be validated)")?; writeln!(writer, "{}", "-".repeat(80))?; + for scan_err in &report.scan_errors { + writeln!(writer, "{}", scan_err.format_human_readable())?; + } + writeln!(writer)?; + } - // Print errors - for error in &report.errors { - let formatted = error.format_human_readable(); - if use_color { - writeln!(writer, "{}", formatted.red())?; - } else { - writeln!(writer, "{formatted}")?; - } + if !report.validation_errors.is_empty() { + writeln!(writer, "{}", "-".repeat(80))?; + writeln!(writer, " VALIDATION ERRORS")?; + writeln!(writer, "{}", "-".repeat(80))?; + for error in &report.validation_errors { + writeln!(writer, "{}", error.format_human_readable())?; } writeln!(writer)?; } writeln!(writer, "{}", "=".repeat(80))?; if report.ok { - let msg = format!( + writeln!( + writer, "\u{2713} All {} files passed 
validation", - report.files_scanned - ); - if use_color { - writeln!(writer, "{}", msg.green())?; - } else { - writeln!(writer, "{msg}")?; - } + report.scanned_files + )?; } else { - let msg = format!( - "\u{2717} {} invalid GTS identifiers found", - report.errors_count - ); - if use_color { - writeln!(writer, "{}", msg.red())?; - } else { - writeln!(writer, "{msg}")?; - } - writeln!(writer)?; - writeln!(writer, " To fix:")?; - - // Only show hints relevant to the actual errors found - let has_vendor_mismatch = report - .errors - .iter() - .any(|e| e.error.contains("Vendor mismatch")); - let has_wildcard_error = report.errors.iter().any(|e| e.error.contains("Wildcard")); - let has_parse_error = report - .errors - .iter() - .any(|e| !e.error.contains("Vendor mismatch") && !e.error.contains("Wildcard")); - - if has_parse_error { - writeln!( - writer, - " - Schema IDs must end with ~ (e.g., gts.x.core.type.v1~)" - )?; + if !report.scan_errors.is_empty() { writeln!( writer, - " - Each segment needs 5 parts: vendor.package.namespace.type.version" + "\u{2717} {} file(s) could not be scanned \u{2014} CI must treat this as a failure", + report.failed_files )?; - writeln!(writer, " - No hyphens allowed, use underscores")?; } - if has_wildcard_error { + if !report.validation_errors.is_empty() { writeln!( writer, - " - Wildcards (*) only in filter/pattern contexts" + "\u{2717} {} invalid GTS identifier(s) found", + report.errors_count() )?; - } - if has_vendor_mismatch { - writeln!(writer, " - Ensure all GTS IDs use the expected vendor")?; + writeln!(writer)?; + writeln!(writer, " To fix:")?; + + let has_vendor_mismatch = report + .validation_errors + .iter() + .any(|e| e.error.contains("Vendor mismatch")); + let has_wildcard_error = report + .validation_errors + .iter() + .any(|e| e.error.contains("Wildcard")); + let has_parse_error = report + .validation_errors + .iter() + .any(|e| !e.error.contains("Vendor mismatch") && !e.error.contains("Wildcard")); + + if 
has_parse_error { + writeln!( + writer, + " - Schema IDs must end with ~ (e.g., gts.x.core.type.v1~)" + )?; + writeln!( + writer, + " - Each segment needs 5 parts: vendor.package.namespace.type.version" + )?; + writeln!(writer, " - No hyphens allowed, use underscores")?; + } + if has_wildcard_error { + writeln!( + writer, + " - Wildcards (*) only in filter/pattern contexts" + )?; + } + if has_vendor_mismatch { + writeln!(writer, " - Ensure all GTS IDs use the expected vendor")?; + } } } writeln!(writer, "{}", "=".repeat(80))?; diff --git a/gts-validator/src/report.rs b/gts-validator/src/report.rs index ca78365..87a1fdf 100644 --- a/gts-validator/src/report.rs +++ b/gts-validator/src/report.rs @@ -2,18 +2,39 @@ use serde::Serialize; -use crate::error::ValidationError; +use crate::error::{ScanError, ValidationError}; /// Result of a validation run. +/// +/// CI pipelines must check both `validation_errors` and `scan_errors`. +/// A non-empty `scan_errors` means the validator did not fully run — +/// treat this as a build failure regardless of `validation_errors`. #[derive(Debug, Clone, Serialize)] #[non_exhaustive] pub struct ValidationReport { - /// Number of files scanned. - pub files_scanned: usize, - /// Number of errors found. - pub errors_count: usize, - /// Whether all files passed validation. + /// Number of files successfully scanned (read + parsed). + pub scanned_files: usize, + /// Number of files that could not be scanned (read/parse failures). + pub failed_files: usize, + /// Whether all scanned files passed validation AND no scan errors occurred. pub ok: bool, - /// Individual validation errors. - pub errors: Vec, + /// Individual GTS ID validation errors found in scanned files. + pub validation_errors: Vec, + /// Scan-level errors: files that could not be read or parsed. + /// Non-empty means the validator did not fully cover the repository. + pub scan_errors: Vec, +} + +impl ValidationReport { + /// Total number of files attempted (scanned + failed). 
+ #[must_use] + pub fn files_attempted(&self) -> usize { + self.scanned_files + self.failed_files + } + + /// Number of validation errors found. + #[must_use] + pub fn errors_count(&self) -> usize { + self.validation_errors.len() + } } diff --git a/gts-validator/src/strategy/fs.rs b/gts-validator/src/strategy/fs.rs index 1069870..d3191d7 100644 --- a/gts-validator/src/strategy/fs.rs +++ b/gts-validator/src/strategy/fs.rs @@ -1,14 +1,22 @@ //! Filesystem validation source. //! -//! Discovers files on disk and yields `ValidationItem`s for the validation pipeline. +//! Discovers files on disk and reads them safely for the validation pipeline. +//! Security properties enforced here: +//! - Symlinks are not followed by default (`follow_links: false`) +//! - Resolved paths are checked to remain within the repository root +//! - Device files, pipes, and sockets are skipped +//! - Maximum directory depth is enforced to prevent infinite recursion +//! - Bounded streaming reads prevent TOCTOU and memory `DoS` +use std::io::Read; use std::path::{Path, PathBuf}; use glob::Pattern; use walkdir::WalkDir; use crate::config::FsSourceConfig; -use crate::strategy::{ContentFormat, ValidationItem}; +use crate::error::{ScanError, ScanErrorKind}; +use crate::strategy::ContentFormat; /// Directories to skip pub const SKIP_DIRS: &[&str] = &["target", "node_modules", ".git", "vendor", ".gts-spec"]; @@ -18,6 +26,14 @@ pub const SKIP_DIRS: &[&str] = &["target", "node_modules", ".git", "vendor", ".g /// This list is reserved for files that are universally irrelevant across GTS repos. pub const SKIP_FILES: &[&str] = &[]; +/// Result of attempting to read a file for scanning. +pub enum ScanResult { + /// File was read successfully; contains the UTF-8 content. + Ok(String), + /// File could not be read or validated; contains the scan error. 
+ Err(ScanError), +} + /// Check if a path matches any of the exclude patterns fn matches_exclude(path: &Path, exclude_patterns: &[Pattern]) -> bool { let path_str = path.to_string_lossy(); @@ -53,60 +69,144 @@ fn matches_file_pattern(path: &Path) -> bool { } /// Find all files to scan in the given paths. -#[must_use] -pub fn find_files(config: &FsSourceConfig) -> Vec { +/// +/// Returns `(files, scan_errors)`: +/// - `files`: paths that passed all filters and are ready to read. +/// - `scan_errors`: walk errors (permission denied, loop, etc.) and boundary violations. +/// These are never silently discarded — CI must treat them as failures. +pub fn find_files(config: &FsSourceConfig) -> (Vec, Vec) { let mut files = Vec::new(); + let mut scan_errors = Vec::new(); - // Parse exclude patterns - let exclude_patterns: Vec = config - .exclude - .iter() - .filter_map(|p| Pattern::new(p).ok()) - .collect(); - - for path in &config.paths { - if path.is_file() { - if matches_file_pattern(path) && !matches_exclude(path, &exclude_patterns) { - files.push(path.clone()); + let mut exclude_patterns = Vec::with_capacity(config.exclude.len()); + for pat_str in &config.exclude { + match Pattern::new(pat_str) { + Ok(pat) => exclude_patterns.push(pat), + Err(e) => { + scan_errors.push(ScanError { + file: PathBuf::from(pat_str), + kind: ScanErrorKind::InvalidExcludePattern, + message: format!("Invalid exclude glob pattern '{pat_str}': {e}"), + }); } - } else if path.is_dir() { - for entry in WalkDir::new(path) - .follow_links(config.follow_links) - .into_iter() - .filter_entry(is_not_skip_dir) - .filter_map(Result::ok) - { - let file_path = entry.path(); + } + } - // Only process files - if !file_path.is_file() { - continue; - } + for root in &config.paths { + // Canonicalize the root once so we can enforce the boundary for every entry. 
+ let canonical_root = match root.canonicalize() { + Ok(r) => r, + Err(e) => { + scan_errors.push(ScanError { + file: root.clone(), + kind: ScanErrorKind::IoError, + message: format!("Failed to canonicalize root path: {e}"), + }); + continue; + } + }; - // Check file pattern - if !matches_file_pattern(file_path) { + if root.is_file() { + if matches_file_pattern(root) && !matches_exclude(root, &exclude_patterns) { + files.push(root.clone()); + } + continue; + } + + if !root.is_dir() { + continue; + } + + for entry_result in WalkDir::new(root) + .follow_links(config.follow_links) + .max_depth(config.max_depth) + .into_iter() + .filter_entry(is_not_skip_dir) + { + let entry = match entry_result { + Ok(e) => e, + Err(walk_err) => { + // Propagate walk errors (permission denied, loop, etc.) as ScanErrors. + let path = walk_err + .path() + .map_or_else(|| root.clone(), Path::to_path_buf); + scan_errors.push(ScanError { + file: path, + kind: ScanErrorKind::WalkError, + message: format!("Directory traversal error: {walk_err}"), + }); continue; } + }; + + let file_path = entry.path(); + + if !file_path.is_file() { + continue; + } - // Check exclude patterns - if matches_exclude(file_path, &exclude_patterns) { + // Enforce repository boundary: canonicalize and verify the resolved path + // stays within the root. This catches symlink escapes even when follow_links + // is true, and rejects any path that resolves outside the scan root. 
+ match file_path.canonicalize() { + Ok(canonical_path) => { + if !canonical_path.starts_with(&canonical_root) { + scan_errors.push(ScanError { + file: file_path.to_path_buf(), + kind: ScanErrorKind::OutsideRepository, + message: format!( + "Path resolves outside repository root: {} -> {}", + file_path.display(), + canonical_path.display() + ), + }); + continue; + } + } + Err(e) => { + scan_errors.push(ScanError { + file: file_path.to_path_buf(), + kind: ScanErrorKind::IoError, + message: format!("Failed to canonicalize path: {e}"), + }); continue; } + } - // Check against skip files (suffix match, not substring) - let rel_path = file_path.to_string_lossy(); - if SKIP_FILES.iter().any(|skip| rel_path.ends_with(skip)) { + // Skip devices, pipes, sockets — only regular files + #[cfg(unix)] + { + use std::os::unix::fs::FileTypeExt; + if let Ok(ft) = entry.metadata().map(|m| m.file_type()) + && (ft.is_block_device() + || ft.is_char_device() + || ft.is_fifo() + || ft.is_socket()) + { continue; } + } - files.push(file_path.to_path_buf()); + if !matches_file_pattern(file_path) { + continue; } + + if matches_exclude(file_path, &exclude_patterns) { + continue; + } + + let rel_path = file_path.to_string_lossy(); + if SKIP_FILES.iter().any(|skip| rel_path.ends_with(skip)) { + continue; + } + + files.push(file_path.to_path_buf()); } } files.sort(); files.dedup(); - files + (files, scan_errors) } /// Determine the content format from a file extension. @@ -119,21 +219,55 @@ pub fn content_format_for(path: &Path) -> Option { } } -/// Read a file into a `ValidationItem`, respecting `max_file_size`. +/// Read a file using a bounded streaming read, enforcing `max_file_size`. /// -/// Returns `None` if the file should be skipped (too large, read error, unsupported format). 
-pub fn read_validation_item(path: &Path, max_file_size: u64) -> Option { - // Check file size - if let Ok(metadata) = std::fs::metadata(path) - && metadata.len() > max_file_size - { - return None; - } +/// Uses `Read::take` to avoid TOCTOU races and prevent memory `DoS`: +/// the kernel size check and the actual read are the same operation. +/// Never calls `read_to_string` on an unbounded handle. +/// +/// Returns `ScanResult::Err` (never silently discards failures) if: +/// - The file exceeds `max_file_size` +/// - An I/O error occurs +/// - The content is not valid UTF-8 +pub fn read_file_bounded(path: &Path, max_file_size: u64) -> ScanResult { + let file = match std::fs::File::open(path) { + Ok(f) => f, + Err(e) => { + return ScanResult::Err(ScanError { + file: path.to_owned(), + kind: ScanErrorKind::IoError, + message: format!("Failed to open file: {e}"), + }); + } + }; - // Verify the file has a supported format before reading - content_format_for(path)?; + // Read at most max_file_size + 1 bytes to detect oversized files + let mut buffer = Vec::new(); + match file.take(max_file_size + 1).read_to_end(&mut buffer) { + Ok(_) => {} + Err(e) => { + return ScanResult::Err(ScanError { + file: path.to_owned(), + kind: ScanErrorKind::IoError, + message: format!("Failed to read file: {e}"), + }); + } + } - let content = std::fs::read_to_string(path).ok()?; + if buffer.len() as u64 > max_file_size { + return ScanResult::Err(ScanError { + file: path.to_owned(), + kind: ScanErrorKind::FileTooLarge, + message: format!("File exceeds maximum size of {max_file_size} bytes"), + }); + } - Some(ValidationItem { content }) + match String::from_utf8(buffer) { + Ok(content) => ScanResult::Ok(content), + Err(_) => ScanResult::Err(ScanError { + file: path.to_owned(), + kind: ScanErrorKind::InvalidEncoding, + message: "File is not valid UTF-8".to_owned(), + }), + } } diff --git a/gts-validator/src/strategy/mod.rs b/gts-validator/src/strategy/mod.rs index 8cd4447..677f646 100644 --- 
a/gts-validator/src/strategy/mod.rs +++ b/gts-validator/src/strategy/mod.rs @@ -7,13 +7,6 @@ pub mod fs; -/// A single item to validate. -#[derive(Debug, Clone)] -pub struct ValidationItem { - /// The textual content to scan. - pub content: String, -} - /// Content format for dispatching to the correct scanner. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ContentFormat { diff --git a/gts-validator/tests/validate_fs_tests.rs b/gts-validator/tests/validate_fs_tests.rs index 8ecb178..332193b 100644 --- a/gts-validator/tests/validate_fs_tests.rs +++ b/gts-validator/tests/validate_fs_tests.rs @@ -3,7 +3,7 @@ use std::fs; use std::path::PathBuf; -use gts_validator::{FsSourceConfig, ValidationConfig, validate_fs}; +use gts_validator::{FsSourceConfig, ValidationConfig, VendorPolicy, validate_fs}; use tempfile::TempDir; fn default_validation_config() -> ValidationConfig { @@ -27,7 +27,9 @@ fn test_validate_fs_empty_paths_errors() { #[test] fn test_validate_fs_nonexistent_path_errors() { - let fs_config = default_fs_config(vec![PathBuf::from("/nonexistent/path/abc123")]); + let tmp = TempDir::new().unwrap(); + let nonexistent = tmp.path().join("does_not_exist"); + let fs_config = default_fs_config(vec![nonexistent]); let result = validate_fs(&fs_config, &default_validation_config()); assert!(result.is_err()); let msg = result.unwrap_err().to_string(); @@ -43,47 +45,37 @@ fn test_validate_fs_valid_markdown() { let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let report = validate_fs(&fs_config, &default_validation_config()).unwrap(); - assert_eq!(report.files_scanned, 1); - assert!(report.ok, "expected ok, got errors: {:?}", report.errors); - assert_eq!(report.errors_count, 0); -} - -#[test] -fn test_validate_fs_invalid_markdown_vendor_mismatch() { - let tmp = TempDir::new().unwrap(); - let md = tmp.path().join("test.md"); - // Valid structure but wrong vendor — triggers vendor mismatch error - fs::write(&md, "# Title\n\nUses 
`gts.y.core.pkg.mytype.v1~` schema.\n").unwrap(); - - let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); - let mut config = ValidationConfig::default(); - config.vendor = Some("x".to_owned()); - let report = validate_fs(&fs_config, &config).unwrap(); - - assert_eq!(report.files_scanned, 1); - assert!(!report.ok); - assert!(report.errors_count > 0); + assert_eq!(report.scanned_files, 1); + assert!( + report.ok, + "expected ok, got errors: {:?}", + report.validation_errors + ); + assert_eq!(report.errors_count(), 0); } #[test] fn test_validate_fs_vendor_mismatch() { let tmp = TempDir::new().unwrap(); let md = tmp.path().join("test.md"); + // Valid structure but wrong vendor — triggers vendor mismatch error fs::write(&md, "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n").unwrap(); let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let mut config = ValidationConfig::default(); - config.vendor = Some("x".to_owned()); + config.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); let report = validate_fs(&fs_config, &config).unwrap(); + assert_eq!(report.scanned_files, 1); assert!(!report.ok); + assert!(report.errors_count() > 0); assert!( report - .errors + .validation_errors .iter() .any(|e| e.error.contains("Vendor mismatch")), "expected vendor mismatch error, got: {:?}", - report.errors + report.validation_errors ); } @@ -100,8 +92,12 @@ fn test_validate_fs_valid_json() { let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let report = validate_fs(&fs_config, &default_validation_config()).unwrap(); - assert_eq!(report.files_scanned, 1); - assert!(report.ok, "expected ok, got errors: {:?}", report.errors); + assert_eq!(report.scanned_files, 1); + assert!( + report.ok, + "expected ok, got errors: {:?}", + report.validation_errors + ); } #[test] @@ -117,8 +113,12 @@ fn test_validate_fs_valid_yaml() { let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let report = validate_fs(&fs_config, 
&default_validation_config()).unwrap(); - assert_eq!(report.files_scanned, 1); - assert!(report.ok, "expected ok, got errors: {:?}", report.errors); + assert_eq!(report.scanned_files, 1); + assert!( + report.ok, + "expected ok, got errors: {:?}", + report.validation_errors + ); } #[test] @@ -135,10 +135,11 @@ fn test_validate_fs_json_output_contract() { gts_validator::output::write_json(&report, &mut buf).unwrap(); let json: serde_json::Value = serde_json::from_slice(&buf).unwrap(); - assert!(json.get("files_scanned").is_some()); - assert!(json.get("errors_count").is_some()); + assert!(json.get("scanned_files").is_some()); + assert!(json.get("failed_files").is_some()); assert!(json.get("ok").is_some()); - assert!(json.get("errors").is_some()); + assert!(json.get("validation_errors").is_some()); + assert!(json.get("scan_errors").is_some()); assert!(json["ok"].as_bool().unwrap()); } @@ -164,25 +165,25 @@ fn test_validate_fs_exclude_pattern() { .unwrap(); let mut config = ValidationConfig::default(); - config.vendor = Some("x".to_owned()); + config.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); // Without exclude: should find vendor mismatch in excluded/test.md let mut fs_config_no_exclude = FsSourceConfig::default(); fs_config_no_exclude.paths = vec![tmp.path().to_path_buf()]; let report_no_exclude = validate_fs(&fs_config_no_exclude, &config).unwrap(); - assert_eq!(report_no_exclude.files_scanned, 2); + assert_eq!(report_no_exclude.scanned_files, 2); assert!( !report_no_exclude.ok, "should find vendor mismatch without exclude" ); - // With exclude: excluded dir should be skipped, only included.md scanned + // With exclude: file matching "test.md" should be skipped, only included.md scanned let mut fs_config_with_exclude = FsSourceConfig::default(); fs_config_with_exclude.paths = vec![tmp.path().to_path_buf()]; fs_config_with_exclude.exclude = vec!["test.md".to_owned()]; let report_with_exclude = validate_fs(&fs_config_with_exclude, &config).unwrap(); 
assert_eq!( - report_with_exclude.files_scanned, 1, + report_with_exclude.scanned_files, 1, "exclude should reduce file count" ); assert!( @@ -201,22 +202,22 @@ fn test_write_human_success_output() { let report = validate_fs(&fs_config, &default_validation_config()).unwrap(); let mut buf = Vec::new(); - gts_validator::output::write_human(&report, &mut buf, false).unwrap(); + gts_validator::output::write_human(&report, &mut buf).unwrap(); let output = String::from_utf8(buf).unwrap(); assert!( output.contains("GTS DOCUMENTATION VALIDATOR"), "missing header, got: {output}" ); - assert!(output.contains("Files scanned: 1"), "missing file count"); - assert!(output.contains("Errors found: 0"), "missing error count"); + assert!(output.contains("Files scanned: 1"), "missing file count"); + assert!(output.contains("Errors found: 0"), "missing error count"); assert!( output.contains("All 1 files passed"), "missing success message" ); assert!( - !output.contains("ERRORS"), - "should not contain ERRORS section" + !output.contains("VALIDATION ERRORS"), + "should not contain VALIDATION ERRORS section" ); } @@ -228,20 +229,23 @@ fn test_write_human_failure_output() { let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let mut config = ValidationConfig::default(); - config.vendor = Some("x".to_owned()); + config.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); let report = validate_fs(&fs_config, &config).unwrap(); let mut buf = Vec::new(); - gts_validator::output::write_human(&report, &mut buf, false).unwrap(); + gts_validator::output::write_human(&report, &mut buf).unwrap(); let output = String::from_utf8(buf).unwrap(); - assert!(output.contains("ERRORS"), "missing ERRORS section"); + assert!( + output.contains("VALIDATION ERRORS"), + "missing VALIDATION ERRORS section" + ); assert!( output.contains("Vendor mismatch"), "missing vendor mismatch hint" ); assert!( - output.contains("invalid GTS identifiers found"), + output.contains("invalid GTS identifier"), 
"missing failure summary" ); assert!( @@ -259,46 +263,86 @@ fn test_validate_fs_no_matching_files_returns_ok() { let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let report = validate_fs(&fs_config, &default_validation_config()).unwrap(); - assert_eq!(report.files_scanned, 0); + assert_eq!(report.scanned_files, 0); assert!(report.ok, "empty scan should be ok, not an error"); } #[test] -fn test_validate_fs_max_file_size_skips_large_files() { +fn test_validate_fs_max_file_size_produces_scan_error() { let tmp = TempDir::new().unwrap(); let md = tmp.path().join("big.md"); - // Write a file with a vendor-mismatch ID that would normally fail fs::write(&md, "# Title\n\nUses `gts.y.core.pkg.mytype.v1~` schema.\n").unwrap(); - // Set max_file_size to 10 bytes — file is larger, so it should be skipped + // Set max_file_size to 10 bytes — file is larger, so it should produce a scan error let mut fs_config = FsSourceConfig::default(); fs_config.paths = vec![tmp.path().to_path_buf()]; fs_config.max_file_size = 10; let mut config = ValidationConfig::default(); - config.vendor = Some("x".to_owned()); + config.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); let report = validate_fs(&fs_config, &config).unwrap(); assert_eq!( - report.files_scanned, 0, - "Oversized file should be skipped, not scanned" + report.scanned_files, 0, + "Oversized file should not be counted as scanned" + ); + assert_eq!( + report.failed_files, 1, + "Oversized file must produce a scan error" ); - assert!(report.ok, "No files scanned means ok"); + assert!(!report.ok, "Scan errors must make the report not-ok"); } #[test] -fn test_validate_fs_non_utf8_file_skipped() { +fn test_validate_fs_non_utf8_file_produces_scan_error() { let tmp = TempDir::new().unwrap(); let md = tmp.path().join("binary.md"); - // Write invalid UTF-8 bytes fs::write(&md, [0xFF, 0xFE, 0x00, 0x01, 0x80, 0x81]).unwrap(); let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); let report = 
validate_fs(&fs_config, &default_validation_config()).unwrap(); - // Non-UTF-8 file should be silently skipped (read_to_string fails) - assert_eq!(report.files_scanned, 0, "Non-UTF-8 file should be skipped"); - assert!(report.ok); + assert_eq!( + report.scanned_files, 0, + "Non-UTF-8 file should not be counted as scanned" + ); + assert_eq!( + report.failed_files, 1, + "Non-UTF-8 file must produce a scan error" + ); + assert!(!report.ok, "Scan errors must make the report not-ok"); +} + +#[test] +fn test_validate_fs_invalid_json_produces_scan_error() { + let tmp = TempDir::new().unwrap(); + let json_file = tmp.path().join("bad.json"); + fs::write(&json_file, "{ not valid json !!!").unwrap(); + + let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); + let report = validate_fs(&fs_config, &default_validation_config()).unwrap(); + + assert_eq!( + report.failed_files, 1, + "Invalid JSON must produce a scan error" + ); + assert!(!report.ok, "Scan errors must make the report not-ok"); +} + +#[test] +fn test_validate_fs_invalid_yaml_produces_scan_error() { + let tmp = TempDir::new().unwrap(); + let yaml_file = tmp.path().join("bad.yaml"); + fs::write(&yaml_file, "key: [unclosed bracket").unwrap(); + + let fs_config = default_fs_config(vec![tmp.path().to_path_buf()]); + let report = validate_fs(&fs_config, &default_validation_config()).unwrap(); + + assert_eq!( + report.failed_files, 1, + "Invalid YAML must produce a scan error" + ); + assert!(!report.ok, "Scan errors must make the report not-ok"); } #[test] @@ -316,7 +360,7 @@ fn test_validate_fs_skip_tokens_integration() { // Without skip_tokens: should report vendor mismatch let mut config_no_skip = ValidationConfig::default(); - config_no_skip.vendor = Some("x".to_owned()); + config_no_skip.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); let report_no_skip = validate_fs(&fs_config, &config_no_skip).unwrap(); assert!( !report_no_skip.ok, @@ -325,12 +369,12 @@ fn 
test_validate_fs_skip_tokens_integration() { // With skip_tokens: should suppress the error let mut config_skip = ValidationConfig::default(); - config_skip.vendor = Some("x".to_owned()); + config_skip.vendor_policy = VendorPolicy::MustMatch("x".to_owned()); config_skip.skip_tokens = vec!["**given**".to_owned()]; let report_skip = validate_fs(&fs_config, &config_skip).unwrap(); assert!( report_skip.ok, "With skip_tokens, the error should be suppressed: {:?}", - report_skip.errors + report_skip.validation_errors ); }