diff --git a/docs/src/classification.md b/docs/src/classification.md index 044bfc5..fea0ba3 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -46,7 +46,7 @@ Raw String -> Pattern Matching -> Tag Assignment - **Windows Pattern**: `^[A-Za-z]:\\[^\0\n\r]*` - **UNC Pattern**: `^\\\\[a-zA-Z0-9.-]+\\[^\0\n\r]*` - **Examples**: `/usr/bin/malware`, `C:\\Windows\\System32\\evil.dll`, `\\\\server\\share\\file.txt` -- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects double path separators (`//` for POSIX, `\\` for Windows); applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter +- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects consecutive path separators in POSIX paths (`//`) and consecutive backslashes in Windows paths (for example, `folder\\\\file.txt`), while allowing UNC paths that start with `\\\\`; applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter - **Suspicious path examples**: `/etc/cron.d/`, `/etc/init.d/`, `/usr/local/bin/`, `/tmp/`, `/var/tmp/`; `C:\\Windows\\System32\\`, `C:\\Windows\\Temp\\`, `...\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\` - **Security relevance**: Medium-High - persistence and execution locations diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index f1c8e0c..812b87a 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -380,17 +380,12 @@ impl SemanticClassifier { let ip_tags = self.classify_ip_addresses(&string.text); tags.extend(ip_tags); - // Check for file paths (POSIX, Windows, UNC) - if let Some(tag) = self.classify_posix_path(&string.text) { - tags.push(tag); - } - - if let Some(tag) = self.classify_windows_path(&string.text) { - tags.push(tag); - } - - if let Some(tag) = self.classify_unc_path(&string.text) { - tags.push(tag); + // Check for file paths (POSIX, Windows, UNC) - only add FilePath tag once + if self.classify_posix_path(&string.text).is_some() + || self.classify_windows_path(&string.text).is_some() + || self.classify_unc_path(&string.text).is_some() + { + tags.push(Tag::FilePath); } // Check for registry paths @@ -741,10 +736,27 @@ impl SemanticClassifier { /// Checks if the registry path matches known persistence locations pub fn is_suspicious_registry_path(&self, text: &str) -> bool { - let text_lower = text.to_ascii_lowercase(); SUSPICIOUS_REGISTRY_PATHS .iter() - .any(|path| text_lower.contains(&path.to_ascii_lowercase())) + .any(|path| self.contains_ascii_case_insensitive(text, path)) + } + + /// Case-insensitive ASCII substring search without allocations + fn contains_ascii_case_insensitive(&self, haystack: &str, needle: &str) -> bool { + if needle.is_empty() { + return true; + } + + let haystack_bytes = haystack.as_bytes(); + let needle_bytes = needle.as_bytes(); + + if needle_bytes.len() > haystack_bytes.len() { + return false; + } + + haystack_bytes + .windows(needle_bytes.len()) + .any(|window| window.eq_ignore_ascii_case(needle_bytes)) } /// Detects printf-style placeholders to reduce false positives diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs index da78516..710f2ea 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -88,6 +88,10 @@ fn test_classification_performance() { } let elapsed = start.elapsed(); + // Timeout is set to 500ms to accommodate slower CI environments while still detecting + // performance regressions. This processes 1050 samples (350 iterations x 3 samples each). + // The timeout is higher than typical development performance (~50-100ms) to ensure + // CI stability across different runner configurations and load conditions. assert!(elapsed < Duration::from_millis(500)); }