From 536421424e99867c2b70b562b76035094160eec7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 21:44:35 +0000 Subject: [PATCH 1/2] Initial plan From c5fd26a794f5f2893ed65c5ed5167bedad9180ef Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 21:48:28 +0000 Subject: [PATCH 2/2] fix: address code review feedback on path classification - Clarify Windows path separator validation in docs - Optimize registry path matching with zero-allocation case-insensitive search - Prevent duplicate Tag::FilePath entries - Add comment explaining 500ms performance test timeout Co-authored-by: unclesp1d3r <251112+unclesp1d3r@users.noreply.github.com> --- docs/src/classification.md | 2 +- src/classification/semantic.rs | 38 +++++++++++++++++++---------- tests/classification_integration.rs | 4 +++ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/docs/src/classification.md b/docs/src/classification.md index 044bfc5..fea0ba3 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -46,7 +46,7 @@ Raw String -> Pattern Matching -> Tag Assignment - **Windows Pattern**: `^[A-Za-z]:\\[^\0\n\r]*` - **UNC Pattern**: `^\\\\[a-zA-Z0-9.-]+\\[^\0\n\r]*` - **Examples**: `/usr/bin/malware`, `C:\\Windows\\System32\\evil.dll`, `\\\\server\\share\\file.txt` -- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects double path separators (`//` for POSIX, `\\` for Windows); applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter +- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects consecutive path separators in POSIX paths (`//`) and consecutive backslashes in Windows paths (for example, `folder\\\\file.txt`), while allowing UNC paths that start with `\\\\`; applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter - **Suspicious path examples**: `/etc/cron.d/`, `/etc/init.d/`, `/usr/local/bin/`, `/tmp/`, `/var/tmp/`; `C:\\Windows\\System32\\`, `C:\\Windows\\Temp\\`, `...\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\` - **Security relevance**: Medium-High - persistence and execution locations diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index f1c8e0c..812b87a 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -380,17 +380,12 @@ impl SemanticClassifier { let ip_tags = self.classify_ip_addresses(&string.text); tags.extend(ip_tags); - // Check for file paths (POSIX, Windows, UNC) - if let Some(tag) = self.classify_posix_path(&string.text) { - tags.push(tag); - } - - if let Some(tag) = self.classify_windows_path(&string.text) { - tags.push(tag); - } - - if let Some(tag) = self.classify_unc_path(&string.text) { - tags.push(tag); + // Check for file paths (POSIX, Windows, UNC) - only add FilePath tag once + if self.classify_posix_path(&string.text).is_some() + || self.classify_windows_path(&string.text).is_some() + || self.classify_unc_path(&string.text).is_some() + { + tags.push(Tag::FilePath); } // Check for registry paths @@ -741,10 +736,27 @@ impl SemanticClassifier { /// Checks if the registry path matches known persistence locations pub fn is_suspicious_registry_path(&self, text: &str) -> bool { - let text_lower = text.to_ascii_lowercase(); SUSPICIOUS_REGISTRY_PATHS .iter() - .any(|path| text_lower.contains(&path.to_ascii_lowercase())) + .any(|path| self.contains_ascii_case_insensitive(text, path)) + } + + /// Case-insensitive ASCII substring search without allocations + fn contains_ascii_case_insensitive(&self, haystack: &str, needle: &str) -> bool { + if needle.is_empty() { + return true; + } + + let haystack_bytes = haystack.as_bytes(); + let needle_bytes = needle.as_bytes(); + + if needle_bytes.len() > haystack_bytes.len() { + return false; + } + + haystack_bytes + .windows(needle_bytes.len()) + .any(|window| window.eq_ignore_ascii_case(needle_bytes)) } /// Detects printf-style placeholders to reduce false positives diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs index da78516..710f2ea 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -88,6 +88,10 @@ fn test_classification_performance() { } let elapsed = start.elapsed(); + // Timeout is set to 500ms to accommodate slower CI environments while still detecting + // performance regressions. This processes 1050 samples (350 iterations x 3 samples each). + // The timeout is higher than typical development performance (~50-100ms) to ensure + // CI stability across different runner configurations and load conditions. assert!(elapsed < Duration::from_millis(500)); }