From 0182e8b6a37da325cd5a5ac5ab1f6207e3b49bf8 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 15:19:37 -0500 Subject: [PATCH 01/19] feat(docs): add AI agent guidelines and character usage policy Signed-off-by: UncleSp1d3r --- .github/copilot-instructions.md | 198 +++++++++++++++++++++++++++++ AGENTS.md | 217 ++++++++++++++++++++++++++++++++ CLAUDE.md | 1 + 3 files changed, 416 insertions(+) create mode 100644 .github/copilot-instructions.md create mode 100644 AGENTS.md create mode 100644 CLAUDE.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..2d188f1 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,198 @@ +# Copilot Instructions for Stringy + +## Project Overview + +Stringy is a **smarter strings tool** for extracting meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. Unlike the standard `strings` command, Stringy is data-structure aware, section-aware, and semantically intelligent. + +## Architecture & Data Flow + +``` +Binary → Format Detection (goblin) → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output +``` + +### Module Organization + +- **`container/`** [COMPLETE]: Format detection (ELF/PE/Mach-O), section analysis, imports/exports via `goblin` +- **`extraction/`** [COMPLETE]: ASCII/UTF-8/UTF-16 string extraction, deduplication, PE resources +- **`classification/`** [PARTIAL]: Semantic tagging (URLs, IPs, domains, paths, GUIDs, etc.) +- **`output/`** [PLANNED]: JSON/human-readable/YARA-friendly formatting +- **`types/`** [COMPLETE]: Core data structures (`FoundString`, `ContainerInfo`, etc.), error handling + +## Critical Coding Standards + +### Zero Tolerance Policies + +- **No `unsafe` code**: `#![forbid(unsafe_code)]` enforced at package level +- **Zero warnings**: `cargo clippy -- -D warnings` must pass (`#![deny(warnings)]` enforced) +- **Rust 2024 Edition**: MSRV 1.91+, always use latest edition features +- **File size limit**: Keep files ≤500-600 lines; split larger files into focused modules +- **No blanket `#[allow]`**: Any `allow` attribute requires inline justification and cannot apply to entire files/modules +- **Character restrictions**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) + +### Error Handling with `thiserror` + +Use structured errors with detailed context (see `src/types.rs`): + +```rust +#[derive(Debug, Error)] +pub enum StringyError { + #[error("Binary parsing error: {0}")] + ParseError(String), + + #[error("Invalid encoding at offset {offset}")] + EncodingError { offset: u64 }, +} +``` + +Convert external errors with `From` implementations. Provide offsets, section names, and file paths in error messages. + +## Key Implementation Patterns + +### Section Weight System + +Container parsers assign weights (1.0-10.0) to sections based on string likelihood: + +```rust +// ELF example from container/elf.rs +".rodata" | ".rodata.str1.*" => 10.0 // Highest priority +".comment" | ".note.*" => 9.0 // Build info +".data.rel.ro" => 7.0 // Read-only data +".data" => 5.0 // Writable data (lower priority) +``` + +**Pattern**: Use match expressions with fallthrough to assign weights; higher = more likely to contain meaningful strings. + +### String Deduplication (`extraction/dedup.rs`) + +Strings are grouped by `(text, encoding)` tuple in a `HashMap<(String, Encoding), Vec>`: + +- **Preserve all occurrences**: Each occurrence captures offset, RVA, section, source, tags, score, confidence +- **Tag merging**: Union all tags via `HashSet`, then sort +- **Combined scoring formula**: + ``` + base_score = max(occurrence.original_score) + occurrence_bonus = 5 * (count - 1) + cross_section_bonus = 10 (if >1 unique section) + multi_source_bonus = 15 (if >1 unique StringSource) + confidence_boost = (max_confidence * 10.0) as i32 + ``` + +### Non-Exhaustive Structs + +Use `#[non_exhaustive]` for public API structs like `ContainerInfo` and provide explicit constructors (see `types.rs`): + +```rust +#[non_exhaustive] +pub struct ContainerInfo { /* fields */ } + +impl ContainerInfo { + pub fn new(format: BinaryFormat, sections: Vec, ...) -> Self { ... } +} +``` + +## Testing Standards + +- **Snapshot testing**: Use `insta` for output verification (`tests/integration_*.rs`) +- **Fixtures**: Binary test fixtures in `tests/fixtures/` (see `fixtures/README.md`) +- **Integration tests**: Named `test_*.rs` or `integration_*.rs` in `tests/` +- **Run tests**: `just test` (uses `cargo nextest`) + +Example pattern from `tests/integration_elf.rs`: +```rust +fn get_fixture_path(name: &str) -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures").join(name) +} + +#[test] +fn test_elf_import_export_extraction() { + let data = fs::read(&get_fixture_path("test_binary_elf")).expect("..."); + let parser = ElfParser::new(); + let info = parser.parse(&data).expect("..."); + // Verify imports/exports with specific assertions +} +``` + +## Development Workflow + +### Common Commands (`justfile`) + +**Setup**: `just setup` (installs rustfmt, clippy, llvm-tools-preview, mdformat) + +**Development**: +- `just build` - Debug build +- `just test` - Run tests with nextest +- `just lint` - Full lint suite (rustfmt, clippy, actionlint, cspell, markdown) +- `just check` - Pre-commit checks + lint +- `just run ` - Run binary against test file + +**Code Quality**: +- `just fmt` - Format Rust/markdown/YAML/JSON +- `just fix` - Auto-fix clippy warnings with `--fix` +- `just coverage` - Generate LCOV coverage report + +**CI Parity**: `just ci-check` (runs full CI suite locally) + +### Windows vs Unix + +The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform compatibility. PowerShell on Windows, bash on Unix. + +## Dependencies & Crates + +**Core parsing**: `goblin` (ELF/PE/Mach-O), `pelite` (PE resources) +**CLI**: `clap` with derive macros +**Error handling**: `thiserror` +**Serialization**: `serde`, `serde_json` +**Regex**: `regex` for classification +**Testing**: `insta` (snapshots), `criterion` (benchmarks), `tempfile` + +## Import Conventions + +- Re-export commonly used types in `lib.rs` for ergonomic imports +- Import from `stringy::extraction` or `stringy::types`, not deeply nested paths +- Within `extraction/mod.rs`, do NOT import locally-defined types; downstream code imports from `stringy::extraction` + +## What NOT to Do + +- Don't use `async` (this is a synchronous CLI tool) +- Don't add `unsafe` blocks (forbidden) +- Don't ignore clippy warnings (they're errors) +- Don't create files >600 lines without splitting +- Don't use blanket `#[allow]` on modules/files +- Don't guess at section weights (refer to existing parsers in `container/`) + +## Current Implementation Status + +**Complete**: +- ELF/PE/Mach-O format detection and parsing +- ASCII, UTF-8, UTF-16LE/BE string extraction +- PE resource string extraction (VERSIONINFO, STRINGTABLE, MANIFEST) +- String deduplication with occurrence tracking +- IPv4/IPv6, URL, domain classification + +**In Progress**: +- Full semantic classification suite (GUIDs, paths, format strings, Base64) +- Ranking/scoring algorithm implementation +- CLI interface (`main.rs` is placeholder) +- Output formatters (JSON, YARA-friendly, human-readable) + +## Quick Reference Examples + +**Adding a new section weight** (in `container/elf.rs`, `pe.rs`, or `macho.rs`): +```rust +let weight = match section_name { + ".mydata" => 8.0, // New section type + _ => existing_match_arms +}; +``` + +**Extracting strings from a section**: +```rust +use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; +let config = AsciiExtractionConfig { min_length: 4, max_length: 1024 }; +let strings = extract_ascii_strings(§ion_data, &config); +``` + +**Adding a semantic tag**: +1. Add variant to `Tag` enum in `types.rs` +2. Implement pattern matching in `classification/semantic.rs` +3. Update deduplication tag merging if needed diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..4bfd711 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,217 @@ +# AI Agent Guidelines for Stringy + +## Character Usage Policy + +**CRITICAL**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. + +- Use standard ASCII hyphens (`-`) instead of em-dashes (—) or en-dashes (–) +- Use standard ASCII quotes (`"` or `'`) instead of smart quotes (" " ' ') +- Use standard ASCII apostrophes (`'`) instead of curly apostrophes (') +- Avoid Unicode symbols and emojis entirely +- Use text descriptions instead of symbolic indicators + +## Project Overview + +Stringy is a **smarter strings tool** for extracting meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. Unlike the standard `strings` command, Stringy is data-structure aware, section-aware, and semantically intelligent. + +## Architecture & Data Flow + +```text +Binary → Format Detection (goblin) → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output +``` + +### Module Organization + +- **`container/`** \[COMPLETE\]: Format detection (ELF/PE/Mach-O), section analysis, imports/exports via `goblin` +- **`extraction/`** \[COMPLETE\]: ASCII/UTF-8/UTF-16 string extraction, deduplication, PE resources +- **`classification/`** \[PARTIAL\]: Semantic tagging (URLs, IPs, domains, paths, GUIDs, etc.) +- **`output/`** \[PLANNED\]: JSON/human-readable/YARA-friendly formatting +- **`types/`** \[COMPLETE\]: Core data structures (`FoundString`, `ContainerInfo`, etc.), error handling + +## Critical Coding Standards + +### Zero Tolerance Policies + +- **No `unsafe` code**: `#![forbid(unsafe_code)]` enforced at package level +- **Zero warnings**: `cargo clippy -- -D warnings` must pass (`#![deny(warnings)]` enforced) +- **Rust 2024 Edition**: MSRV 1.91+, always use latest edition features +- **File size limit**: Keep files ≤500-600 lines; split larger files into focused modules +- **No blanket `#[allow]`**: Any `allow` attribute requires inline justification and cannot apply to entire files/modules +- **Character restrictions**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) + +### Error Handling with `thiserror` + +Use structured errors with detailed context (see `src/types.rs`): + +```rust +#[derive(Debug, Error)] +pub enum StringyError { + #[error("Binary parsing error: {0}")] + ParseError(String), + + #[error("Invalid encoding at offset {offset}")] + EncodingError { offset: u64 }, +} +``` + +Convert external errors with `From` implementations. Provide offsets, section names, and file paths in error messages. + +## Key Implementation Patterns + +### Section Weight System + +Container parsers assign weights (1.0-10.0) to sections based on string likelihood: + +```rust +// ELF example from container/elf.rs +".rodata" | ".rodata.str1.*" => 10.0 // Highest priority +".comment" | ".note.*" => 9.0 // Build info +".data.rel.ro" => 7.0 // Read-only data +".data" => 5.0 // Writable data (lower priority) +``` + +**Pattern**: Use match expressions with fallthrough to assign weights; higher = more likely to contain meaningful strings. + +### String Deduplication (`extraction/dedup.rs`) + +Strings are grouped by `(text, encoding)` tuple in a `HashMap<(String, Encoding), Vec>`: + +- **Preserve all occurrences**: Each occurrence captures offset, RVA, section, source, tags, score, confidence +- **Tag merging**: Union all tags via `HashSet`, then sort +- **Combined scoring formula**: + + ```text + base_score = max(occurrence.original_score) + occurrence_bonus = 5 * (count - 1) + cross_section_bonus = 10 (if >1 unique section) + multi_source_bonus = 15 (if >1 unique StringSource) + confidence_boost = (max_confidence * 10.0) as i32 + ``` + +### Non-Exhaustive Structs + +Use `#[non_exhaustive]` for public API structs like `ContainerInfo` and provide explicit constructors (see `types.rs`): + +```rust +#[non_exhaustive] +pub struct ContainerInfo { /* fields */ } + +impl ContainerInfo { + pub fn new(format: BinaryFormat, sections: Vec, ...) -> Self { ... } +} +``` + +## Testing Standards + +- **Snapshot testing**: Use `insta` for output verification (`tests/integration_*.rs`) +- **Fixtures**: Binary test fixtures in `tests/fixtures/` (see `fixtures/README.md`) +- **Integration tests**: Named `test_*.rs` or `integration_*.rs` in `tests/` +- **Run tests**: `just test` (uses `cargo nextest`) + +Example pattern from `tests/integration_elf.rs`: + +```rust +fn get_fixture_path(name: &str) -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures").join(name) +} + +#[test] +fn test_elf_import_export_extraction() { + let data = fs::read(&get_fixture_path("test_binary_elf")).expect("..."); + let parser = ElfParser::new(); + let info = parser.parse(&data).expect("..."); + // Verify imports/exports with specific assertions +} +``` + +## Development Workflow + +### Common Commands (`justfile`) + +**Setup**: `just setup` (installs rustfmt, clippy, llvm-tools-preview, mdformat) + +**Development**: + +- `just build` - Debug build +- `just test` - Run tests with nextest +- `just lint` - Full lint suite (rustfmt, clippy, actionlint, cspell, markdown) +- `just check` - Pre-commit checks + lint +- `just run ` - Run binary against test file + +**Code Quality**: + +- `just fmt` - Format Rust/markdown/YAML/JSON +- `just fix` - Auto-fix clippy warnings with `--fix` +- `just coverage` - Generate LCOV coverage report + +**CI Parity**: `just ci-check` (runs full CI suite locally) + +### Windows vs Unix + +The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform compatibility. PowerShell on Windows, bash on Unix. + +## Dependencies & Crates + +**Core parsing**: `goblin` (ELF/PE/Mach-O), `pelite` (PE resources)\ +**CLI**: `clap` with derive macros\ +**Error handling**: `thiserror`\ +**Serialization**: `serde`, `serde_json`\ +**Regex**: `regex` for classification\ +**Testing**: `insta` (snapshots), `criterion` (benchmarks), `tempfile` + +## Import Conventions + +- Re-export commonly used types in `lib.rs` for ergonomic imports +- Import from `stringy::extraction` or `stringy::types`, not deeply nested paths +- Within `extraction/mod.rs`, do NOT import locally-defined types; downstream code imports from `stringy::extraction` + +## What NOT to Do + +- Don't use `async` (this is a synchronous CLI tool) +- Don't add `unsafe` blocks (forbidden) +- Don't ignore clippy warnings (they're errors) +- Don't create files >600 lines without splitting +- Don't use blanket `#[allow]` on modules/files +- Don't guess at section weights (refer to existing parsers in `container/`) + +## Current Implementation Status + +**Complete**: + +- ELF/PE/Mach-O format detection and parsing +- ASCII, UTF-8, UTF-16LE/BE string extraction +- PE resource string extraction (VERSIONINFO, STRINGTABLE, MANIFEST) +- String deduplication with occurrence tracking +- IPv4/IPv6, URL, domain classification + +**In Progress**: + +- Full semantic classification suite (GUIDs, paths, format strings, Base64) +- Ranking/scoring algorithm implementation +- CLI interface (`main.rs` is placeholder) +- Output formatters (JSON, YARA-friendly, human-readable) + +## Quick Reference Examples + +**Adding a new section weight** (in `container/elf.rs`, `pe.rs`, or `macho.rs`): + +```rust +let weight = match section_name { + ".mydata" => 8.0, // New section type + _ => existing_match_arms +}; +``` + +**Extracting strings from a section**: + +```rust +use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; +let config = AsciiExtractionConfig { min_length: 4, max_length: 1024 }; +let strings = extract_ascii_strings(§ion_data, &config); +``` + +**Adding a semantic tag**: + +1. Add variant to `Tag` enum in `types.rs` +2. Implement pattern matching in `classification/semantic.rs` +3. Update deduplication tag merging if needed diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..43c994c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md From ff7130cfde0578ee6c9a84cf7859c73cfa2007f2 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 15:37:56 -0500 Subject: [PATCH 02/19] chore(docs): revise AI agent guidelines for clarity and rules Signed-off-by: UncleSp1d3r --- AGENTS.md | 232 ++++++++++-------------------------------------------- 1 file changed, 42 insertions(+), 190 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 4bfd711..9eaee6e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,217 +1,69 @@ # AI Agent Guidelines for Stringy -## Character Usage Policy +## Critical Rules -**CRITICAL**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. +**These rules are non-negotiable. Violations will cause CI failures.** -- Use standard ASCII hyphens (`-`) instead of em-dashes (—) or en-dashes (–) -- Use standard ASCII quotes (`"` or `'`) instead of smart quotes (" " ' ') -- Use standard ASCII apostrophes (`'`) instead of curly apostrophes (') -- Avoid Unicode symbols and emojis entirely -- Use text descriptions instead of symbolic indicators +1. **No `unsafe` code** - `#![forbid(unsafe_code)]` enforced +2. **Zero warnings** - `cargo clippy -- -D warnings` must pass +3. **ASCII only** - No emojis, em-dashes, smart quotes, or Unicode punctuation +4. **File size limit** - Keep files under 500 lines; split larger files +5. **No blanket `#[allow]`** - Any `allow` requires inline justification -## Project Overview +## Project Summary -Stringy is a **smarter strings tool** for extracting meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. Unlike the standard `strings` command, Stringy is data-structure aware, section-aware, and semantically intelligent. +Stringy extracts meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. Unlike standard `strings`, it is section-aware and semantically intelligent. -## Architecture & Data Flow +**Data flow**: Binary -> Format Detection -> Container Parsing -> String Extraction -> Deduplication -> Classification -> Ranking -> Output -```text -Binary → Format Detection (goblin) → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output -``` - -### Module Organization - -- **`container/`** \[COMPLETE\]: Format detection (ELF/PE/Mach-O), section analysis, imports/exports via `goblin` -- **`extraction/`** \[COMPLETE\]: ASCII/UTF-8/UTF-16 string extraction, deduplication, PE resources -- **`classification/`** \[PARTIAL\]: Semantic tagging (URLs, IPs, domains, paths, GUIDs, etc.) -- **`output/`** \[PLANNED\]: JSON/human-readable/YARA-friendly formatting -- **`types/`** \[COMPLETE\]: Core data structures (`FoundString`, `ContainerInfo`, etc.), error handling - -## Critical Coding Standards - -### Zero Tolerance Policies - -- **No `unsafe` code**: `#![forbid(unsafe_code)]` enforced at package level -- **Zero warnings**: `cargo clippy -- -D warnings` must pass (`#![deny(warnings)]` enforced) -- **Rust 2024 Edition**: MSRV 1.91+, always use latest edition features -- **File size limit**: Keep files ≤500-600 lines; split larger files into focused modules -- **No blanket `#[allow]`**: Any `allow` attribute requires inline justification and cannot apply to entire files/modules -- **Character restrictions**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) - -### Error Handling with `thiserror` - -Use structured errors with detailed context (see `src/types.rs`): - -```rust -#[derive(Debug, Error)] -pub enum StringyError { - #[error("Binary parsing error: {0}")] - ParseError(String), - - #[error("Invalid encoding at offset {offset}")] - EncodingError { offset: u64 }, -} -``` - -Convert external errors with `From` implementations. Provide offsets, section names, and file paths in error messages. +## Module Structure -## Key Implementation Patterns +| Module | Purpose | +|--------|---------| +| `container/` | Format detection, section analysis, imports/exports via `goblin` | +| `extraction/` | ASCII/UTF-8/UTF-16 extraction, deduplication, PE resources | +| `classification/` | Semantic tagging (URLs, IPs, domains, paths, GUIDs) | +| `output/` | Formatters (JSON, human-readable, YARA-friendly) | +| `types/` | Core data structures, error handling with `thiserror` | -### Section Weight System +## Key Patterns -Container parsers assign weights (1.0-10.0) to sections based on string likelihood: +### Section Weights -```rust -// ELF example from container/elf.rs -".rodata" | ".rodata.str1.*" => 10.0 // Highest priority -".comment" | ".note.*" => 9.0 // Build info -".data.rel.ro" => 7.0 // Read-only data -".data" => 5.0 // Writable data (lower priority) -``` - -**Pattern**: Use match expressions with fallthrough to assign weights; higher = more likely to contain meaningful strings. - -### String Deduplication (`extraction/dedup.rs`) +Container parsers assign weights (1.0-10.0) based on string likelihood. Higher = more valuable. See existing parsers in `container/*.rs` for reference values. -Strings are grouped by `(text, encoding)` tuple in a `HashMap<(String, Encoding), Vec>`: +### Error Handling -- **Preserve all occurrences**: Each occurrence captures offset, RVA, section, source, tags, score, confidence -- **Tag merging**: Union all tags via `HashSet`, then sort -- **Combined scoring formula**: +Use `thiserror` with detailed context. Include offsets, section names, and file paths in error messages. Convert external errors with `From` implementations. - ```text - base_score = max(occurrence.original_score) - occurrence_bonus = 5 * (count - 1) - cross_section_bonus = 10 (if >1 unique section) - multi_source_bonus = 15 (if >1 unique StringSource) - confidence_boost = (max_confidence * 10.0) as i32 - ``` +### Public API Structs -### Non-Exhaustive Structs +Use `#[non_exhaustive]` for public structs and provide explicit constructors. -Use `#[non_exhaustive]` for public API structs like `ContainerInfo` and provide explicit constructors (see `types.rs`): +## Development Commands -```rust -#[non_exhaustive] -pub struct ContainerInfo { /* fields */ } - -impl ContainerInfo { - pub fn new(format: BinaryFormat, sections: Vec, ...) -> Self { ... } -} +```bash +just check # Pre-commit: fmt + lint + test +just test # Run tests with nextest +just lint # Full lint suite +just fix # Auto-fix clippy warnings +just ci-check # Full CI suite locally ``` -## Testing Standards - -- **Snapshot testing**: Use `insta` for output verification (`tests/integration_*.rs`) -- **Fixtures**: Binary test fixtures in `tests/fixtures/` (see `fixtures/README.md`) -- **Integration tests**: Named `test_*.rs` or `integration_*.rs` in `tests/` -- **Run tests**: `just test` (uses `cargo nextest`) - -Example pattern from `tests/integration_elf.rs`: - -```rust -fn get_fixture_path(name: &str) -> PathBuf { - Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures").join(name) -} - -#[test] -fn test_elf_import_export_extraction() { - let data = fs::read(&get_fixture_path("test_binary_elf")).expect("..."); - let parser = ElfParser::new(); - let info = parser.parse(&data).expect("..."); - // Verify imports/exports with specific assertions -} -``` - -## Development Workflow - -### Common Commands (`justfile`) +## Testing -**Setup**: `just setup` (installs rustfmt, clippy, llvm-tools-preview, mdformat) +- Use `insta` for snapshot testing +- Binary fixtures in `tests/fixtures/` +- Integration tests named `integration_*.rs` -**Development**: +## Imports -- `just build` - Debug build -- `just test` - Run tests with nextest -- `just lint` - Full lint suite (rustfmt, clippy, actionlint, cspell, markdown) -- `just check` - Pre-commit checks + lint -- `just run ` - Run binary against test file +Import from `stringy::extraction` or `stringy::types`, not deeply nested paths. Re-exports are in `lib.rs`. -**Code Quality**: +## Adding Features -- `just fmt` - Format Rust/markdown/YAML/JSON -- `just fix` - Auto-fix clippy warnings with `--fix` -- `just coverage` - Generate LCOV coverage report - -**CI Parity**: `just ci-check` (runs full CI suite locally) - -### Windows vs Unix - -The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform compatibility. PowerShell on Windows, bash on Unix. - -## Dependencies & Crates - -**Core parsing**: `goblin` (ELF/PE/Mach-O), `pelite` (PE resources)\ -**CLI**: `clap` with derive macros\ -**Error handling**: `thiserror`\ -**Serialization**: `serde`, `serde_json`\ -**Regex**: `regex` for classification\ -**Testing**: `insta` (snapshots), `criterion` (benchmarks), `tempfile` - -## Import Conventions - -- Re-export commonly used types in `lib.rs` for ergonomic imports -- Import from `stringy::extraction` or `stringy::types`, not deeply nested paths -- Within `extraction/mod.rs`, do NOT import locally-defined types; downstream code imports from `stringy::extraction` - -## What NOT to Do - -- Don't use `async` (this is a synchronous CLI tool) -- Don't add `unsafe` blocks (forbidden) -- Don't ignore clippy warnings (they're errors) -- Don't create files >600 lines without splitting -- Don't use blanket `#[allow]` on modules/files -- Don't guess at section weights (refer to existing parsers in `container/`) - -## Current Implementation Status - -**Complete**: - -- ELF/PE/Mach-O format detection and parsing -- ASCII, UTF-8, UTF-16LE/BE string extraction -- PE resource string extraction (VERSIONINFO, STRINGTABLE, MANIFEST) -- String deduplication with occurrence tracking -- IPv4/IPv6, URL, domain classification - -**In Progress**: - -- Full semantic classification suite (GUIDs, paths, format strings, Base64) -- Ranking/scoring algorithm implementation -- CLI interface (`main.rs` is placeholder) -- Output formatters (JSON, YARA-friendly, human-readable) - -## Quick Reference Examples - -**Adding a new section weight** (in `container/elf.rs`, `pe.rs`, or `macho.rs`): - -```rust -let weight = match section_name { - ".mydata" => 8.0, // New section type - _ => existing_match_arms -}; -``` - -**Extracting strings from a section**: - -```rust -use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; -let config = AsciiExtractionConfig { min_length: 4, max_length: 1024 }; -let strings = extract_ascii_strings(§ion_data, &config); -``` +**New semantic tag**: Add variant to `Tag` enum in `types.rs`, implement pattern in `classification/semantic.rs` -**Adding a semantic tag**: +**New section weight**: Add match arm in the relevant `container/*.rs` parser -1. Add variant to `Tag` enum in `types.rs` -2. Implement pattern matching in `classification/semantic.rs` -3. Update deduplication tag merging if needed +**New string extractor**: Follow patterns in `extraction/` module From e4c82cae7d57686bfb34c2cf72352e80efeec7f0 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 16:17:06 -0500 Subject: [PATCH 03/19] chore(docs): update module structure formatting in documentation Signed-off-by: UncleSp1d3r --- .github/copilot-instructions.md | 44 ++++++++++++++++++++------------- AGENTS.md | 14 +++++------ 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2d188f1..6f05c41 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -12,11 +12,11 @@ Binary → Format Detection (goblin) → Container Parsing → String Extraction ### Module Organization -- **`container/`** [COMPLETE]: Format detection (ELF/PE/Mach-O), section analysis, imports/exports via `goblin` -- **`extraction/`** [COMPLETE]: ASCII/UTF-8/UTF-16 string extraction, deduplication, PE resources -- **`classification/`** [PARTIAL]: Semantic tagging (URLs, IPs, domains, paths, GUIDs, etc.) -- **`output/`** [PLANNED]: JSON/human-readable/YARA-friendly formatting -- **`types/`** [COMPLETE]: Core data structures (`FoundString`, `ContainerInfo`, etc.), error handling +- **`container/`** \[COMPLETE\]: Format detection (ELF/PE/Mach-O), section analysis, imports/exports via `goblin` +- **`extraction/`** \[COMPLETE\]: ASCII/UTF-8/UTF-16 string extraction, deduplication, PE resources +- **`classification/`** \[PARTIAL\]: Semantic tagging (URLs, IPs, domains, paths, GUIDs, etc.) +- **`output/`** \[PLANNED\]: JSON/human-readable/YARA-friendly formatting +- **`types/`** \[COMPLETE\]: Core data structures (`FoundString`, `ContainerInfo`, etc.), error handling ## Critical Coding Standards @@ -38,7 +38,7 @@ Use structured errors with detailed context (see `src/types.rs`): pub enum StringyError { #[error("Binary parsing error: {0}")] ParseError(String), - + #[error("Invalid encoding at offset {offset}")] EncodingError { offset: u64 }, } @@ -98,9 +98,12 @@ impl ContainerInfo { - **Run tests**: `just test` (uses `cargo nextest`) Example pattern from `tests/integration_elf.rs`: + ```rust fn get_fixture_path(name: &str) -> PathBuf { - Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures").join(name) + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures") + .join(name) } #[test] @@ -119,6 +122,7 @@ fn test_elf_import_export_extraction() { **Setup**: `just setup` (installs rustfmt, clippy, llvm-tools-preview, mdformat) **Development**: + - `just build` - Debug build - `just test` - Run tests with nextest - `just lint` - Full lint suite (rustfmt, clippy, actionlint, cspell, markdown) @@ -126,6 +130,7 @@ fn test_elf_import_export_extraction() { - `just run ` - Run binary against test file **Code Quality**: + - `just fmt` - Format Rust/markdown/YAML/JSON - `just fix` - Auto-fix clippy warnings with `--fix` - `just coverage` - Generate LCOV coverage report @@ -138,11 +143,11 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com ## Dependencies & Crates -**Core parsing**: `goblin` (ELF/PE/Mach-O), `pelite` (PE resources) -**CLI**: `clap` with derive macros -**Error handling**: `thiserror` -**Serialization**: `serde`, `serde_json` -**Regex**: `regex` for classification +**Core parsing**: `goblin` (ELF/PE/Mach-O), `pelite` (PE resources)\ +**CLI**: `clap` with derive macros\ +**Error handling**: `thiserror`\ +**Serialization**: `serde`, `serde_json`\ +**Regex**: `regex` for classification\ **Testing**: `insta` (snapshots), `criterion` (benchmarks), `tempfile` ## Import Conventions @@ -153,16 +158,17 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com ## What NOT to Do -- Don't use `async` (this is a synchronous CLI tool) -- Don't add `unsafe` blocks (forbidden) -- Don't ignore clippy warnings (they're errors) -- Don't create files >600 lines without splitting -- Don't use blanket `#[allow]` on modules/files +- Don't use `async` (this is a synchronous CLI tool) +- Don't add `unsafe` blocks (forbidden) +- Don't ignore clippy warnings (they're errors) +- Don't create files >600 lines without splitting +- Don't use blanket `#[allow]` on modules/files - Don't guess at section weights (refer to existing parsers in `container/`) ## Current Implementation Status **Complete**: + - ELF/PE/Mach-O format detection and parsing - ASCII, UTF-8, UTF-16LE/BE string extraction - PE resource string extraction (VERSIONINFO, STRINGTABLE, MANIFEST) @@ -170,6 +176,7 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com - IPv4/IPv6, URL, domain classification **In Progress**: + - Full semantic classification suite (GUIDs, paths, format strings, Base64) - Ranking/scoring algorithm implementation - CLI interface (`main.rs` is placeholder) @@ -178,6 +185,7 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com ## Quick Reference Examples **Adding a new section weight** (in `container/elf.rs`, `pe.rs`, or `macho.rs`): + ```rust let weight = match section_name { ".mydata" => 8.0, // New section type @@ -186,6 +194,7 @@ let weight = match section_name { ``` **Extracting strings from a section**: + ```rust use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; let config = AsciiExtractionConfig { min_length: 4, max_length: 1024 }; @@ -193,6 +202,7 @@ let strings = extract_ascii_strings(§ion_data, &config); ``` **Adding a semantic tag**: + 1. Add variant to `Tag` enum in `types.rs` 2. Implement pattern matching in `classification/semantic.rs` 3. Update deduplication tag merging if needed diff --git a/AGENTS.md b/AGENTS.md index 9eaee6e..b841448 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,13 +18,13 @@ Stringy extracts meaningful strings from ELF, PE, and Mach-O binaries using form ## Module Structure -| Module | Purpose | -|--------|---------| -| `container/` | Format detection, section analysis, imports/exports via `goblin` | -| `extraction/` | ASCII/UTF-8/UTF-16 extraction, deduplication, PE resources | -| `classification/` | Semantic tagging (URLs, IPs, domains, paths, GUIDs) | -| `output/` | Formatters (JSON, human-readable, YARA-friendly) | -| `types/` | Core data structures, error handling with `thiserror` | +| Module | Purpose | +| ----------------- | ---------------------------------------------------------------- | +| `container/` | Format detection, section analysis, imports/exports via `goblin` | +| `extraction/` | ASCII/UTF-8/UTF-16 extraction, deduplication, PE resources | +| `classification/` | Semantic tagging (URLs, IPs, domains, paths, GUIDs) | +| `output/` | Formatters (JSON, human-readable, YARA-friendly) | +| `types/` | Core data structures, error handling with `thiserror` | ## Key Patterns From dd404fee8fb8eb114d4cc4e30378d43cdeb958aa Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 16:17:22 -0500 Subject: [PATCH 04/19] feat(classification): implement file path classification for POSIX and Windows - Added POSIX and Windows file path pattern matching - Included registry path detection - Comprehensive unit and integration tests added Signed-off-by: UncleSp1d3r --- .kiro/specs/stringy-binary-analyzer/tasks.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.kiro/specs/stringy-binary-analyzer/tasks.md b/.kiro/specs/stringy-binary-analyzer/tasks.md index ffe6dd4..da38478 100644 --- a/.kiro/specs/stringy-binary-analyzer/tasks.md +++ b/.kiro/specs/stringy-binary-analyzer/tasks.md @@ -170,12 +170,19 @@ - Include unit tests for IP address detection - _Requirements: 3.3_ - - [ ] 9.3 Implement file path classification + - [x] 9.3 Implement file path classification - Add POSIX file path pattern matching - Add Windows file path pattern matching - Include registry path detection - Add unit tests for path classification + - Completed: + - POSIX file path pattern matching implemented + - Windows file path pattern matching implemented + - UNC path detection implemented + - Registry path detection implemented + - Comprehensive unit tests added + - Integration tests added - _Requirements: 3.4, 3.5_ - [ ] 9.4 Implement remaining semantic patterns From ab03844575ff3bc26af832c54730cdb360278e92 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 16:17:32 -0500 Subject: [PATCH 05/19] feat(classification): enhance path and registry detection - Implement POSIX, Windows, and UNC path classification - Add support for Windows registry path detection - Update documentation and tests for new capabilities Signed-off-by: UncleSp1d3r --- docs/src/classification.md | 258 +++---- src/classification/mod.rs | 10 +- src/classification/semantic.rs | 655 ++++++++++++++++++ tests/classification_integration.rs | 151 ++++ ...integration__classification_snapshots.snap | 43 ++ 5 files changed, 936 insertions(+), 181 deletions(-) create mode 100644 tests/classification_integration.rs create mode 100644 tests/snapshots/classification_integration__classification_snapshots.snap diff --git a/docs/src/classification.md b/docs/src/classification.md index 8507b17..24657da 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -5,7 +5,7 @@ Stringy's classification system applies semantic analysis to extracted strings, ## Classification Pipeline ```text -Raw String → Pattern Matching → Context Analysis → Tag Assignment → Confidence Scoring +Raw String -> Pattern Matching -> Tag Assignment ``` ## Semantic Categories @@ -14,14 +14,14 @@ Raw String → Pattern Matching → Context Analysis → Tag Assignment → Conf #### URLs -- **Pattern**: `https?://[^\s]+` +- **Pattern**: `` https?://[^\s<>"{}|\\^\[\]\`]+ `` - **Examples**: `https://api.example.com/v1/users`, `http://malware.com/payload` -- **Confidence factors**: Valid TLD, path structure, parameter format +- **Validation**: URL format check with safe character filtering - **Security relevance**: High - indicates network communication #### Domain Names -- **Pattern**: `[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}` +- **Pattern**: `\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b` - **Examples**: `api.example.com`, `malware-c2.net` - **Validation**: TLD checking, DNS format compliance - **Security relevance**: High - C2 domains, legitimate services @@ -34,7 +34,7 @@ Raw String → Pattern Matching → Context Analysis → Tag Assignment → Conf - **Validation**: Two-stage validation using regex pre-filter followed by `std::net::IpAddr` parsing for correctness - **Port Handling**: IP addresses with ports (e.g., `192.168.1.1:8080`) are supported by automatically stripping the port suffix before validation - **IPv6 Bracket Handling**: Bracketed IPv6 addresses (e.g., `[::1]` and `[::1]:8080`) are supported -- **False Positive Mitigation**: Version numbers like `1.2.3.4` are filtered out using heuristics (all octets < 20) +- **False Positive Mitigation**: Version numbers like `1.2.3.4` are accepted as IPv4 addresses by design - **Implementation**: See `src/classification/semantic.rs` for the complete implementation - **Security relevance**: High - infrastructure indicators @@ -42,16 +42,34 @@ Raw String → Pattern Matching → Context Analysis → Tag Assignment → Conf #### File Paths -- **POSIX Pattern**: `/[^\0\n\r]*` -- **Windows Pattern**: `[A-Za-z]:\\[^\0\n\r]*` -- **Examples**: `/usr/bin/malware`, `C:\Windows\System32\evil.dll` -- **Context**: Section type, surrounding strings -- **Security relevance**: Medium-High - persistence locations +- **POSIX Pattern**: `^/[^\0\n\r]*` +- **Windows Pattern**: `^[A-Za-z]:\\[^\0\n\r]*` +- **UNC Pattern**: `^\\\\[a-zA-Z0-9.-]+\\[^\0\n\r]*` +- **Examples**: `/usr/bin/malware`, `C:\\Windows\\System32\\evil.dll`, `\\\\server\\share\\file.txt` +- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects double path separators (`//` for POSIX, `\\` for Windows); applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter +- **Suspicious path examples**: `/etc/cron.d/`, `/etc/init.d/`, `/usr/local/bin/`, `/tmp/`, `/var/tmp/`; `C:\\Windows\\System32\\`, `C:\\Windows\\Temp\\`, `...\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\` +- **Security relevance**: Medium-High - persistence and execution locations #### Registry Paths -- **Pattern**: `HKEY_[A-Z_]+\\[^\0\n\r]*` -- **Examples**: `HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\CurrentVersion\Run` +- **Full root pattern**: `^HKEY_[A-Z_]+\\[^\0\n\r]*` +- **Abbreviated root pattern**: `^HK(LM|CU|CR|U|CC)\\[^\0\n\r]*` +- **Supported root keys**: + - `HKEY_LOCAL_MACHINE` + - `HKEY_CURRENT_USER` + - `HKEY_CLASSES_ROOT` + - `HKEY_USERS` + - `HKEY_CURRENT_CONFIG` +- **Supported abbreviations**: + - `HKLM`, `HKCU`, `HKCR`, `HKU`, `HKCC` +- **Suspicious registry paths**: + - `\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run` + - `\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce` + - `\\System\\CurrentControlSet\\Services` + - `\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon` +- **Examples**: + - `HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run` + - `HKCU\\Software\\Microsoft` - **Security relevance**: High - persistence mechanisms ### Identifiers @@ -92,211 +110,99 @@ Raw String → Pattern Matching → Context Analysis → Tag Assignment → Conf - **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)` - **Security relevance**: Medium - network fingerprinting -## Implementation Details +### Method Signatures ### Pattern Matching Engine +The semantic classifier uses cached regex patterns via `lazy_static!` and applies validation checks to reduce false positives. + ```rust -pub struct SemanticClassifier { - url_regex: Regex, - domain_regex: Regex, - ipv4_regex: Regex, - ipv6_regex: Regex, - guid_regex: Regex, - email_regex: Regex, - format_regex: Regex, - base64_regex: Regex, +use lazy_static::lazy_static; +use regex::Regex; + +lazy_static! { + static ref URL_REGEX: Regex = Regex::new(r#"https?://[^\s<>"{}|\\^\[\]\`]+"#).unwrap(); } impl SemanticClassifier { - pub fn classify(&self, text: &str, context: &StringContext) -> Vec { + pub fn classify(&self, string: &FoundString) -> Vec { let mut tags = Vec::new(); - // Network indicators - if self.url_regex.is_match(text) { + if self.classify_url(&string.text).is_some() { tags.push(Tag::Url); } - if self.domain_regex.is_match(text) && !tags.contains(&Tag::Url) { + if self.classify_domain(&string.text).is_some() { tags.push(Tag::Domain); } - // File system - if self.is_file_path(text) { + tags.extend(self.classify_ip_addresses(&string.text)); + + if self.classify_posix_path(&string.text).is_some() + || self.classify_windows_path(&string.text).is_some() + || self.classify_unc_path(&string.text).is_some() + { tags.push(Tag::FilePath); } - if self.is_registry_path(text) { + if self.classify_registry_path(&string.text).is_some() { tags.push(Tag::RegistryPath); } - // Continue for other patterns... - tags } } ``` -### Context-Aware Classification - -Classification considers the context where strings are found: - -```rust -pub struct StringContext { - pub section_type: SectionType, - pub section_name: Option, - pub surrounding_strings: Vec, - pub binary_format: BinaryFormat, - pub encoding: Encoding, -} - -impl SemanticClassifier { - fn classify_with_context(&self, text: &str, context: &StringContext) -> Vec { - let mut tags = self.classify_patterns(text); - - // Boost confidence based on context - match context.section_type { - SectionType::Resources => { - if self.looks_like_version_string(text) { - tags.push(Tag::Version); - } - } - SectionType::StringData => { - // Higher confidence for semantic patterns - self.boost_pattern_confidence(&mut tags); - } - _ => {} - } - - tags - } -} -``` +## Implementation Details -### Symbol Classification +The classifier relies on `lazy_static!` to compile regex patterns once and reuse them across classification calls. Helper methods validate strings before assigning tags. -Import and export symbols get special handling: +Key method signatures: ```rust -pub struct SymbolClassifier { - known_apis: HashSet, - crypto_apis: HashSet, - network_apis: HashSet, -} - -impl SymbolClassifier { - pub fn classify_symbol(&self, name: &str, is_import: bool) -> Vec { - let mut tags = Vec::new(); - - if is_import { - tags.push(Tag::Import); - } else { - tags.push(Tag::Export); - } - - // Add semantic tags based on API name - if self.crypto_apis.contains(name) { - tags.push(Tag::Crypto); - } - - if self.network_apis.contains(name) { - tags.push(Tag::Network); - } - - tags - } -} +pub fn classify(&self, string: &FoundString) -> Vec; +pub fn classify_posix_path(&self, text: &str) -> Option; +pub fn classify_windows_path(&self, text: &str) -> Option; +pub fn classify_unc_path(&self, text: &str) -> Option; +pub fn classify_registry_path(&self, text: &str) -> Option; ``` -### Rust Symbol Demangling +## Using the Classification System ```rust -use rustc_demangle::demangle; - -pub fn classify_rust_symbol(mangled: &str) -> Vec { - let mut tags = vec![Tag::Export]; - - if let Ok(demangled) = demangle(mangled) { - let demangled_str = demangled.to_string(); - - // Look for common Rust patterns - if demangled_str.contains("::main") { - tags.push(Tag::EntryPoint); - } - - if demangled_str.contains("panic") { - tags.push(Tag::ErrorHandling); - } - } - - tags +use stringy::classification::SemanticClassifier; +use stringy::types::{Encoding, FoundString, StringSource, Tag}; + +let classifier = SemanticClassifier::new(); +let found_string = FoundString { + text: "C:\\Windows\\System32\\cmd.exe".to_string(), + encoding: Encoding::Ascii, + offset: 0, + rva: None, + section: None, + length: 27, + tags: Vec::new(), + score: 0, + source: StringSource::SectionData, + confidence: 1.0, +}; + +let tags = classifier.classify(&found_string); +if tags.contains(&Tag::FilePath) { + // Handle file path indicator } ``` ## Confidence Scoring -Each classification receives a confidence score: +The current implementation returns tags without explicit confidence scores. Confidence is implicit in the validation and matching logic. A future update may introduce explicit confidence values per tag. -```rust -pub struct ClassificationResult { - pub tag: Tag, - pub confidence: f32, // 0.0 to 1.0 - pub evidence: Vec, -} +## Planned Enhancements -impl SemanticClassifier { - fn calculate_confidence(&self, text: &str, tag: &Tag, context: &StringContext) -> f32 { - let mut confidence = 0.5; // Base confidence - - match tag { - Tag::Url => { - if text.starts_with("https://") { - confidence += 0.3; - } - if self.has_valid_tld(text) { - confidence += 0.2; - } - } - Tag::FilePath => { - if context.section_type == SectionType::StringData { - confidence += 0.2; - } - if self.has_valid_path_structure(text) { - confidence += 0.2; - } - } // ... other tag-specific confidence calculations - } - - confidence.min(1.0) - } -} -``` - -## Advanced Classification Features - -### Multi-Pattern Matching - -Some strings match multiple patterns: - -```rust -fn classify_multi_pattern(&self, text: &str) -> Vec { - let mut tags = Vec::new(); - - // A string can be both a URL and contain Base64 - if self.url_regex.is_match(text) { - tags.push(Tag::Url); - - // Check if URL contains Base64 parameters - if let Some(query) = self.extract_url_query(text) { - if self.base64_regex.is_match(query) { - tags.push(Tag::Base64); - } - } - } - - tags -} -``` +- Context-aware classification +- Symbol classification +- Additional semantic patterns (GUIDs, email addresses, base64, format strings) ### Language-Specific Patterns diff --git a/src/classification/mod.rs b/src/classification/mod.rs index f7c7a98..a63b138 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -10,11 +10,11 @@ //! ports, bracketed IPv6 notation, and false positive mitigation for version numbers //! - **URL Detection**: Identifies HTTP/HTTPS URLs //! - **Domain Detection**: Identifies domain names with TLD validation +//! - **File Path Detection**: Identifies POSIX, Windows, and UNC paths +//! - **Registry Path Detection**: Identifies Windows registry paths //! //! ## Future Capabilities //! -//! - File paths (POSIX and Windows) -//! - Registry paths //! - GUIDs/UUIDs //! - Email addresses //! - Base64 data @@ -29,12 +29,12 @@ //! //! let classifier = SemanticClassifier::new(); //! let found_string = FoundString { -//! text: "192.168.1.1:8080".to_string(), +//! text: "C:\\Windows\\System32\\cmd.exe".to_string(), //! encoding: Encoding::Ascii, //! offset: 0, //! rva: None, //! section: None, -//! length: 15, +//! length: 27, //! tags: Vec::new(), //! score: 0, //! source: StringSource::SectionData, @@ -42,7 +42,7 @@ //! }; //! //! let tags = classifier.classify(&found_string); -//! assert!(tags.contains(&Tag::IPv4)); +//! assert!(tags.contains(&Tag::FilePath)); //! ``` pub mod semantic; diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index da3143a..f1c8e0c 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -5,6 +5,12 @@ //! The classifier uses compiled regular expressions for efficient pattern //! matching and includes TLD validation to reduce false positives. //! +//! Current capabilities include: +//! - URLs and domain names +//! - IPv4 and IPv6 addresses +//! - POSIX and Windows file paths (including UNC paths) +//! - Windows registry paths +//! //! # Usage //! //! ```rust @@ -86,6 +92,114 @@ lazy_static! { /// /// Matches [IPv6] format used in URLs like [::1]:8080. static ref IPV6_BRACKETS_REGEX: Regex = Regex::new(r"^\[(.+)\]").unwrap(); + + /// Regular expression for matching POSIX file paths + /// + /// Pattern matches absolute POSIX paths starting with / followed by any characters + /// except null bytes, newlines, or carriage returns. + static ref POSIX_PATH_REGEX: Regex = Regex::new(r"^/[^\x00\n\r]*").unwrap(); + + /// Regular expression for matching Windows file paths + /// + /// Pattern matches Windows absolute paths starting with drive letter (C:\) + /// followed by any characters except null bytes, newlines, or carriage returns. + static ref WINDOWS_PATH_REGEX: Regex = Regex::new(r"^[A-Za-z]:\\[^\x00\n\r]*").unwrap(); + + /// Regular expression for matching UNC network paths + /// + /// Pattern matches UNC paths starting with \\ followed by server name and share. + static ref UNC_PATH_REGEX: Regex = Regex::new(r"^\\\\[a-zA-Z0-9.-]+\\[^\x00\n\r]*").unwrap(); + + /// Regular expression for matching full Windows registry paths + /// + /// Pattern matches registry paths starting with HKEY_ root keys. + static ref REGISTRY_PATH_REGEX: Regex = Regex::new(r"^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap(); + + /// Regular expression for matching abbreviated registry paths + /// + /// Pattern matches abbreviated registry forms like HKLM, HKCU, etc. + static ref REGISTRY_ABBREV_REGEX: Regex = Regex::new(r"^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap(); +} + +lazy_static! { + /// Common suspicious POSIX path prefixes for persistence detection + static ref SUSPICIOUS_POSIX_PATHS: std::collections::HashSet<&'static str> = { + let mut set = std::collections::HashSet::new(); + set.insert("/etc/cron.d/"); + set.insert("/etc/init.d/"); + set.insert("/usr/local/bin/"); + set.insert("/tmp/"); + set.insert("/var/tmp/"); + set.insert("/etc/rc.d/"); + set.insert("/etc/crontab"); + set.insert("/etc/systemd/system/"); + set.insert("~/.config/autostart/"); + set.insert("/Library/LaunchDaemons/"); + set.insert("/Library/LaunchAgents/"); + set + }; + + /// Common suspicious Windows path prefixes for persistence detection + static ref SUSPICIOUS_WINDOWS_PATHS: std::collections::HashSet<&'static str> = { + let mut set = std::collections::HashSet::new(); + set.insert("C:\\Windows\\System32\\"); + set.insert("C:\\Windows\\Temp\\"); + set.insert("\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); + set.insert("C:\\ProgramData\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\"); + set.insert("C:\\Windows\\SysWOW64\\"); + set + }; + + /// Known valid POSIX path prefixes + static ref KNOWN_POSIX_PREFIXES: std::collections::HashSet<&'static str> = { + let mut set = std::collections::HashSet::new(); + set.insert("/usr/"); + set.insert("/etc/"); + set.insert("/var/"); + set.insert("/home/"); + set.insert("/opt/"); + set.insert("/bin/"); + set.insert("/sbin/"); + set.insert("/lib/"); + set.insert("/dev/"); + set.insert("/proc/"); + set.insert("/sys/"); + set.insert("/tmp/"); + set + }; + + /// Known valid Windows path prefixes + static ref KNOWN_WINDOWS_PREFIXES: std::collections::HashSet<&'static str> = { + let mut set = std::collections::HashSet::new(); + set.insert("C:\\Windows\\"); + set.insert("C:\\Program Files\\"); + set.insert("C:\\Program Files (x86)\\"); + set.insert("C:\\Users\\"); + set.insert("C:\\ProgramData\\"); + set + }; + + /// Valid Windows registry root keys + static ref VALID_REGISTRY_ROOTS: std::collections::HashSet<&'static str> = { + let mut set = std::collections::HashSet::new(); + set.insert("HKEY_LOCAL_MACHINE"); + set.insert("HKEY_CURRENT_USER"); + set.insert("HKEY_CLASSES_ROOT"); + set.insert("HKEY_USERS"); + set.insert("HKEY_CURRENT_CONFIG"); + set + }; + + /// Suspicious Windows registry paths for persistence detection + static ref SUSPICIOUS_REGISTRY_PATHS: std::collections::HashSet<&'static str> = { + let mut set = std::collections::HashSet::new(); + set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run"); + set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\RunOnce"); + set.insert("\\System\\CurrentControlSet\\Services"); + set.insert("\\SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon"); + set.insert("\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Shell Folders"); + set + }; } /// Semantic classifier for identifying network indicators in extracted strings @@ -99,12 +213,41 @@ lazy_static! { #[derive(Debug, Default)] pub struct SemanticClassifier; +#[doc(hidden)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RegexCacheAddresses { + pub url: usize, + pub domain: usize, + pub ipv4: usize, + pub ipv6: usize, + pub posix_path: usize, + pub windows_path: usize, + pub unc_path: usize, + pub registry_full: usize, + pub registry_abbrev: usize, +} + impl SemanticClassifier { /// Create a new instance of the semantic classifier pub fn new() -> Self { Self } + #[doc(hidden)] + pub fn regex_cache_addresses(&self) -> RegexCacheAddresses { + RegexCacheAddresses { + url: &*URL_REGEX as *const Regex as usize, + domain: &*DOMAIN_REGEX as *const Regex as usize, + ipv4: &*IPV4_REGEX as *const Regex as usize, + ipv6: &*IPV6_REGEX as *const Regex as usize, + posix_path: &*POSIX_PATH_REGEX as *const Regex as usize, + windows_path: &*WINDOWS_PATH_REGEX as *const Regex as usize, + unc_path: &*UNC_PATH_REGEX as *const Regex as usize, + registry_full: &*REGISTRY_PATH_REGEX as *const Regex as usize, + registry_abbrev: &*REGISTRY_ABBREV_REGEX as *const Regex as usize, + } + } + /// Detects HTTP/HTTPS URLs in the given text /// /// This method identifies URLs that start with `http://` or `https://` @@ -237,6 +380,24 @@ impl SemanticClassifier { let ip_tags = self.classify_ip_addresses(&string.text); tags.extend(ip_tags); + // Check for file paths (POSIX, Windows, UNC) + if let Some(tag) = self.classify_posix_path(&string.text) { + tags.push(tag); + } + + if let Some(tag) = self.classify_windows_path(&string.text) { + tags.push(tag); + } + + if let Some(tag) = self.classify_unc_path(&string.text) { + tags.push(tag); + } + + // Check for registry paths + if let Some(tag) = self.classify_registry_path(&string.text) { + tags.push(tag); + } + tags } @@ -495,6 +656,206 @@ impl SemanticClassifier { Ipv6Addr::from_str(processed).is_ok() } + /// Detects POSIX file paths in the given text + /// + /// Returns `Some(Tag::FilePath)` if a POSIX path is detected and valid. + pub fn classify_posix_path(&self, text: &str) -> Option { + if !POSIX_PATH_REGEX.is_match(text) { + return None; + } + + if !self.is_valid_posix_path(text) { + return None; + } + + Some(Tag::FilePath) + } + + /// Detects Windows file paths in the given text + /// + /// Returns `Some(Tag::FilePath)` if a Windows path is detected and valid. + pub fn classify_windows_path(&self, text: &str) -> Option { + if !WINDOWS_PATH_REGEX.is_match(text) { + return None; + } + + if !self.is_valid_windows_path(text) { + return None; + } + + Some(Tag::FilePath) + } + + /// Detects UNC network paths in the given text + /// + /// Returns `Some(Tag::FilePath)` if a UNC path is detected and valid. + pub fn classify_unc_path(&self, text: &str) -> Option { + if !UNC_PATH_REGEX.is_match(text) { + return None; + } + + let trimmed = text.trim_start_matches('\\'); + let mut parts = trimmed.split('\\'); + let server = parts.next().unwrap_or(""); + let share = parts.next().unwrap_or(""); + + if server.is_empty() || share.is_empty() { + return None; + } + + Some(Tag::FilePath) + } + + /// Detects Windows registry paths in the given text + /// + /// Returns `Some(Tag::RegistryPath)` if a registry path is detected and valid. + pub fn classify_registry_path(&self, text: &str) -> Option { + if !REGISTRY_PATH_REGEX.is_match(text) && !REGISTRY_ABBREV_REGEX.is_match(text) { + return None; + } + + if !self.is_valid_registry_path(text) { + return None; + } + + Some(Tag::RegistryPath) + } + + /// Checks if the POSIX path matches known suspicious locations + pub fn is_suspicious_posix_path(&self, text: &str) -> bool { + SUSPICIOUS_POSIX_PATHS + .iter() + .any(|prefix| text.starts_with(prefix)) + } + + /// Checks if the Windows path matches known suspicious locations + pub fn is_suspicious_windows_path(&self, text: &str) -> bool { + SUSPICIOUS_WINDOWS_PATHS.iter().any(|prefix| { + if prefix.starts_with('\\') { + text.contains(prefix) + } else { + text.starts_with(prefix) + } + }) + } + + /// Checks if the registry path matches known persistence locations + pub fn is_suspicious_registry_path(&self, text: &str) -> bool { + let text_lower = text.to_ascii_lowercase(); + SUSPICIOUS_REGISTRY_PATHS + .iter() + .any(|path| text_lower.contains(&path.to_ascii_lowercase())) + } + + /// Detects printf-style placeholders to reduce false positives + fn contains_printf_placeholder(&self, text: &str) -> bool { + let mut chars = text.chars().peekable(); + + while let Some(ch) = chars.next() { + if ch == '%' + && let Some(next) = chars.peek() + && matches!(next, 's' | 'd' | 'x' | 'o' | 'u' | 'f') + { + return true; + } + } + + false + } + + /// Validates POSIX path structure + pub fn is_valid_posix_path(&self, text: &str) -> bool { + if text.len() > 4096 { + return false; + } + + if text.contains('\0') || text.contains('\n') || text.contains('\r') { + return false; + } + + if text.contains("//") { + return false; + } + + if text.contains('\\') { + return false; + } + + if self.contains_printf_placeholder(text) { + return false; + } + + let has_known_prefix = KNOWN_POSIX_PREFIXES + .iter() + .any(|prefix| text.starts_with(prefix)); + let is_suspicious = self.is_suspicious_posix_path(text); + + if !has_known_prefix && !is_suspicious && text.len() > 2048 { + return false; + } + + true + } + + /// Validates Windows path structure + pub fn is_valid_windows_path(&self, text: &str) -> bool { + if text.len() > 4096 { + return false; + } + + if text.contains('/') { + return false; + } + + if text.contains("\\\\") { + return false; + } + + if self.contains_printf_placeholder(text) { + return false; + } + + let has_known_prefix = KNOWN_WINDOWS_PREFIXES + .iter() + .any(|prefix| text.starts_with(prefix)); + let is_suspicious = self.is_suspicious_windows_path(text); + + if !has_known_prefix && !is_suspicious && text.len() > 2048 { + return false; + } + + true + } + + /// Validates Windows registry path structure + pub fn is_valid_registry_path(&self, text: &str) -> bool { + if text.contains('/') { + return false; + } + + if text.contains("\\\\") { + return false; + } + + let root = text.split('\\').next().unwrap_or(""); + let root_upper = root.to_ascii_uppercase(); + + if root_upper.starts_with("HKEY_") { + return VALID_REGISTRY_ROOTS + .iter() + .any(|valid| valid.eq_ignore_ascii_case(&root_upper)); + } + + if root_upper.starts_with("HK") { + return matches!( + root_upper.as_str(), + "HKLM" | "HKCU" | "HKCR" | "HKU" | "HKCC" + ); + } + + false + } + /// Classifies IP addresses (IPv4 and IPv6) in the given text /// /// This method checks for both IPv4 and IPv6 addresses and returns @@ -872,4 +1233,298 @@ mod tests { assert_eq!(tags.len(), 1); assert!(matches!(tags[0], Tag::IPv6)); } + + #[test] + fn test_posix_absolute_path() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_posix_path("/usr/bin/bash"), + Some(Tag::FilePath) + ); + assert_eq!( + classifier.classify_posix_path("/etc/passwd"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_posix_home_directory() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_posix_path("/home/user/.bashrc"), + Some(Tag::FilePath) + ); + assert_eq!( + classifier.classify_posix_path("/home/user/.config/app"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_posix_with_spaces() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_posix_path("/Users/John Doe/Documents/file.txt"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_posix_system_directories() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_posix_path("/usr/"), Some(Tag::FilePath)); + assert_eq!(classifier.classify_posix_path("/etc/"), Some(Tag::FilePath)); + assert_eq!(classifier.classify_posix_path("/var/"), Some(Tag::FilePath)); + } + + #[test] + fn test_posix_suspicious_paths() { + let classifier = SemanticClassifier::new(); + + assert!(classifier.is_suspicious_posix_path("/tmp/malware")); + assert!(classifier.is_suspicious_posix_path("/etc/cron.d/backdoor")); + } + + #[test] + fn test_posix_too_short() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_posix_path("/a"), Some(Tag::FilePath)); + } + + #[test] + fn test_posix_invalid() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_posix_path("usr/bin/bash"), None); + } + + #[test] + fn test_posix_with_null_bytes() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_posix_path("/tmp/evil\0bin"), None); + } + + #[test] + fn test_windows_absolute_path() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_windows_path("C:\\Windows\\System32\\cmd.exe"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_windows_program_files() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_windows_path("C:\\Program Files (x86)\\App"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_windows_with_spaces() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_windows_path("D:\\My Documents\\file.txt"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_windows_different_drives() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_windows_path("D:\\"), + Some(Tag::FilePath) + ); + assert_eq!( + classifier.classify_windows_path("E:\\Data\\"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_windows_suspicious_paths() { + let classifier = SemanticClassifier::new(); + + assert!(classifier.is_suspicious_windows_path("C:\\Windows\\Temp\\evil.exe")); + } + + #[test] + fn test_windows_case_insensitive() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_windows_path("c:\\windows\\"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_windows_invalid() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_windows_path("C:/forward/slash"), None); + } + + #[test] + fn test_windows_invalid_drive() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_windows_path("1:\\path"), None); + } + + #[test] + fn test_unc_path() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_unc_path("\\\\server\\share\\file.txt"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_unc_with_domain() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_unc_path("\\\\server.domain.com\\share\\"), + Some(Tag::FilePath) + ); + } + + #[test] + fn test_unc_invalid() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_unc_path("\\\\\\\\"), None); + assert_eq!(classifier.classify_unc_path("\\\\server"), None); + } + + #[test] + fn test_registry_run_key() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_registry_path( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + ), + Some(Tag::RegistryPath) + ); + } + + #[test] + fn test_registry_current_user() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_registry_path("HKEY_CURRENT_USER\\Software\\App\\Settings"), + Some(Tag::RegistryPath) + ); + } + + #[test] + fn test_registry_abbreviated_hklm() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_registry_path("HKLM\\System\\CurrentControlSet"), + Some(Tag::RegistryPath) + ); + } + + #[test] + fn test_registry_abbreviated_hkcu() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_registry_path("HKCU\\Software\\Microsoft"), + Some(Tag::RegistryPath) + ); + } + + #[test] + fn test_registry_persistence_run() { + let classifier = SemanticClassifier::new(); + + assert!(classifier.is_suspicious_registry_path( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + )); + } + + #[test] + fn test_registry_invalid_root() { + let classifier = SemanticClassifier::new(); + + assert_eq!( + classifier.classify_registry_path("HKEY_INVALID\\Path"), + None + ); + } + + #[test] + fn test_registry_forward_slash() { + let classifier = SemanticClassifier::new(); + + assert_eq!(classifier.classify_registry_path("HKLM/Software"), None); + } + + #[test] + fn test_classify_mixed_strings() { + let classifier = SemanticClassifier::new(); + let found_string = create_test_string("https://example.com"); + + let tags = classifier.classify(&found_string); + assert!(tags.contains(&Tag::Url)); + } + + #[test] + fn test_classify_posix_path_in_found_string() { + let classifier = SemanticClassifier::new(); + let found_string = create_test_string("/usr/bin/bash"); + + let tags = classifier.classify(&found_string); + assert!(tags.contains(&Tag::FilePath)); + } + + #[test] + fn test_classify_windows_path_in_found_string() { + let classifier = SemanticClassifier::new(); + let found_string = create_test_string("C:\\Windows\\System32\\cmd.exe"); + + let tags = classifier.classify(&found_string); + assert!(tags.contains(&Tag::FilePath)); + } + + #[test] + fn test_classify_registry_path_in_found_string() { + let classifier = SemanticClassifier::new(); + let found_string = create_test_string( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run", + ); + + let tags = classifier.classify(&found_string); + assert!(tags.contains(&Tag::RegistryPath)); + } + + #[test] + fn test_no_false_positives_on_random_data() { + let classifier = SemanticClassifier::new(); + let found_string = create_test_string("x9qz1p0t8v7w6r5y4u3i2o1p"); + + let tags = classifier.classify(&found_string); + assert!(tags.is_empty()); + } } diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs new file mode 100644 index 0000000..528ec36 --- /dev/null +++ b/tests/classification_integration.rs @@ -0,0 +1,151 @@ +use insta::assert_debug_snapshot; +use std::time::{Duration, Instant}; +use stringy::classification::SemanticClassifier; +use stringy::types::{Encoding, FoundString, StringSource, Tag}; + +fn make_found_string(text: &str) -> FoundString { + FoundString { + text: text.to_string(), + encoding: Encoding::Ascii, + offset: 0, + rva: None, + section: None, + length: text.len() as u32, + tags: Vec::new(), + score: 0, + source: StringSource::SectionData, + confidence: 1.0, + } +} + +fn classify_tags(classifier: &SemanticClassifier, text: &str) -> Vec { + classifier.classify(&make_found_string(text)) +} + +fn tags_as_strings(tags: &[Tag]) -> Vec { + let mut values: Vec = tags.iter().map(|tag| format!("{:?}", tag)).collect(); + values.sort(); + values +} + +#[test] +fn test_classify_mixed_indicators() { + let classifier = SemanticClassifier::new(); + + let samples = vec![ + ("https://example.com", vec![Tag::Url]), + ("example.com", vec![Tag::Domain]), + ("192.168.1.1", vec![Tag::IPv4]), + ("::1", vec![Tag::IPv6]), + ("/usr/bin/bash", vec![Tag::FilePath]), + ("C:\\Windows\\System32\\cmd.exe", vec![Tag::FilePath]), + ( + "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run", + vec![Tag::RegistryPath], + ), + ]; + + for (text, expected) in samples { + let tags = classify_tags(&classifier, text); + for tag in expected { + assert!(tags.contains(&tag)); + } + } +} + +#[test] +fn test_classify_all_path_types() { + let classifier = SemanticClassifier::new(); + + let posix_tags = classify_tags(&classifier, "/etc/passwd"); + assert!(posix_tags.contains(&Tag::FilePath)); + + let windows_tags = classify_tags(&classifier, "C:\\Windows\\Temp\\evil.exe"); + assert!(windows_tags.contains(&Tag::FilePath)); + + let unc_tags = classify_tags(&classifier, "\\\\server\\share\\file.txt"); + assert!(unc_tags.contains(&Tag::FilePath)); + + let registry_tags = classify_tags(&classifier, "HKLM\\System\\CurrentControlSet\\Services"); + assert!(registry_tags.contains(&Tag::RegistryPath)); +} + +#[test] +fn test_classification_performance() { + let classifier = SemanticClassifier::new(); + + let mut samples = Vec::new(); + for index in 0..350 { + samples.push(format!("https://example.com/api/{}", index)); + samples.push(format!("C:\\Windows\\Temp\\file{}.tmp", index)); + samples.push(format!("/usr/local/bin/tool{}", index)); + } + + let start = Instant::now(); + for sample in &samples { + let _ = classify_tags(&classifier, sample); + } + let elapsed = start.elapsed(); + + assert!(elapsed < Duration::from_millis(100)); +} + +#[test] +fn test_regex_caching() { + let classifier = SemanticClassifier::new(); + let first = classifier.regex_cache_addresses(); + + let second_classifier = SemanticClassifier::new(); + let second = second_classifier.regex_cache_addresses(); + + assert_eq!(first, second); +} + +#[test] +fn test_no_false_positives_on_random_data() { + let classifier = SemanticClassifier::new(); + let tags = classify_tags(&classifier, "x9qz1p0t8v7w6r5y4u3i2o1p"); + + assert!(tags.is_empty()); +} + +#[test] +fn test_format_strings_not_paths() { + let classifier = SemanticClassifier::new(); + let tags = classify_tags(&classifier, "C:\\%s"); + + assert!(!tags.contains(&Tag::FilePath)); +} + +#[test] +fn test_version_numbers_not_paths() { + let classifier = SemanticClassifier::new(); + let tags = classify_tags(&classifier, "1.2.3.4"); + + assert!(tags.contains(&Tag::IPv4)); + assert!(!tags.contains(&Tag::FilePath)); +} + +#[test] +fn test_classification_snapshots() { + let classifier = SemanticClassifier::new(); + + let inputs = [ + "https://example.com", + "192.168.1.1", + "/usr/bin/bash", + "C:\\Windows\\System32\\cmd.exe", + "\\\\server\\share\\file.txt", + "HKCU\\Software\\Microsoft", + ]; + + let snapshot: Vec<(String, Vec)> = inputs + .iter() + .map(|text| { + let tags = classify_tags(&classifier, text); + (text.to_string(), tags_as_strings(&tags)) + }) + .collect(); + + assert_debug_snapshot!(snapshot); +} diff --git a/tests/snapshots/classification_integration__classification_snapshots.snap b/tests/snapshots/classification_integration__classification_snapshots.snap new file mode 100644 index 0000000..21b9b32 --- /dev/null +++ b/tests/snapshots/classification_integration__classification_snapshots.snap @@ -0,0 +1,43 @@ +--- +source: tests/classification_integration.rs +assertion_line: 150 +expression: snapshot +--- +[ + ( + "https://example.com", + [ + "Url", + ], + ), + ( + "192.168.1.1", + [ + "IPv4", + ], + ), + ( + "/usr/bin/bash", + [ + "FilePath", + ], + ), + ( + "C:\\Windows\\System32\\cmd.exe", + [ + "FilePath", + ], + ), + ( + "\\\\server\\share\\file.txt", + [ + "FilePath", + ], + ), + ( + "HKCU\\Software\\Microsoft", + [ + "RegistryPath", + ], + ), +] From 449e42561debb07bc7b2475f4eb9e66c299c7502 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 16:29:37 -0500 Subject: [PATCH 06/19] chore: minor docs and test adjustments - Fix MSRV reference from 1.91+ to 1.85+ in copilot instructions - Reorganize classification docs for clarity - Increase classification test timeout for CI reliability Co-Authored-By: Claude Opus 4.5 --- .claude/settings.json | 5 +++++ .github/copilot-instructions.md | 4 ++-- docs/src/classification.md | 8 ++++---- tests/classification_integration.rs | 3 ++- 4 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 .claude/settings.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..37c2a2b --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,5 @@ +{ + "enabledPlugins": { + "commit@cc-marketplace": true + } +} diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 6f05c41..beab23e 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -24,7 +24,7 @@ Binary → Format Detection (goblin) → Container Parsing → String Extraction - **No `unsafe` code**: `#![forbid(unsafe_code)]` enforced at package level - **Zero warnings**: `cargo clippy -- -D warnings` must pass (`#![deny(warnings)]` enforced) -- **Rust 2024 Edition**: MSRV 1.91+, always use latest edition features +- **Rust 2024 Edition**: MSRV 1.85+, always use latest edition features - **File size limit**: Keep files ≤500-600 lines; split larger files into focused modules - **No blanket `#[allow]`**: Any `allow` attribute requires inline justification and cannot apply to entire files/modules - **Character restrictions**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) @@ -179,7 +179,7 @@ The `justfile` uses OS annotations (`[windows]`/`[unix]`) for cross-platform com - Full semantic classification suite (GUIDs, paths, format strings, Base64) - Ranking/scoring algorithm implementation -- CLI interface (`main.rs` is placeholder) +- CLI (`main.rs` is placeholder) - Output formatters (JSON, YARA-friendly, human-readable) ## Quick Reference Examples diff --git a/docs/src/classification.md b/docs/src/classification.md index 24657da..044bfc5 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -110,8 +110,6 @@ Raw String -> Pattern Matching -> Tag Assignment - **Examples**: `Mozilla/5.0 (Windows NT 10.0; Win64; x64)` - **Security relevance**: Medium - network fingerprinting -### Method Signatures - ### Pattern Matching Engine The semantic classifier uses cached regex patterns via `lazy_static!` and applies validation checks to reduce false positives. @@ -158,6 +156,8 @@ impl SemanticClassifier { The classifier relies on `lazy_static!` to compile regex patterns once and reuse them across classification calls. Helper methods validate strings before assigning tags. +### Method Signatures + Key method signatures: ```rust @@ -198,11 +198,11 @@ if tags.contains(&Tag::FilePath) { The current implementation returns tags without explicit confidence scores. Confidence is implicit in the validation and matching logic. A future update may introduce explicit confidence values per tag. -## Planned Enhancements +## Planned Enhancements (implementation pending) - Context-aware classification - Symbol classification -- Additional semantic patterns (GUIDs, email addresses, base64, format strings) +- Additional semantic patterns (GUIDs, email addresses, base64, format strings) - documented above, implementation pending ### Language-Specific Patterns diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs index 528ec36..da78516 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -70,6 +70,7 @@ fn test_classify_all_path_types() { assert!(registry_tags.contains(&Tag::RegistryPath)); } +// Note: classify_tags with SemanticClassifier can be slow on CI. #[test] fn test_classification_performance() { let classifier = SemanticClassifier::new(); @@ -87,7 +88,7 @@ fn test_classification_performance() { } let elapsed = start.elapsed(); - assert!(elapsed < Duration::from_millis(100)); + assert!(elapsed < Duration::from_millis(500)); } #[test] From 6ec87f88a7d1be1723dbbcf55d22f0e1187d9dcc Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Sat, 17 Jan 2026 17:04:36 -0500 Subject: [PATCH 07/19] fix(classification): address code review feedback on path classification performance and consistency (#122) * Initial plan * fix: address code review feedback on path classification - Clarify Windows path separator validation in docs - Optimize registry path matching with zero-allocation case-insensitive search - Prevent duplicate Tag::FilePath entries - Add comment explaining 500ms performance test timeout Co-authored-by: unclesp1d3r <251112+unclesp1d3r@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: unclesp1d3r <251112+unclesp1d3r@users.noreply.github.com> --- docs/src/classification.md | 2 +- src/classification/semantic.rs | 38 +++++++++++++++++++---------- tests/classification_integration.rs | 4 +++ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/docs/src/classification.md b/docs/src/classification.md index 044bfc5..fea0ba3 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -46,7 +46,7 @@ Raw String -> Pattern Matching -> Tag Assignment - **Windows Pattern**: `^[A-Za-z]:\\[^\0\n\r]*` - **UNC Pattern**: `^\\\\[a-zA-Z0-9.-]+\\[^\0\n\r]*` - **Examples**: `/usr/bin/malware`, `C:\\Windows\\System32\\evil.dll`, `\\\\server\\share\\file.txt` -- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects double path separators (`//` for POSIX, `\\` for Windows); applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter +- **Validation rules**: Rejects null bytes, newlines, carriage returns; rejects consecutive path separators in POSIX paths (`//`) and consecutive backslashes in Windows paths (for example, `folder\\\\file.txt`), while allowing UNC paths that start with `\\\\`; applies a reasonable length limit (4096 max, stricter for unknown prefixes); POSIX paths must be absolute (start with `/`); Windows paths must use backslashes and a valid drive letter - **Suspicious path examples**: `/etc/cron.d/`, `/etc/init.d/`, `/usr/local/bin/`, `/tmp/`, `/var/tmp/`; `C:\\Windows\\System32\\`, `C:\\Windows\\Temp\\`, `...\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\` - **Security relevance**: Medium-High - persistence and execution locations diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index f1c8e0c..812b87a 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -380,17 +380,12 @@ impl SemanticClassifier { let ip_tags = self.classify_ip_addresses(&string.text); tags.extend(ip_tags); - // Check for file paths (POSIX, Windows, UNC) - if let Some(tag) = self.classify_posix_path(&string.text) { - tags.push(tag); - } - - if let Some(tag) = self.classify_windows_path(&string.text) { - tags.push(tag); - } - - if let Some(tag) = self.classify_unc_path(&string.text) { - tags.push(tag); + // Check for file paths (POSIX, Windows, UNC) - only add FilePath tag once + if self.classify_posix_path(&string.text).is_some() + || self.classify_windows_path(&string.text).is_some() + || self.classify_unc_path(&string.text).is_some() + { + tags.push(Tag::FilePath); } // Check for registry paths @@ -741,10 +736,27 @@ impl SemanticClassifier { /// Checks if the registry path matches known persistence locations pub fn is_suspicious_registry_path(&self, text: &str) -> bool { - let text_lower = text.to_ascii_lowercase(); SUSPICIOUS_REGISTRY_PATHS .iter() - .any(|path| text_lower.contains(&path.to_ascii_lowercase())) + .any(|path| self.contains_ascii_case_insensitive(text, path)) + } + + /// Case-insensitive ASCII substring search without allocations + fn contains_ascii_case_insensitive(&self, haystack: &str, needle: &str) -> bool { + if needle.is_empty() { + return true; + } + + let haystack_bytes = haystack.as_bytes(); + let needle_bytes = needle.as_bytes(); + + if needle_bytes.len() > haystack_bytes.len() { + return false; + } + + haystack_bytes + .windows(needle_bytes.len()) + .any(|window| window.eq_ignore_ascii_case(needle_bytes)) } /// Detects printf-style placeholders to reduce false positives diff --git a/tests/classification_integration.rs b/tests/classification_integration.rs index da78516..710f2ea 100644 --- a/tests/classification_integration.rs +++ b/tests/classification_integration.rs @@ -88,6 +88,10 @@ fn test_classification_performance() { } let elapsed = start.elapsed(); + // Timeout is set to 500ms to accommodate slower CI environments while still detecting + // performance regressions. This processes 1050 samples (350 iterations x 3 samples each). + // The timeout is higher than typical development performance (~50-100ms) to ensure + // CI stability across different runner configurations and load conditions. assert!(elapsed < Duration::from_millis(500)); } From 6f51ccabb9333756641f6d139b781261a28c7bfa Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 17:56:13 -0500 Subject: [PATCH 08/19] chore: add comprehensive codebase analysis documentation Signed-off-by: UncleSp1d3r --- codebase_analysis.md | 605 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 605 insertions(+) create mode 100644 codebase_analysis.md diff --git a/codebase_analysis.md b/codebase_analysis.md new file mode 100644 index 0000000..2ac1405 --- /dev/null +++ b/codebase_analysis.md @@ -0,0 +1,605 @@ +# Stringy Codebase Analysis + +## 1. Project Overview + +**Stringy** is a smarter alternative to the standard `strings` command that extracts meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. + +### Key Differentiators + +- **Data-Structure Aware**: Extracts strings from actual binary data structures, not arbitrary byte runs +- **Section-Aware**: Prioritizes high-value sections (`.rodata`, `.rdata`, `__cstring`) with weight-based scoring +- **Encoding-Aware**: Supports ASCII, UTF-8, UTF-16LE/BE with confidence scoring +- **Semantically Tagged**: Identifies URLs, domains, IPs, file paths, registry keys, GUIDs, and more +- **Ranked Output**: Presents most relevant strings first using a scoring algorithm + +### Project Metadata + +| Attribute | Value | +| ------------- | ----------------------------------------------- | +| Language | Rust 2024 Edition | +| MSRV | 1.85+ | +| License | Apache-2.0 | +| Repository | | +| Version | 0.1.0 (in development) | +| Total LoC | ~11,153 (src) + ~5,254 (tests) = ~16,407 lines | + +--- + +## 2. Directory Structure Analysis + +``` +D:\Stringy\ +|-- .github/ +| |-- copilot-instructions.md # AI agent guidelines +| |-- dependabot.yml # Dependency updates +| `-- workflows/ +| `-- ci.yml # CI pipeline +|-- .kiro/ +| `-- specs/ +| `-- stringy-binary-analyzer/ +| |-- design.md # Architecture design +| |-- requirements.md # 9 project requirements +| `-- tasks.md # Implementation tracking +|-- benches/ +| |-- ascii_extraction.rs # ASCII extraction benchmarks +| |-- elf.rs # ELF parsing benchmarks +| `-- pe.rs # PE parsing benchmarks +|-- docs/ +| |-- book.toml # mdBook configuration +| `-- src/ # Documentation source +|-- src/ +| |-- lib.rs # Library entry point (86 lines) +| |-- main.rs # CLI placeholder (23 lines) +| |-- types.rs # Core data structures (309 lines) +| |-- classification/ +| | |-- mod.rs # Module exports (49 lines) +| | `-- semantic.rs # Semantic classifier (1,542 lines) +| |-- container/ +| | |-- mod.rs # Parser trait & detection (73 lines) +| | |-- elf.rs # ELF parser (627 lines) +| | |-- pe.rs # PE parser (661 lines) +| | `-- macho.rs # Mach-O parser (574 lines) +| |-- extraction/ +| | |-- mod.rs # Extraction framework (1,498 lines) +| | |-- ascii.rs # ASCII extraction (820 lines) +| | |-- config.rs # Extraction config (221 lines) +| | |-- dedup.rs # Deduplication (841 lines) +| | |-- filters.rs # Noise filters (702 lines) +| | |-- macho_load_commands.rs # Mach-O commands (370 lines) +| | |-- pe_resources.rs # PE resources (1,430 lines) +| | |-- utf16.rs # UTF-16 extraction (1,269 lines) +| | `-- util.rs # Utilities (57 lines) +| `-- output/ +| `-- mod.rs # Output formatters (1 line, planned) +|-- tests/ +| |-- fixtures/ # Binary test fixtures +| |-- snapshots/ # Insta snapshots +| |-- classification_integration.rs +| |-- integration_elf.rs +| |-- integration_extraction.rs +| |-- integration_macho.rs +| |-- integration_pe.rs +| |-- test_ascii_extraction.rs +| |-- test_ascii_integration.rs +| |-- test_deduplication.rs +| |-- test_noise_filters.rs +| `-- test_utf16_extraction.rs +|-- Cargo.toml # Project manifest +|-- justfile # Build automation (444 lines) +|-- CLAUDE.md # Claude Code instructions +|-- AGENTS.md # AI agent guidelines +`-- README.md # Project documentation +``` + +--- + +## 3. File-by-File Breakdown + +### Core Library (`src/`) + +#### `src/lib.rs` (86 lines) + +Library entry point with module declarations and public re-exports. + +```rust +#![forbid(unsafe_code)] +#![deny(warnings)] + +pub mod classification; +pub mod container; +pub mod extraction; +pub mod output; +pub mod types; + +// Re-exports for ergonomic imports +pub use classification::SemanticClassifier; +pub use container::{create_parser, detect_format, ContainerParser}; +pub use extraction::{BasicExtractor, StringExtractor, /* ... */}; +pub use types::{BinaryFormat, ContainerInfo, Encoding, FoundString, /* ... */}; +``` + +#### `src/main.rs` (23 lines) + +CLI placeholder using `clap` derive macros. + +```rust +#[derive(Parser)] +#[command(name = "stringy")] +struct Cli { + #[arg(value_name = "FILE")] + input: PathBuf, +} + +fn main() -> Result<(), Box> { + let _args = Cli::parse(); + // TODO: Implement main extraction pipeline + Ok(()) +} +``` + +#### `src/types.rs` (309 lines) + +Core data structures with comprehensive type definitions: + +| Type | Purpose | +| --------------- | ---------------------------------------------------- | +| `Tag` | Semantic classification tags (Url, Domain, IPv4...) | +| `Encoding` | String encoding (Ascii, Utf8, Utf16Le, Utf16Be) | +| `BinaryFormat` | Binary format (Elf, Pe, MachO, Unknown) | +| `SectionType` | Section classification (Code, ReadOnlyData, etc.) | +| `StringSource` | String origin (SectionData, Import, Export, etc.) | +| `ContainerInfo` | Parsed binary metadata (non-exhaustive) | +| `SectionInfo` | Section details with weight scoring | +| `FoundString` | Extracted string with full metadata | +| `StringyError` | Error types with `thiserror` | + +### Container Module (`src/container/`) + +#### `src/container/mod.rs` (73 lines) + +Defines the `ContainerParser` trait and format detection. + +```rust +pub trait ContainerParser { + fn detect(data: &[u8]) -> bool where Self: Sized; + fn parse(&self, data: &[u8]) -> Result; +} + +pub fn detect_format(data: &[u8]) -> BinaryFormat { /* ... */ } +pub fn create_parser(format: BinaryFormat) -> Result> { /* ... */ } +``` + +#### `src/container/elf.rs` (627 lines) + +ELF binary parser with section weight system: + +| Section Pattern | Weight | Description | +| -------------------- | ------ | --------------------- | +| `.rodata` | 10.0 | Read-only data | +| `.comment`, `.note` | 9.0 | Build info | +| `.data.rel.ro` | 7.0 | Relocated read-only | +| `.data` | 5.0 | Writable data | +| `.dynstr`, `.strtab` | 8.0 | String tables | + +#### `src/container/pe.rs` (661 lines) + +PE binary parser with Windows-specific handling: + +| Section Pattern | Weight | Description | +| --------------- | ------ | ------------------ | +| `.rdata` | 10.0 | Read-only data | +| `.rsrc` | 9.0 | Resources | +| `.text` | 3.0 | Code section | +| `.data` | 5.0-7.0| Data (by perms) | + +#### `src/container/macho.rs` (574 lines) + +Mach-O parser for macOS/iOS binaries: + +| Segment/Section | Weight | Description | +| ------------------------ | ------ | ----------------- | +| `__TEXT,__cstring` | 10.0 | C strings | +| `__TEXT,__const` | 9.0 | Constants | +| `__DATA_CONST` | 7.0 | Const data | +| `__DATA,__data` | 5.0 | Writable data | + +### Extraction Module (`src/extraction/`) + +#### `src/extraction/mod.rs` (1,498 lines) + +Main extraction framework with `StringExtractor` trait and `BasicExtractor`. + +```rust +pub trait StringExtractor { + fn extract(&self, data: &[u8], info: &ContainerInfo) -> Vec; +} + +pub struct BasicExtractor { + ascii_config: AsciiExtractionConfig, + utf16_config: Utf16ExtractionConfig, + filter_config: FilterConfig, +} +``` + +#### `src/extraction/ascii.rs` (820 lines) + +ASCII/UTF-8 string extraction with configurable parameters. + +#### `src/extraction/utf16.rs` (1,269 lines) + +UTF-16LE/BE extraction with confidence scoring and BOM detection. + +#### `src/extraction/dedup.rs` (841 lines) + +Deduplication with occurrence tracking and score aggregation: + +``` +Score = max(base_scores) + 5*(count-1) + 10*(cross_section) + 15*(multi_source) + confidence_boost +``` + +#### `src/extraction/pe_resources.rs` (1,430 lines) + +PE resource extraction (VERSIONINFO, STRINGTABLE, MANIFEST). + +#### `src/extraction/filters.rs` (702 lines) + +Noise filtering to reduce false positives. + +### Classification Module (`src/classification/`) + +#### `src/classification/semantic.rs` (1,542 lines) + +Semantic classifier with pattern matching: + +| Pattern Type | Implementation | +| -------------- | ------------------------------------------------- | +| URLs | Regex with safe character filtering | +| Domains | TLD validation, DNS format compliance | +| IPv4/IPv6 | Regex pre-filter + `std::net::IpAddr` validation | +| POSIX Paths | `/path` format with validation rules | +| Windows Paths | `C:\path` format with drive letter validation | +| UNC Paths | `\\server\share` format | +| Registry Paths | HKEY_*/HK* prefix detection | + +--- + +## 4. API Endpoints Analysis + +**N/A** - Stringy is a command-line tool, not a web service. The public API is exposed as a Rust library: + +```rust +// Library usage +use stringy::{detect_format, create_parser, BasicExtractor, SemanticClassifier}; + +let data = std::fs::read("binary")?; +let format = detect_format(&data); +let parser = create_parser(format)?; +let info = parser.parse(&data)?; + +let extractor = BasicExtractor::new(/* configs */); +let strings = extractor.extract(&data, &info); + +let classifier = SemanticClassifier::new(); +for s in &strings { + let tags = classifier.classify(s); +} +``` + +--- + +## 5. Architecture Deep Dive + +### Data Flow Pipeline + +``` +Binary File + | + v ++-------------------+ +| Format Detection | detect_format() -> BinaryFormat ++-------------------+ + | + v ++-------------------+ +| Container Parser | ContainerParser::parse() -> ContainerInfo +| (ELF/PE/Mach-O) | - Section analysis with weights ++-------------------+ - Import/export extraction + | + v ++-------------------+ +| String Extraction | StringExtractor::extract() -> Vec +| - ASCII/UTF-8 | - Per-section extraction +| - UTF-16LE/BE | - PE resource extraction ++-------------------+ + | + v ++-------------------+ +| Deduplication | Deduplicator::deduplicate() +| - Occurrence | - Score aggregation +| tracking | - Tag merging ++-------------------+ + | + v ++-------------------+ +| Classification | SemanticClassifier::classify() +| - URLs, IPs | - Pattern matching +| - Paths, Registry | - Validation ++-------------------+ + | + v ++-------------------+ +| Ranking | (In progress) +| - Score-based | +| prioritization | ++-------------------+ + | + v ++-------------------+ +| Output Formatter | (Planned) +| - JSON/JSONL | +| - Human-readable | +| - YARA-friendly | ++-------------------+ +``` + +### Design Patterns + +1. **Trait-Based Polymorphism**: `ContainerParser` and `StringExtractor` traits enable format extensibility +2. **Builder Pattern**: Extraction configs use builder-style construction +3. **Non-Exhaustive Enums/Structs**: Public API stability via `#[non_exhaustive]` +4. **Lazy Static Regex**: Compiled once via `lazy_static!` for performance +5. **Error Propagation**: `thiserror` for structured error handling with context + +### Module Dependencies + +``` +types.rs (core data structures) + ^ + | ++---+---+---+---+ +| | | | | +v v v v v +container/ extraction/ classification/ output/ + | | | + +-----------+--------------+ + | + v + lib.rs (re-exports) + | + v + main.rs (CLI) +``` + +--- + +## 6. Environment & Setup Analysis + +### Prerequisites + +- Rust 1.85+ (2024 Edition) +- Cargo (included with Rust) +- Optional: `just` command runner + +### Development Setup + +```bash +# Clone repository +git clone https://github.com/EvilBit-Labs/Stringy +cd Stringy + +# Install tools (via justfile) +just setup # Installs rustfmt, clippy, llvm-tools-preview, mdformat + +# Build +just build # Debug build +cargo build --release # Release build + +# Test +just test # Run with nextest +cargo test # Standard test runner + +# Lint +just lint # Full lint suite (rustfmt, clippy, actionlint, cspell, markdown) +just check # Pre-commit checks +``` + +### CI Pipeline (`.github/workflows/ci.yml`) + +| Job | Description | +| ------------ | ---------------------------------------------- | +| `check` | Format check, clippy, build | +| `test` | Run tests on ubuntu-latest, windows-latest | +| `coverage` | Generate LCOV coverage report | +| `docs` | Build mdBook documentation | + +### Environment Variables + +None required for basic operation. CI uses: + +- `CARGO_TERM_COLOR=always` +- `RUSTFLAGS=-Cinstrument-coverage` (for coverage) + +--- + +## 7. Technology Stack Breakdown + +### Core Dependencies + +| Crate | Version | Purpose | +| ------------ | ------- | ------------------------------------ | +| `goblin` | 0.10.4 | ELF/PE/Mach-O parsing | +| `pelite` | 0.10.0 | PE resource extraction | +| `clap` | 4.5.54 | CLI argument parsing (derive macros) | +| `regex` | 1.12.2 | Pattern matching for classification | +| `lazy_static`| 1.5 | Compile-time regex caching | +| `serde` | 1.0.228 | Serialization (JSON output) | +| `serde_json` | 1.0.148 | JSON formatting | +| `thiserror` | 2.0.17 | Error handling with derives | +| `entropy` | 0.4.2 | Entropy calculation for filtering | + +### Development Dependencies + +| Crate | Version | Purpose | +| ---------- | ------- | ------------------------- | +| `criterion`| 0.8.1 | Benchmarking framework | +| `insta` | 1.46.0 | Snapshot testing | +| `tempfile` | 3.24.0 | Temporary file handling | + +### Build Tools + +| Tool | Purpose | +| ---------- | -------------------------------------- | +| `just` | Cross-platform task runner | +| `nextest` | Fast test runner | +| `mdformat` | Markdown formatting | +| `mdbook` | Documentation generation | +| `cspell` | Spell checking | +| `actionlint`| GitHub Actions linting | + +--- + +## 8. Visual Architecture Diagram + +``` ++===========================================================================+ +| STRINGY ARCHITECTURE | ++===========================================================================+ + + +----------------+ + | Binary File | + | (ELF/PE/MachO) | + +-------+--------+ + | + v ++-----------------------------------------------------------------------------+ +| CONTAINER LAYER | +| +------------------+ +------------------+ +------------------+ | +| | ElfParser | | PeParser | | MachoParser | | +| | - Section scan | | - Section scan | | - Segment scan | | +| | - Weight assign | | - Resource enum | | - Load commands | | +| | - Symbol extract | | - Import/Export | | - Symbol extract | | +| +------------------+ +------------------+ +------------------+ | +| \ | / | +| \ | / | +| +------------------+-----------------+ | +| | | +| v | +| +----------------+ | +| | ContainerInfo | | +| | - Format | | +| | - Sections[] | | +| | - Imports[] | | +| | - Exports[] | | +| +----------------+ | ++-----------------------------------------------------------------------------+ + | + v ++-----------------------------------------------------------------------------+ +| EXTRACTION LAYER | +| +------------------+ +------------------+ +------------------+ | +| | AsciiExtractor | | Utf16Extractor | | PeResourceExtractor| | +| | - Min/max length | | - LE/BE support | | - VERSIONINFO | | +| | - UTF-8 validate | | - BOM detection | | - STRINGTABLE | | +| | - Confidence | | - Confidence | | - MANIFEST | | +| +------------------+ +------------------+ +------------------+ | +| \ | / | +| +-------------------+-------------------+ | +| | | +| v | +| +----------------+ | +| | Deduplicator | | +| | - Group by key | | +| | - Merge tags | | +| | - Score calc | | +| +----------------+ | ++-----------------------------------------------------------------------------+ + | + v ++-----------------------------------------------------------------------------+ +| CLASSIFICATION LAYER | +| +--------------------+ | +| | SemanticClassifier | | +| +--------------------+ | +| | | +| +------------+ +--------+ +--------+ +----------+ +----------+ | +| | URL | | Domain | | IP | | FilePath | | Registry | | +| | Detection | | Check | | v4/v6 | | POSIX/Win| | Path | | +| +------------+ +--------+ +--------+ +----------+ +----------+ | ++-----------------------------------------------------------------------------+ + | + v ++-----------------------------------------------------------------------------+ +| OUTPUT LAYER (Planned) | +| +------------------+ +------------------+ +------------------+ | +| | JSON Formatter | | Human Formatter | | YARA Formatter | | +| +------------------+ +------------------+ +------------------+ | ++-----------------------------------------------------------------------------+ + | + v + +----------------+ + | CLI Output | + | (stdout/file) | + +----------------+ +``` + +--- + +## 9. Key Insights & Recommendations + +### Strengths + +1. **Solid Foundation**: Well-structured module organization with clear separation of concerns +2. **Type Safety**: Comprehensive error handling with `thiserror` and extensive use of Rust's type system +3. **Extensibility**: Trait-based design (`ContainerParser`, `StringExtractor`) enables easy format additions +4. **Performance Focus**: Regex caching via `lazy_static!`, section weight prioritization +5. **Testing Coverage**: Snapshot tests with `insta`, benchmarks with `criterion`, integration tests for all formats +6. **Code Quality**: `#![forbid(unsafe_code)]`, `#![deny(warnings)]`, comprehensive linting + +### Areas for Completion + +1. **CLI Implementation**: `main.rs` is a placeholder - full pipeline integration needed +2. **Output Formatters**: `output/mod.rs` is empty - JSON, human-readable, YARA outputs pending +3. **Additional Classifiers**: GUIDs, email addresses, Base64, format strings documented but not implemented +4. **Ranking System**: Score-based prioritization framework exists but needs completion + +### Recommendations + +1. **Complete CLI Pipeline**: Wire up container parsing -> extraction -> classification -> output +2. **Implement Output Formatters**: Start with JSON (most requested for pipelines) +3. **Add Missing Classifiers**: GUID and email detection are straightforward additions +4. **Performance Benchmarks**: Expand benchmarks to cover full pipeline, not just parsing +5. **Documentation**: Complete mdBook documentation with usage examples + +### Code Metrics Summary + +| Category | Files | Lines | Status | +| -------------- | ----- | ------ | ----------- | +| Source (`src/`)| 19 | 11,153 | Active | +| Tests | 10 | 5,254 | Active | +| Benchmarks | 3 | ~300 | Active | +| Documentation | 5+ | ~1,000 | In Progress | +| **Total** | ~37 | ~17,707| In Progress | + +### Implementation Status + +| Component | Status | Completion | +| -------------------- | ----------- | ---------- | +| Format Detection | Complete | 100% | +| Container Parsers | Complete | 100% | +| ASCII Extraction | Complete | 100% | +| UTF-16 Extraction | Complete | 100% | +| PE Resources | Complete | 100% | +| Deduplication | Complete | 100% | +| IP Classification | Complete | 100% | +| URL/Domain | Complete | 100% | +| Path Classification | Complete | 100% | +| Registry Paths | Complete | 100% | +| GUIDs/Email/Base64 | Planned | 0% | +| Ranking System | In Progress | 50% | +| Output Formatters | Planned | 0% | +| CLI Integration | In Progress | 20% | + +--- + +*Generated: 2026-01-17* +*Analysis performed on branch: `17-implement-file-path-classification-for-posix-windows-and-registry-paths`* From bb7de66c16d8ec630fba256983bd346790b8c8b6 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 18:01:49 -0500 Subject: [PATCH 09/19] chore: add CodeRabbit configuration file for project setup This configuration file includes general settings, review guidelines, path filters, and instructions for various modules to streamline code reviews and maintain project standards. Signed-off-by: UncleSp1d3r --- .coderabbit.yml | 583 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 583 insertions(+) create mode 100644 .coderabbit.yml diff --git a/.coderabbit.yml b/.coderabbit.yml new file mode 100644 index 0000000..f44fd03 --- /dev/null +++ b/.coderabbit.yml @@ -0,0 +1,583 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json +# CodeRabbit Configuration for Stringy +# Schema: https://coderabbit.ai/integrations/schema.v2.json +# Documentation: https://docs.coderabbit.ai/getting-started/yaml-configuration + +# ============================================================================= +# GENERAL SETTINGS +# ============================================================================= + +language: "en-US" + +tone_instructions: "Be direct and technical. Focus on correctness, performance, and idiomatic Rust patterns. Stringy is a binary analysis tool - prioritize security considerations and false positive reduction." + +early_access: true +enable_free_tier: true +inheritance: false + +# ============================================================================= +# REVIEWS CONFIGURATION +# ============================================================================= +reviews: + profile: "assertive" + request_changes_workflow: false + high_level_summary: true + high_level_summary_instructions: "Focus on architectural impact, API changes, binary format handling, and semantic classification accuracy." + high_level_summary_in_walkthrough: true + review_status: true + commit_status: true + fail_commit_status: false + collapse_walkthrough: true + changed_files_summary: true + sequence_diagrams: true + estimate_code_review_effort: true + assess_linked_issues: true + related_issues: true + related_prs: true + suggested_labels: true + auto_apply_labels: true + suggested_reviewers: true + auto_assign_reviewers: true + poem: true + abort_on_close: true + disable_cache: false + + # --------------------------------------------------------------------------- + # Path Filters + # --------------------------------------------------------------------------- + path_filters: + - "!**/Cargo.lock" + - "!**/target/**" + - "!**/.git/**" + - "!**/tests/fixtures/**" + - "!**/tests/snapshots/**" + + # --------------------------------------------------------------------------- + # Path Instructions - Module-specific review guidelines + # --------------------------------------------------------------------------- + path_instructions: + # ------------------------------------------------------------------------- + # Core Library + # ------------------------------------------------------------------------- + - path: "src/lib.rs" + instructions: | + This is the library entry point with module declarations and re-exports. Review for: + - #![forbid(unsafe_code)] and #![deny(warnings)] are present + - Public re-exports are ergonomic (import from stringy::types, not deeply nested) + - Module organization matches the data flow pipeline + - No unnecessary public exports + + - path: "src/main.rs" + instructions: | + This is the CLI entry point using clap derive macros. Review for: + - Correct use of clap attributes (#[arg], #[command]) + - Help text is clear and actionable + - Error handling uses Box or StringyError + - Pipeline integration: format detection -> parsing -> extraction -> classification -> output + + - path: "src/types.rs" + instructions: | + This file defines core data structures. Review for: + - All public structs use #[non_exhaustive] with explicit constructors + - Tag enum covers all semantic classification types + - StringyError uses thiserror with descriptive messages and context + - Encoding, BinaryFormat, SectionType, StringSource enums are complete + - FoundString and ContainerInfo have all necessary metadata fields + - All structs derive appropriate traits (Debug, Clone, Serialize, Deserialize) + + # ------------------------------------------------------------------------- + # Container Module - Binary Format Parsing + # ------------------------------------------------------------------------- + - path: "src/container/mod.rs" + instructions: | + This file defines the ContainerParser trait and format detection. Review for: + - ContainerParser trait has detect() and parse() methods + - detect_format() correctly identifies ELF, PE, Mach-O via magic bytes + - create_parser() returns appropriate parser for each format + - Error handling uses StringyError variants + + - path: "src/container/elf.rs" + instructions: | + This file implements ELF binary parsing via goblin. Review for: + - Section weight system (1.0-10.0) follows established patterns: + - .rodata: 10.0, .comment/.note: 9.0, .dynstr/.strtab: 8.0 + - .data.rel.ro: 7.0, .data: 5.0 + - Import/export extraction from symbol tables + - SectionType classification is accurate + - Error context includes section names and offsets + + - path: "src/container/pe.rs" + instructions: | + This file implements PE binary parsing via goblin. Review for: + - Section weight system follows established patterns: + - .rdata: 10.0, .rsrc: 9.0, .data (by permissions): 5.0-7.0, .text: 3.0 + - Import/export extraction from PE tables + - Resource section handling coordinates with pe_resources.rs + - Windows-specific characteristics are handled + + - path: "src/container/macho.rs" + instructions: | + This file implements Mach-O binary parsing via goblin. Review for: + - Segment/section weight system follows established patterns: + - __TEXT,__cstring: 10.0, __TEXT,__const: 9.0 + - __DATA_CONST: 7.0, __DATA,__data: 5.0 + - Load command processing + - Universal binary (fat) handling if applicable + - Import/export extraction from symbol tables + + # ------------------------------------------------------------------------- + # Extraction Module - String Extraction + # ------------------------------------------------------------------------- + - path: "src/extraction/mod.rs" + instructions: | + This file defines the StringExtractor trait and BasicExtractor. Review for: + - StringExtractor trait signature: extract(&self, data, info) -> Vec + - BasicExtractor combines ASCII, UTF-16, and filter configs + - Per-section extraction respects section weights + - Proper coordination between extraction components + + - path: "src/extraction/ascii.rs" + instructions: | + This file handles ASCII/UTF-8 string extraction. Review for: + - AsciiExtractionConfig has sensible defaults (min_length: 4, max_length: 1024) + - UTF-8 validation is correct + - Confidence scoring logic is sound + - Performance for large binaries + + - path: "src/extraction/utf16.rs" + instructions: | + This file handles UTF-16LE/BE string extraction. Review for: + - Both little-endian and big-endian support + - BOM detection and handling + - Confidence scoring distinguishes real UTF-16 from garbage + - Null-interleaved ASCII detection + - Performance considerations + + - path: "src/extraction/dedup.rs" + instructions: | + This file handles string deduplication with occurrence tracking. Review for: + - Grouping by (text, encoding) tuple + - All occurrences are preserved with metadata + - Tag merging uses HashSet union + - Score formula: base + occurrence_bonus + cross_section_bonus + multi_source_bonus + confidence_boost + - No data loss during deduplication + + - path: "src/extraction/pe_resources.rs" + instructions: | + This file extracts PE resources using pelite. Review for: + - VERSIONINFO parsing and string extraction + - STRINGTABLE resource handling + - MANIFEST XML parsing + - Resource metadata capture (type, language, size) + - Error handling for malformed resources + + - path: "src/extraction/filters.rs" + instructions: | + This file implements noise filtering to reduce false positives. Review for: + - Filter criteria are well-justified + - Entropy-based filtering uses appropriate thresholds + - No legitimate strings are filtered incorrectly + - FilterConfig is configurable + + - path: "src/extraction/config.rs" + instructions: | + This file defines extraction configuration structs. Review for: + - All config structs have sensible defaults + - Builder pattern if used + - Validation of config values + - Documentation of config options + + # ------------------------------------------------------------------------- + # Classification Module - Semantic Tagging + # ------------------------------------------------------------------------- + - path: "src/classification/mod.rs" + instructions: | + This file exports the classification module. Review for: + - SemanticClassifier is publicly exported + - Module organization is clean + + - path: "src/classification/semantic.rs" + instructions: | + This file implements semantic classification with regex patterns. Review for: + - Regex patterns use lazy_static! for caching + - URL pattern handles safe characters correctly + - Domain validation includes TLD checking + - IPv4/IPv6 uses regex pre-filter + std::net::IpAddr validation + - POSIX paths: absolute paths starting with / + - Windows paths: drive letter + backslash, no consecutive backslashes + - UNC paths: \\server\share format + - Registry paths: HKEY_* and HK* abbreviations + - False positive reduction logic is sound + - No catastrophic backtracking in regex patterns + - classify() method returns Vec + + # ------------------------------------------------------------------------- + # Output Module - Formatters + # ------------------------------------------------------------------------- + - path: "src/output/mod.rs" + instructions: | + This file defines output formatters. Review for: + - OutputFormatter trait if defined + - JSON output is valid and complete + - Human-readable output is clear + - YARA-friendly output follows YARA syntax + + # ------------------------------------------------------------------------- + # Tests + # ------------------------------------------------------------------------- + - path: "tests/**/*.rs" + instructions: | + Review test files for: + - Integration tests use fixtures from tests/fixtures/ + - Snapshot testing uses insta with assert_debug_snapshot! + - Test names follow test__ pattern + - Both success and error cases are covered + - Performance tests have reasonable timeouts + - No flaky tests (avoid timing-dependent assertions) + - Helper functions like make_found_string() are reused + + - path: "benches/**/*.rs" + instructions: | + Review benchmark files for: + - Uses Criterion framework correctly + - Benchmarks measure meaningful operations (parsing, extraction) + - Input sizes are realistic (actual binary fixtures) + - No I/O in hot paths being measured + - Results are reproducible + + # ------------------------------------------------------------------------- + # Configuration & Documentation + # ------------------------------------------------------------------------- + - path: "Cargo.toml" + instructions: | + Review Cargo.toml changes for: + - Version bumps follow semver + - Edition is 2024, MSRV is 1.85+ + - New dependencies are necessary and well-maintained + - Core deps: goblin, pelite, clap, regex, lazy_static, serde, thiserror + - Dev deps: criterion, insta, tempfile + - No unnecessary feature flags + + - path: "*.md" + instructions: | + Review documentation for: + - ASCII only - no emojis, em-dashes, or Unicode punctuation + - Accuracy with current implementation + - Clear examples that work + - Proper markdown formatting + - No broken links + + - path: "justfile" + instructions: | + Review justfile changes for: + - Cross-platform compatibility (Windows PowerShell / Unix bash) + - Commands follow existing patterns + - No breaking changes to common commands (check, test, lint, build) + + # --------------------------------------------------------------------------- + # Auto Review Settings + # --------------------------------------------------------------------------- + auto_review: + enabled: true + auto_incremental_review: true + ignore_title_keywords: + - "WIP" + - "DO NOT REVIEW" + - "[skip ci]" + - "[skip review]" + labels: [] + drafts: false + base_branches: [] + ignore_usernames: + - "dependabot[bot]" + - "renovate[bot]" + + # --------------------------------------------------------------------------- + # Finishing Touches + # --------------------------------------------------------------------------- + finishing_touches: + docstrings: + enabled: true + unit_tests: + enabled: true + + # --------------------------------------------------------------------------- + # Pre-merge Checks + # --------------------------------------------------------------------------- + pre_merge_checks: + docstrings: + mode: "warning" + threshold: 70 + + title: + mode: "warning" + requirements: | + PR titles must follow Conventional Commits format: + (): + + Types: feat, fix, docs, style, refactor, perf, test, build, ci, chore + Scopes: container, extraction, classification, output, types, cli, deps, release + + Examples: + - feat(classification): add GUID pattern detection + - fix(extraction): handle malformed PE resources + - docs(readme): update installation instructions + - refactor(container): extract common section weight logic + + description: + mode: "warning" + + issue_assessment: + mode: "warning" + + custom_checks: + - mode: "warning" + name: "No Unsafe Code" + instructions: | + Verify that no unsafe code is introduced: + 1. No `unsafe` blocks or functions + 2. No `#[allow(unsafe_code)]` attributes + 3. The crate uses `#![forbid(unsafe_code)]` + + - mode: "warning" + name: "Error Handling" + instructions: | + Verify that all error handling follows the project conventions: + 1. Use StringyError variants from types.rs, never raw strings + 2. Add context with descriptive messages including offsets and section names + 3. Map external errors (goblin, pelite, io) to appropriate StringyError variants + 4. Use thiserror #[error] and #[from] attributes correctly + + - mode: "warning" + name: "ASCII Only" + instructions: | + Verify that no Unicode punctuation is introduced: + 1. No emojis in code or documentation + 2. No em-dashes - use regular hyphens + 3. No smart quotes - use straight quotes + 4. No other non-ASCII characters in strings or comments + + - mode: "warning" + name: "File Size Limit" + instructions: | + Verify that files stay under 500 lines: + 1. New files should be under 500 lines + 2. If a file exceeds 500 lines, consider splitting into modules + 3. No blanket #[allow] attributes on modules or files + + - mode: "warning" + name: "Section Weight Consistency" + instructions: | + For changes to container parsers, verify section weights are consistent: + 1. Read-only string sections: 9.0-10.0 + 2. String tables: 8.0 + 3. Read-only data: 7.0 + 4. Writable data: 5.0 + 5. Code sections: 3.0 or lower + 6. Follow existing patterns in container/*.rs + + # --------------------------------------------------------------------------- + # Labeling Instructions + # --------------------------------------------------------------------------- + labeling_instructions: + - label: "bug" + instructions: "Apply when the PR fixes something that is not working correctly. Look for fixes to parsing errors, incorrect classification, false positives/negatives, or crashes." + - label: "enhancement" + instructions: "Apply when the PR adds new features or improves existing functionality. This includes new binary format support, new semantic classifiers, new output formats, or performance improvements." + - label: "documentation" + instructions: "Apply when the PR primarily updates documentation files (*.md), code comments, or inline documentation." + - label: "help wanted" + instructions: "Apply when the PR is incomplete or the author explicitly requests help with implementation." + - label: "question" + instructions: "Apply when the PR raises questions about implementation approach or needs discussion before proceeding." + - label: "duplicate" + instructions: "Apply when the PR duplicates work from another open or merged PR." + - label: "invalid" + instructions: "Apply when the PR does not follow project conventions, targets the wrong branch, or is not appropriate for the project." + + # --------------------------------------------------------------------------- + # Tools Configuration + # --------------------------------------------------------------------------- + tools: + # Rust-specific tools + clippy: + enabled: true + + # General tools + ast-grep: + essential_rules: true + rule_dirs: [] + util_dirs: [] + packages: [] + + shellcheck: + enabled: true + + markdownlint: + enabled: true + + github-checks: + enabled: true + timeout_ms: 90000 + + languagetool: + enabled: true + enabled_rules: [] + disabled_rules: [] + enabled_categories: [] + disabled_categories: [] + enabled_only: false + + gitleaks: + enabled: true + + checkov: + enabled: true + + semgrep: + enabled: true + + actionlint: + enabled: true + + yamllint: + enabled: true + + # Disable tools not relevant to this project + ruff: + enabled: false + eslint: + enabled: false + biome: + enabled: false + phpstan: + enabled: false + swiftlint: + enabled: false + rubocop: + enabled: false + detekt: + enabled: false + golangci-lint: + enabled: false + +# ============================================================================= +# CHAT CONFIGURATION +# ============================================================================= +chat: + art: true + auto_reply: true + integrations: + jira: + usage: "disabled" + linear: + usage: "disabled" + +# ============================================================================= +# KNOWLEDGE BASE CONFIGURATION +# ============================================================================= +knowledge_base: + opt_out: false + + web_search: + enabled: true + + code_guidelines: + enabled: true + filePatterns: + - "AGENTS.md" + - "CLAUDE.md" + - ".github/copilot-instructions.md" + - "README.md" + - "codebase_analysis.md" + + learnings: + scope: "local" + + issues: + scope: "local" + + pull_requests: + scope: "local" + + jira: + usage: "disabled" + project_keys: [] + + linear: + usage: "disabled" + team_keys: [] + + mcp: + usage: "disabled" + disabled_servers: [] + +# ============================================================================= +# CODE GENERATION CONFIGURATION +# ============================================================================= +code_generation: + docstrings: + language: "en-US" + path_instructions: + - path: "src/**/*.rs" + instructions: | + Generate Rust documentation comments (///) for public items. + Follow these conventions: + - Start with a brief one-line summary + - Add # Examples section with working code when helpful + - Add # Errors section documenting error conditions + - Add # Panics section if the function can panic + - Use backticks for code references + - Reference related items with [links] + - ASCII only - no emojis or Unicode punctuation + + unit_tests: + path_instructions: + - path: "src/**/*.rs" + instructions: | + Generate Rust unit tests following project conventions: + - Place tests in #[cfg(test)] mod tests { } blocks + - Use #[test] for all tests (project is synchronous) + - Use tempfile for filesystem fixtures + - Use insta for snapshot testing where appropriate + - Include both success and error cases + - Use assert_eq! with descriptive messages + - Name tests: test__ + - Binary fixtures go in tests/fixtures/ + +# ============================================================================= +# ISSUE ENRICHMENT CONFIGURATION +# ============================================================================= +issue_enrichment: + auto_enrich: + enabled: true + + planning: + enabled: true + auto_planning: + enabled: false + labels: + - "enhancement" + - "bug" + + labeling: + auto_apply_labels: true + labeling_instructions: + - label: "bug" + instructions: "Apply when the issue reports something that is not working correctly. Look for error messages, unexpected behavior, crashes, or incorrect classification results." + - label: "enhancement" + instructions: "Apply when the issue requests new features or improvements. This includes new binary format support, new semantic classifiers, new output formats, or performance improvements." + - label: "documentation" + instructions: "Apply when the issue is about missing, incorrect, or unclear documentation. This includes README updates, API documentation, examples, or inline code comments." + - label: "good first issue" + instructions: "Apply when the issue is well-scoped, has clear requirements, and does not require deep knowledge of the codebase. Good for newcomers to contribute." + - label: "help wanted" + instructions: "Apply when the issue needs community input, additional expertise, or the maintainers explicitly request assistance." + - label: "question" + instructions: "Apply when the issue is asking for clarification, guidance, or discussion rather than reporting a bug or requesting a feature." + - label: "duplicate" + instructions: "Apply when this issue duplicates an existing open or recently closed issue. Reference the original issue." + - label: "invalid" + instructions: "Apply when the issue does not provide enough information, is not related to this project, or cannot be reproduced." + - label: "wontfix" + instructions: "Apply when the issue describes behavior that is working as intended, is out of scope for the project, or conflicts with project goals." From d421b0530207259452da73e696d20b5116f9cb77 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 18:16:29 -0500 Subject: [PATCH 10/19] chore: improve formatting and readability in codebase analysis Signed-off-by: UncleSp1d3r --- codebase_analysis.md | 263 +++++++++++++++++++++++-------------------- 1 file changed, 143 insertions(+), 120 deletions(-) diff --git a/codebase_analysis.md b/codebase_analysis.md index 2ac1405..11b2bb0 100644 --- a/codebase_analysis.md +++ b/codebase_analysis.md @@ -2,32 +2,38 @@ ## 1. Project Overview -**Stringy** is a smarter alternative to the standard `strings` command that extracts meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. +**Stringy** is a smarter alternative to the standard `strings` command that +extracts meaningful strings from ELF, PE, and Mach-O binaries using +format-specific knowledge and semantic classification. ### Key Differentiators -- **Data-Structure Aware**: Extracts strings from actual binary data structures, not arbitrary byte runs -- **Section-Aware**: Prioritizes high-value sections (`.rodata`, `.rdata`, `__cstring`) with weight-based scoring +- **Data-Structure Aware**: Extracts strings from actual binary data structures, + not arbitrary byte runs +- **Section-Aware**: Prioritizes high-value sections (`.rodata`, `.rdata`, + `__cstring`) with weight-based scoring - **Encoding-Aware**: Supports ASCII, UTF-8, UTF-16LE/BE with confidence scoring -- **Semantically Tagged**: Identifies URLs, domains, IPs, file paths, registry keys, GUIDs, and more -- **Ranked Output**: Presents most relevant strings first using a scoring algorithm +- **Semantically Tagged**: Identifies URLs, domains, IPs, file paths, registry + keys, GUIDs, and more +- **Ranked Output**: Presents most relevant strings first using a scoring + algorithm ### Project Metadata -| Attribute | Value | -| ------------- | ----------------------------------------------- | -| Language | Rust 2024 Edition | -| MSRV | 1.85+ | -| License | Apache-2.0 | -| Repository | | -| Version | 0.1.0 (in development) | -| Total LoC | ~11,153 (src) + ~5,254 (tests) = ~16,407 lines | +| Attribute | Value | +| ---------- | ---------------------------------------------- | +| Language | Rust 2024 Edition | +| MSRV | 1.85+ | +| License | Apache-2.0 | +| Repository | | +| Version | 0.1.0 (in development) | +| Total LoC | ~11,153 (src) + ~5,254 (tests) = ~16,407 lines | --- ## 2. Directory Structure Analysis -``` +```text D:\Stringy\ |-- .github/ | |-- copilot-instructions.md # AI agent guidelines @@ -141,8 +147,8 @@ fn main() -> Result<(), Box> { Core data structures with comprehensive type definitions: -| Type | Purpose | -| --------------- | ---------------------------------------------------- | +| Type | Purpose | +| --------------- | --------------------------------------------------- | | `Tag` | Semantic classification tags (Url, Domain, IPv4...) | | `Encoding` | String encoding (Ascii, Utf8, Utf16Le, Utf16Be) | | `BinaryFormat` | Binary format (Elf, Pe, MachO, Unknown) | @@ -173,35 +179,35 @@ pub fn create_parser(format: BinaryFormat) -> Result> { ELF binary parser with section weight system: -| Section Pattern | Weight | Description | -| -------------------- | ------ | --------------------- | -| `.rodata` | 10.0 | Read-only data | -| `.comment`, `.note` | 9.0 | Build info | -| `.data.rel.ro` | 7.0 | Relocated read-only | -| `.data` | 5.0 | Writable data | -| `.dynstr`, `.strtab` | 8.0 | String tables | +| Section Pattern | Weight | Description | +| -------------------- | ------ | ------------------- | +| `.rodata` | 10.0 | Read-only data | +| `.comment`, `.note` | 9.0 | Build info | +| `.data.rel.ro` | 7.0 | Relocated read-only | +| `.data` | 5.0 | Writable data | +| `.dynstr`, `.strtab` | 8.0 | String tables | #### `src/container/pe.rs` (661 lines) PE binary parser with Windows-specific handling: -| Section Pattern | Weight | Description | -| --------------- | ------ | ------------------ | -| `.rdata` | 10.0 | Read-only data | -| `.rsrc` | 9.0 | Resources | -| `.text` | 3.0 | Code section | -| `.data` | 5.0-7.0| Data (by perms) | +| Section Pattern | Weight | Description | +| --------------- | ------- | --------------- | +| `.rdata` | 10.0 | Read-only data | +| `.rsrc` | 9.0 | Resources | +| `.text` | 3.0 | Code section | +| `.data` | 5.0-7.0 | Data (by perms) | #### `src/container/macho.rs` (574 lines) Mach-O parser for macOS/iOS binaries: -| Segment/Section | Weight | Description | -| ------------------------ | ------ | ----------------- | -| `__TEXT,__cstring` | 10.0 | C strings | -| `__TEXT,__const` | 9.0 | Constants | -| `__DATA_CONST` | 7.0 | Const data | -| `__DATA,__data` | 5.0 | Writable data | +| Segment/Section | Weight | Description | +| ------------------ | ------ | ------------- | +| `__TEXT,__cstring` | 10.0 | C strings | +| `__TEXT,__const` | 9.0 | Constants | +| `__DATA_CONST` | 7.0 | Const data | +| `__DATA,__data` | 5.0 | Writable data | ### Extraction Module (`src/extraction/`) @@ -233,7 +239,7 @@ UTF-16LE/BE extraction with confidence scoring and BOM detection. Deduplication with occurrence tracking and score aggregation: -``` +```text Score = max(base_scores) + 5*(count-1) + 10*(cross_section) + 15*(multi_source) + confidence_boost ``` @@ -251,21 +257,22 @@ Noise filtering to reduce false positives. Semantic classifier with pattern matching: -| Pattern Type | Implementation | -| -------------- | ------------------------------------------------- | -| URLs | Regex with safe character filtering | -| Domains | TLD validation, DNS format compliance | -| IPv4/IPv6 | Regex pre-filter + `std::net::IpAddr` validation | -| POSIX Paths | `/path` format with validation rules | -| Windows Paths | `C:\path` format with drive letter validation | -| UNC Paths | `\\server\share` format | -| Registry Paths | HKEY_*/HK* prefix detection | +| Pattern Type | Implementation | +| -------------- | ------------------------------------------------ | +| URLs | Regex with safe character filtering | +| Domains | TLD validation, DNS format compliance | +| IPv4/IPv6 | Regex pre-filter + `std::net::IpAddr` validation | +| POSIX Paths | `/path` format with validation rules | +| Windows Paths | `C:\path` format with drive letter validation | +| UNC Paths | `\\server\share` format | +| Registry Paths | HKEY__/HK_ prefix detection | --- ## 4. API Endpoints Analysis -**N/A** - Stringy is a command-line tool, not a web service. The public API is exposed as a Rust library: +**N/A** - Stringy is a command-line tool, not a web service. The public API is +exposed as a Rust library: ```rust // Library usage @@ -291,7 +298,7 @@ for s in &strings { ### Data Flow Pipeline -``` +```text Binary File | v @@ -344,15 +351,17 @@ Binary File ### Design Patterns -1. **Trait-Based Polymorphism**: `ContainerParser` and `StringExtractor` traits enable format extensibility +1. **Trait-Based Polymorphism**: `ContainerParser` and `StringExtractor` traits + enable format extensibility 2. **Builder Pattern**: Extraction configs use builder-style construction -3. **Non-Exhaustive Enums/Structs**: Public API stability via `#[non_exhaustive]` +3. **Non-Exhaustive Enums/Structs**: Public API stability via + `#[non_exhaustive]` 4. **Lazy Static Regex**: Compiled once via `lazy_static!` for performance 5. **Error Propagation**: `thiserror` for structured error handling with context ### Module Dependencies -``` +```text types.rs (core data structures) ^ | @@ -405,12 +414,12 @@ just check # Pre-commit checks ### CI Pipeline (`.github/workflows/ci.yml`) -| Job | Description | -| ------------ | ---------------------------------------------- | -| `check` | Format check, clippy, build | -| `test` | Run tests on ubuntu-latest, windows-latest | -| `coverage` | Generate LCOV coverage report | -| `docs` | Build mdBook documentation | +| Job | Description | +| ---------- | ------------------------------------------ | +| `check` | Format check, clippy, build | +| `test` | Run tests on ubuntu-latest, windows-latest | +| `coverage` | Generate LCOV coverage report | +| `docs` | Build mdBook documentation | ### Environment Variables @@ -425,42 +434,42 @@ None required for basic operation. CI uses: ### Core Dependencies -| Crate | Version | Purpose | -| ------------ | ------- | ------------------------------------ | -| `goblin` | 0.10.4 | ELF/PE/Mach-O parsing | -| `pelite` | 0.10.0 | PE resource extraction | -| `clap` | 4.5.54 | CLI argument parsing (derive macros) | -| `regex` | 1.12.2 | Pattern matching for classification | -| `lazy_static`| 1.5 | Compile-time regex caching | -| `serde` | 1.0.228 | Serialization (JSON output) | -| `serde_json` | 1.0.148 | JSON formatting | -| `thiserror` | 2.0.17 | Error handling with derives | -| `entropy` | 0.4.2 | Entropy calculation for filtering | +| Crate | Version | Purpose | +| ------------- | ------- | ------------------------------------ | +| `goblin` | 0.10.4 | ELF/PE/Mach-O parsing | +| `pelite` | 0.10.0 | PE resource extraction | +| `clap` | 4.5.54 | CLI argument parsing (derive macros) | +| `regex` | 1.12.2 | Pattern matching for classification | +| `lazy_static` | 1.5 | Compile-time regex caching | +| `serde` | 1.0.228 | Serialization (JSON output) | +| `serde_json` | 1.0.148 | JSON formatting | +| `thiserror` | 2.0.17 | Error handling with derives | +| `entropy` | 0.4.2 | Entropy calculation for filtering | ### Development Dependencies -| Crate | Version | Purpose | -| ---------- | ------- | ------------------------- | -| `criterion`| 0.8.1 | Benchmarking framework | -| `insta` | 1.46.0 | Snapshot testing | -| `tempfile` | 3.24.0 | Temporary file handling | +| Crate | Version | Purpose | +| ----------- | ------- | ----------------------- | +| `criterion` | 0.8.1 | Benchmarking framework | +| `insta` | 1.46.0 | Snapshot testing | +| `tempfile` | 3.24.0 | Temporary file handling | ### Build Tools -| Tool | Purpose | -| ---------- | -------------------------------------- | -| `just` | Cross-platform task runner | -| `nextest` | Fast test runner | -| `mdformat` | Markdown formatting | -| `mdbook` | Documentation generation | -| `cspell` | Spell checking | -| `actionlint`| GitHub Actions linting | +| Tool | Purpose | +| ------------ | -------------------------- | +| `just` | Cross-platform task runner | +| `nextest` | Fast test runner | +| `mdformat` | Markdown formatting | +| `mdbook` | Documentation generation | +| `cspell` | Spell checking | +| `actionlint` | GitHub Actions linting | --- ## 8. Visual Architecture Diagram -``` +```text +===========================================================================+ | STRINGY ARCHITECTURE | +===========================================================================+ @@ -548,58 +557,72 @@ None required for basic operation. CI uses: ### Strengths -1. **Solid Foundation**: Well-structured module organization with clear separation of concerns -2. **Type Safety**: Comprehensive error handling with `thiserror` and extensive use of Rust's type system -3. **Extensibility**: Trait-based design (`ContainerParser`, `StringExtractor`) enables easy format additions -4. **Performance Focus**: Regex caching via `lazy_static!`, section weight prioritization -5. **Testing Coverage**: Snapshot tests with `insta`, benchmarks with `criterion`, integration tests for all formats -6. **Code Quality**: `#![forbid(unsafe_code)]`, `#![deny(warnings)]`, comprehensive linting +1. **Solid Foundation**: Well-structured module organization with clear + separation of concerns +2. **Type Safety**: Comprehensive error handling with `thiserror` and extensive + use of Rust's type system +3. **Extensibility**: Trait-based design (`ContainerParser`, `StringExtractor`) + enables easy format additions +4. **Performance Focus**: Regex caching via `lazy_static!`, section weight + prioritization +5. **Testing Coverage**: Snapshot tests with `insta`, benchmarks with + `criterion`, integration tests for all formats +6. **Code Quality**: `#![forbid(unsafe_code)]`, `#![deny(warnings)]`, + comprehensive linting ### Areas for Completion -1. **CLI Implementation**: `main.rs` is a placeholder - full pipeline integration needed -2. **Output Formatters**: `output/mod.rs` is empty - JSON, human-readable, YARA outputs pending -3. **Additional Classifiers**: GUIDs, email addresses, Base64, format strings documented but not implemented -4. **Ranking System**: Score-based prioritization framework exists but needs completion +1. **CLI Implementation**: `main.rs` is a placeholder - full pipeline + integration needed +2. **Output Formatters**: `output/mod.rs` is empty - JSON, human-readable, YARA + outputs pending +3. **Additional Classifiers**: GUIDs, email addresses, Base64, format strings + documented but not implemented +4. **Ranking System**: Score-based prioritization framework exists but needs + completion ### Recommendations -1. **Complete CLI Pipeline**: Wire up container parsing -> extraction -> classification -> output -2. **Implement Output Formatters**: Start with JSON (most requested for pipelines) -3. **Add Missing Classifiers**: GUID and email detection are straightforward additions -4. **Performance Benchmarks**: Expand benchmarks to cover full pipeline, not just parsing +1. **Complete CLI Pipeline**: Wire up container parsing -> extraction -> + classification -> output +2. **Implement Output Formatters**: Start with JSON (most requested for + pipelines) +3. **Add Missing Classifiers**: GUID and email detection are straightforward + additions +4. **Performance Benchmarks**: Expand benchmarks to cover full pipeline, not + just parsing 5. **Documentation**: Complete mdBook documentation with usage examples ### Code Metrics Summary -| Category | Files | Lines | Status | -| -------------- | ----- | ------ | ----------- | -| Source (`src/`)| 19 | 11,153 | Active | -| Tests | 10 | 5,254 | Active | -| Benchmarks | 3 | ~300 | Active | -| Documentation | 5+ | ~1,000 | In Progress | -| **Total** | ~37 | ~17,707| In Progress | +| Category | Files | Lines | Status | +| --------------- | ----- | ------- | ----------- | +| Source (`src/`) | 19 | 11,153 | Active | +| Tests | 10 | 5,254 | Active | +| Benchmarks | 3 | ~300 | Active | +| Documentation | 5+ | ~1,000 | In Progress | +| **Total** | ~37 | ~17,707 | In Progress | ### Implementation Status -| Component | Status | Completion | -| -------------------- | ----------- | ---------- | -| Format Detection | Complete | 100% | -| Container Parsers | Complete | 100% | -| ASCII Extraction | Complete | 100% | -| UTF-16 Extraction | Complete | 100% | -| PE Resources | Complete | 100% | -| Deduplication | Complete | 100% | -| IP Classification | Complete | 100% | -| URL/Domain | Complete | 100% | -| Path Classification | Complete | 100% | -| Registry Paths | Complete | 100% | -| GUIDs/Email/Base64 | Planned | 0% | -| Ranking System | In Progress | 50% | -| Output Formatters | Planned | 0% | -| CLI Integration | In Progress | 20% | +| Component | Status | Completion | +| ------------------- | ----------- | ---------- | +| Format Detection | Complete | 100% | +| Container Parsers | Complete | 100% | +| ASCII Extraction | Complete | 100% | +| UTF-16 Extraction | Complete | 100% | +| PE Resources | Complete | 100% | +| Deduplication | Complete | 100% | +| IP Classification | Complete | 100% | +| URL/Domain | Complete | 100% | +| Path Classification | Complete | 100% | +| Registry Paths | Complete | 100% | +| GUIDs/Email/Base64 | Planned | 0% | +| Ranking System | In Progress | 50% | +| Output Formatters | Planned | 0% | +| CLI Integration | In Progress | 20% | --- -*Generated: 2026-01-17* -*Analysis performed on branch: `17-implement-file-path-classification-for-posix-windows-and-registry-paths`* +_Generated: 2026-01-17_ _Analysis performed on branch: +`17-implement-file-path-classification-for-posix-windows-and-registry-paths`_ From d0993134109cf1e270ea64f3022dd2e145f6b75c Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 18:28:59 -0500 Subject: [PATCH 11/19] chore: update formatting in copilot instructions Signed-off-by: UncleSp1d3r --- .github/copilot-instructions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index beab23e..ae6aad9 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -6,7 +6,7 @@ Stringy is a **smarter strings tool** for extracting meaningful strings from ELF ## Architecture & Data Flow -``` +```text Binary → Format Detection (goblin) → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output ``` @@ -25,7 +25,7 @@ Binary → Format Detection (goblin) → Container Parsing → String Extraction - **No `unsafe` code**: `#![forbid(unsafe_code)]` enforced at package level - **Zero warnings**: `cargo clippy -- -D warnings` must pass (`#![deny(warnings)]` enforced) - **Rust 2024 Edition**: MSRV 1.85+, always use latest edition features -- **File size limit**: Keep files ≤500-600 lines; split larger files into focused modules +- **File size limit**: Keep files \<=500-600 lines; split larger files into focused modules - **No blanket `#[allow]`**: Any `allow` attribute requires inline justification and cannot apply to entire files/modules - **Character restrictions**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) @@ -69,7 +69,7 @@ Strings are grouped by `(text, encoding)` tuple in a `HashMap<(String, Encoding) - **Preserve all occurrences**: Each occurrence captures offset, RVA, section, source, tags, score, confidence - **Tag merging**: Union all tags via `HashSet`, then sort - **Combined scoring formula**: - ``` + ```text base_score = max(occurrence.original_score) occurrence_bonus = 5 * (count - 1) cross_section_bonus = 10 (if >1 unique section) From 1dd48a14a09fbae948dfa4d64dccf4913da1943b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 19:05:10 -0500 Subject: [PATCH 12/19] chore: update Cargo.toml and codebase_analysis.md formatting Signed-off-by: UncleSp1d3r --- Cargo.toml | 1 + codebase_analysis.md | 88 +++++++++++++--------------------- src/classification/semantic.rs | 2 +- 3 files changed, 36 insertions(+), 55 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 136e125..c1a7703 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ name = "stringy" version = "0.1.0" edition = "2024" +rust-version = "1.85" authors = ["UncleSp1d3r "] description = "A smarter alternative to the strings command that leverages format-specific knowledge" license = "Apache-2.0" diff --git a/codebase_analysis.md b/codebase_analysis.md index 11b2bb0..382d293 100644 --- a/codebase_analysis.md +++ b/codebase_analysis.md @@ -2,21 +2,15 @@ ## 1. Project Overview -**Stringy** is a smarter alternative to the standard `strings` command that -extracts meaningful strings from ELF, PE, and Mach-O binaries using -format-specific knowledge and semantic classification. +**Stringy** is a smarter alternative to the standard `strings` command that extracts meaningful strings from ELF, PE, and Mach-O binaries using format-specific knowledge and semantic classification. ### Key Differentiators -- **Data-Structure Aware**: Extracts strings from actual binary data structures, - not arbitrary byte runs -- **Section-Aware**: Prioritizes high-value sections (`.rodata`, `.rdata`, - `__cstring`) with weight-based scoring +- **Data-Structure Aware**: Extracts strings from actual binary data structures, not arbitrary byte runs +- **Section-Aware**: Prioritizes high-value sections (`.rodata`, `.rdata`, `__cstring`) with weight-based scoring - **Encoding-Aware**: Supports ASCII, UTF-8, UTF-16LE/BE with confidence scoring -- **Semantically Tagged**: Identifies URLs, domains, IPs, file paths, registry - keys, GUIDs, and more -- **Ranked Output**: Presents most relevant strings first using a scoring - algorithm +- **Semantically Tagged**: Identifies URLs, domains, IPs, file paths, registry keys, GUIDs, and more +- **Ranked Output**: Presents most relevant strings first using a scoring algorithm ### Project Metadata @@ -119,9 +113,9 @@ pub mod types; // Re-exports for ergonomic imports pub use classification::SemanticClassifier; -pub use container::{create_parser, detect_format, ContainerParser}; -pub use extraction::{BasicExtractor, StringExtractor, /* ... */}; -pub use types::{BinaryFormat, ContainerInfo, Encoding, FoundString, /* ... */}; +pub use container::{ContainerParser, create_parser, detect_format}; +pub use extraction::{BasicExtractor, StringExtractor /* ... */}; +pub use types::{BinaryFormat, ContainerInfo, Encoding, FoundString /* ... */}; ``` #### `src/main.rs` (23 lines) @@ -167,12 +161,16 @@ Defines the `ContainerParser` trait and format detection. ```rust pub trait ContainerParser { - fn detect(data: &[u8]) -> bool where Self: Sized; + fn detect(data: &[u8]) -> bool + where + Self: Sized; fn parse(&self, data: &[u8]) -> Result; } -pub fn detect_format(data: &[u8]) -> BinaryFormat { /* ... */ } -pub fn create_parser(format: BinaryFormat) -> Result> { /* ... */ } +pub fn detect_format(data: &[u8]) -> BinaryFormat { /* ... */ +} +pub fn create_parser(format: BinaryFormat) -> Result> { /* ... */ +} ``` #### `src/container/elf.rs` (627 lines) @@ -265,14 +263,13 @@ Semantic classifier with pattern matching: | POSIX Paths | `/path` format with validation rules | | Windows Paths | `C:\path` format with drive letter validation | | UNC Paths | `\\server\share` format | -| Registry Paths | HKEY__/HK_ prefix detection | +| Registry Paths | HKEY\_\_/HK\_ prefix detection | --- ## 4. API Endpoints Analysis -**N/A** - Stringy is a command-line tool, not a web service. The public API is -exposed as a Rust library: +**N/A** - Stringy is a command-line tool, not a web service. The public API is exposed as a Rust library: ```rust // Library usage @@ -351,11 +348,9 @@ Binary File ### Design Patterns -1. **Trait-Based Polymorphism**: `ContainerParser` and `StringExtractor` traits - enable format extensibility +1. **Trait-Based Polymorphism**: `ContainerParser` and `StringExtractor` traits enable format extensibility 2. **Builder Pattern**: Extraction configs use builder-style construction -3. **Non-Exhaustive Enums/Structs**: Public API stability via - `#[non_exhaustive]` +3. **Non-Exhaustive Enums/Structs**: Public API stability via `#[non_exhaustive]` 4. **Lazy Static Regex**: Compiled once via `lazy_static!` for performance 5. **Error Propagation**: `thiserror` for structured error handling with context @@ -557,40 +552,26 @@ None required for basic operation. CI uses: ### Strengths -1. **Solid Foundation**: Well-structured module organization with clear - separation of concerns -2. **Type Safety**: Comprehensive error handling with `thiserror` and extensive - use of Rust's type system -3. **Extensibility**: Trait-based design (`ContainerParser`, `StringExtractor`) - enables easy format additions -4. **Performance Focus**: Regex caching via `lazy_static!`, section weight - prioritization -5. **Testing Coverage**: Snapshot tests with `insta`, benchmarks with - `criterion`, integration tests for all formats -6. **Code Quality**: `#![forbid(unsafe_code)]`, `#![deny(warnings)]`, - comprehensive linting +1. **Solid Foundation**: Well-structured module organization with clear separation of concerns +2. **Type Safety**: Comprehensive error handling with `thiserror` and extensive use of Rust's type system +3. **Extensibility**: Trait-based design (`ContainerParser`, `StringExtractor`) enables easy format additions +4. **Performance Focus**: Regex caching via `lazy_static!`, section weight prioritization +5. **Testing Coverage**: Snapshot tests with `insta`, benchmarks with `criterion`, integration tests for all formats +6. **Code Quality**: `#![forbid(unsafe_code)]`, `#![deny(warnings)]`, comprehensive linting ### Areas for Completion -1. **CLI Implementation**: `main.rs` is a placeholder - full pipeline - integration needed -2. **Output Formatters**: `output/mod.rs` is empty - JSON, human-readable, YARA - outputs pending -3. **Additional Classifiers**: GUIDs, email addresses, Base64, format strings - documented but not implemented -4. **Ranking System**: Score-based prioritization framework exists but needs - completion +1. **CLI Implementation**: `main.rs` is a placeholder - full pipeline integration needed +2. **Output Formatters**: `output/mod.rs` is empty - JSON, human-readable, YARA outputs pending +3. **Additional Classifiers**: GUIDs, email addresses, Base64, format strings documented but not implemented +4. **Ranking System**: Score-based prioritization framework exists but needs completion ### Recommendations -1. **Complete CLI Pipeline**: Wire up container parsing -> extraction -> - classification -> output -2. **Implement Output Formatters**: Start with JSON (most requested for - pipelines) -3. **Add Missing Classifiers**: GUID and email detection are straightforward - additions -4. **Performance Benchmarks**: Expand benchmarks to cover full pipeline, not - just parsing +1. **Complete CLI Pipeline**: Wire up container parsing -> extraction -> classification -> output +2. **Implement Output Formatters**: Start with JSON (most requested for pipelines) +3. **Add Missing Classifiers**: GUID and email detection are straightforward additions +4. **Performance Benchmarks**: Expand benchmarks to cover full pipeline, not just parsing 5. **Documentation**: Complete mdBook documentation with usage examples ### Code Metrics Summary @@ -624,5 +605,4 @@ None required for basic operation. CI uses: --- -_Generated: 2026-01-17_ _Analysis performed on branch: -`17-implement-file-path-classification-for-posix-windows-and-registry-paths`_ +_Generated: 2026-01-17_ _Analysis performed on branch: `17-implement-file-path-classification-for-posix-windows-and-registry-paths`_ diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 812b87a..23ab9eb 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -855,7 +855,7 @@ impl SemanticClassifier { if root_upper.starts_with("HKEY_") { return VALID_REGISTRY_ROOTS .iter() - .any(|valid| valid.eq_ignore_ascii_case(&root_upper)); + .any(|valid| *valid == root_upper); } if root_upper.starts_with("HK") { From d351b8275a20f3f6ccf9c3503a18526c2f7c106c Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 19:12:47 -0500 Subject: [PATCH 13/19] chore: refresh task list to reflect current implementation state --- .kiro/specs/stringy-binary-analyzer/tasks.md | 116 ++++++++----------- 1 file changed, 51 insertions(+), 65 deletions(-) diff --git a/.kiro/specs/stringy-binary-analyzer/tasks.md b/.kiro/specs/stringy-binary-analyzer/tasks.md index da38478..2d501e5 100644 --- a/.kiro/specs/stringy-binary-analyzer/tasks.md +++ b/.kiro/specs/stringy-binary-analyzer/tasks.md @@ -35,48 +35,35 @@ - [x] 4. Implement PE section classification - - Enhance PE parser to classify sections (.rdata, .data) by string likelihood ✅ + - Enhance PE parser to classify sections (.rdata, .data) by string likelihood - - Add section weight assignment for PE-specific sections ✅ + - Add section weight assignment for PE-specific sections - - Implement basic PE import/export table parsing ✅ + - Implement basic PE import/export table parsing - - Add benchmarks and snapshot tests ✅ + - Add benchmarks and snapshot tests - _Requirements: 1.2, 1.4_ - - _Completed: Issue #3_ - - [x] 4.1 Add PE resource extraction foundation - - Add pelite dependency to Cargo.toml ✅ - - Implement basic PE resource enumeration ✅ - - Create framework for extracting VERSIONINFO and STRINGTABLE resources ✅ - - Add comprehensive unit tests covering edge cases ✅ + - Add pelite dependency to Cargo.toml + - Implement basic PE resource enumeration + - Create framework for extracting VERSIONINFO and STRINGTABLE resources + - Add comprehensive unit tests covering edge cases - _Requirements: 1.2_ - - _Completed: Issue #4 - Phase 1 Foundation_ - [x] 4.2 Implement PE resource string extraction - - Extract strings from VERSIONINFO resources ✅ - - Extract strings from STRINGTABLE resources ✅ - - Add manifest resource string extraction ✅ - - Implement UTF-16LE decoding utilities ✅ - - Add comprehensive unit tests ✅ - - Add integration tests with fixtures ✅ + - Extract strings from VERSIONINFO resources + - Extract strings from STRINGTABLE resources + - Add manifest resource string extraction + - Implement UTF-16LE decoding utilities + - Add comprehensive unit tests + - Add integration tests with fixtures - _Requirements: 1.2_ - - _Completed: Issue #5 - Phase 2 String Extraction_ - - **Implementation Notes:** - - - VERSIONINFO: Uses pelite's `version_info()` API to extract all StringFileInfo key-value pairs - - STRINGTABLE: Manual parsing of RT_STRING blocks (16 strings per block, UTF-16LE) - - MANIFEST: Encoding detection (UTF-8/UTF-16LE/UTF-16BE) and XML extraction - - All strings tagged appropriately (`Tag::Version`, `Tag::Manifest`, `Tag::Resource`) - - Graceful error handling throughout (returns empty Vec on errors) - - Test coverage includes both unit tests and integration tests with real fixtures -- [ ] 5. Implement Mach-O section classification +- [x] 5. Implement Mach-O section classification - Enhance Mach-O parser to identify string-containing sections @@ -86,63 +73,68 @@ - _Requirements: 1.3, 1.4_ - - [ ] 5.1 Add Mach-O load command processing + - [x] 5.1 Add Mach-O load command processing - - Add object crate dependency for enhanced Mach-O support - - Extract strings from load commands + - Extract strings from load commands (LC_LOAD_DYLIB, LC_RPATH) - Implement load command string classification and tagging + - Add comprehensive unit tests and integration tests - _Requirements: 1.3_ -- [ ] 6. Create string extraction framework +- [x] 6. Create string extraction framework - Create StringExtractor trait in src/extraction/mod.rs - - Define RawString struct for extracted string data with metadata + - Define ExtractionConfig struct for configurable parameters - - Create ExtractionConfig struct for configurable parameters + - Implement BasicExtractor with section-aware extraction - _Requirements: 2.1_ - - [ ] 6.1 Implement basic ASCII string extraction + - [x] 6.1 Implement basic ASCII string extraction - Create src/extraction/ascii.rs with ASCII extraction logic - Implement scanning for printable character runs (0x20-0x7E) - Add configurable minimum length filtering - - Add unit tests for basic ASCII extraction + - Add comprehensive unit tests for basic ASCII extraction + - Implement section-aware extraction with metadata population - _Requirements: 2.1_ - - [ ] 6.2 Add ASCII noise filtering + - [x] 6.2 Add ASCII noise filtering - - Implement heuristics to distinguish legitimate strings from binary noise - - Add logic to avoid extracting from obvious padding or table data - - Consider section context when determining string legitimacy + - Implement CompositeNoiseFilter with multiple heuristics + - Add entropy-based filtering for random data detection + - Add pattern-based filtering for padding and table data + - Integrate noise filtering into extraction pipeline with confidence scoring - _Requirements: 1.4, 2.1_ -- [ ] 7. Implement UTF-16LE string extraction +- [x] 7. Implement UTF-16 string extraction - - Create src/extraction/utf16.rs with UTF-16LE extraction logic + - Create src/extraction/utf16.rs with UTF-16 extraction logic - - Implement detection of even-length sequences with mostly-zero high bytes + - Implement detection of UTF-16LE and UTF-16BE sequences - Add configurable minimum length for wide character strings - - Add unit tests for UTF-16LE extraction + - Add comprehensive unit tests for UTF-16 extraction - _Requirements: 2.2_ - - [ ] 7.1 Add UTF-16BE support and confidence scoring + - [x] 7.1 Add UTF-16BE support and confidence scoring - Extend UTF-16 extractor to handle big-endian byte order - Implement confidence scoring to avoid false positives - Add detection of null-interleaved text patterns + - Add surrogate pair handling for proper Unicode support - _Requirements: 2.3, 2.4_ -- [ ] 8. Implement string deduplication +- [x] 8. Implement string deduplication - Create src/extraction/dedup.rs with deduplication logic - Implement string canonicalization while preserving metadata - Handle multiple instances of same string in different sections - - Add unit tests for deduplication with metadata preservation + - Add comprehensive unit tests for deduplication with metadata preservation + - Implement CanonicalString and StringOccurrence types + - Add occurrence-based scoring bonuses - _Requirements: 2.5_ - [ ] 9. Create semantic classification framework @@ -157,17 +149,16 @@ - [ ] 9.1 Implement URL and domain classification - - Add regex dependency to Cargo.toml - Create src/classification/semantic.rs with URL pattern matching - - Implement domain name detection and validation - - Add unit tests for URL and domain classification + - Implement domain name detection and validation with TLD checking + - Add comprehensive unit tests for URL and domain classification - _Requirements: 3.1, 3.2_ - [ ] 9.2 Implement IP address classification - Add IPv4 address pattern matching to semantic classifier - - Add IPv6 address pattern matching - - Include unit tests for IP address detection + - Add IPv6 address pattern matching with bracketed notation support + - Include comprehensive unit tests for IP address detection - _Requirements: 3.3_ - [x] 9.3 Implement file path classification @@ -175,14 +166,9 @@ - Add POSIX file path pattern matching - Add Windows file path pattern matching - Include registry path detection - - Add unit tests for path classification - - Completed: - - POSIX file path pattern matching implemented - - Windows file path pattern matching implemented - - UNC path detection implemented - - Registry path detection implemented - - Comprehensive unit tests added - - Integration tests added + - Add UNC path detection + - Add comprehensive unit tests for path classification + - Add integration tests - _Requirements: 3.4, 3.5_ - [ ] 9.4 Implement remaining semantic patterns @@ -325,13 +311,13 @@ - Add performance benchmarks for regex caching - _Requirements: 8.3_ -- [ ] 15. Create basic test infrastructure +- [ ] 15. Create comprehensive test infrastructure - - Create tests/fixtures/ directory with sample binary files + - Expand tests/fixtures/ directory with additional sample binary files - - Add basic integration test framework + - Add comprehensive integration test framework - - Create simple ELF, PE, and Mach-O test binaries + - Create diverse ELF, PE, and Mach-O test binaries - _Requirements: All requirements validation_ @@ -339,7 +325,7 @@ - Add criterion dependency for performance benchmarks - Implement end-to-end CLI functionality tests - - Add insta dependency for snapshot testing + - Expand insta snapshot testing coverage - Create cross-platform validation tests - _Requirements: All requirements validation_ From 034cbe3244f6f375d18bba8816be21de8d3c75ea Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:06:45 -0500 Subject: [PATCH 14/19] chore: add documentation for core flows and technical plan - Introduced detailed documentation for core user interactions in Stringy v1.0. - Added technical plan outlining architecture, design decisions, and integration strategies. - Included performance optimization ticket and end-to-end testing strategy. Signed-off-by: UncleSp1d3r --- ...e_Flows__Stringy_v1.0_User_Interactions.md | 425 +++++++++++++++ .../Epic_Brief__Stringy_v1.0_Completion.md | 72 +++ ...nical_Plan__Stringy_v1.0_Implementation.md | 510 ++++++++++++++++++ ...ce_optimizations_and_dependency_updates.md | 47 ++ ...on_with_filtering_and_progress_feedback.md | 60 +++ ...with_new_patterns_and_symbol_demangling.md | 52 ++ ...-end_integration_testing_and_validation.md | 51 ++ ..._model_for_demangling_and_debug_support.md | 37 ++ ...ent_CLI_argument_parsing_and_validation.md | 53 ++ ...d_output_formatters_(Table,_JSON,_YARA).md | 58 ++ ...anking_system_with_configurable_scoring.md | 50 ++ 11 files changed, 1415 insertions(+) create mode 100644 project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md create mode 100644 project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md create mode 100644 project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md create mode 100644 project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md create mode 100644 project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md create mode 100644 project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md create mode 100644 project_plan/tickets/End-to-end_integration_testing_and_validation.md create mode 100644 project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md create mode 100644 project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md create mode 100644 project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md create mode 100644 project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md diff --git a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md new file mode 100644 index 0000000..09e77db --- /dev/null +++ b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md @@ -0,0 +1,425 @@ +# Core Flows: Stringy v1.0 User Interactions + +## Overview + +This document defines the core user flows for Stringy v1.0, capturing how users interact with the tool across different use cases: quick analysis, filtered searches, automated integration, and YARA rule generation. + +## Design Principles + +**Information Hierarchy:** + +- String content is primary (users scan for recognizable patterns) +- Tags and scores provide context for prioritization +- Section information aids in understanding string provenance +- Full metadata available in JSON for programmatic access + +**User Journey:** + +- Entry: Command-line invocation with binary file path +- Processing: Stage-based progress feedback to stderr +- Output: Format adapts to context (TTY vs pipe, human vs machine) +- Exit: Clean exit codes for scripting integration + +**Feedback & State:** + +- Progress shown via stage indicators (Parsing... Extracting... Classifying... Ranking...) +- Errors go to stderr with brief, actionable messages +- Success indicated by results on stdout +- Summary statistics available via --summary flag + +```mermaid +sequenceDiagram + participant User + participant CLI + participant Pipeline + participant Output + + User->>CLI: stringy binary.exe [flags] + CLI->>Pipeline: Load and validate binary + Pipeline-->>CLI: Parsing... + Pipeline->>Pipeline: Format detection & container parsing + Pipeline-->>CLI: Extracting... + Pipeline->>Pipeline: String extraction (ASCII, UTF-16) + Pipeline-->>CLI: Classifying... + Pipeline->>Pipeline: Semantic tagging & symbol demangling + Pipeline-->>CLI: Ranking... + Pipeline->>Pipeline: Score calculation & sorting + Pipeline->>Output: Formatted results + Output->>User: Display results (stdout) + Note over User,Output: Progress to stderr, results to stdout +``` + +## Flow 1: Quick Analysis (Default) + +**Description:** User runs Stringy on a binary to see all meaningful strings ranked by relevance. Unlike GNU strings which extracts every printable sequence, Stringy uses format-aware filtering to exclude noise (padding, binary tables, code sections) while behaving like strings for legitimate text data. + +**Trigger:** `stringy binary.exe` + +**Default Behavior:** + +- No minimum length filter applied by default (behaves like GNU strings) +- Format-aware filtering removes strings from sections known to contain non-text data +- All extracted strings are shown, ranked by score +- The value proposition is intelligent filtering, not output limiting + +**Steps:** + +1. User invokes Stringy with binary file path +2. System displays "Parsing..." to stderr +3. System detects format (ELF/PE/Mach-O) and parses container structure +4. System displays "Extracting..." to stderr +5. System extracts strings from appropriate sections using format knowledge +6. System displays "Classifying..." to stderr +7. System applies semantic classification (URLs, IPs, paths, GUIDs, etc.) +8. System demangles Rust/C++ symbols to readable form +9. System displays "Ranking..." to stderr +10. System calculates scores based on section weights, semantic boosts, and noise penalties +11. System sorts strings by score (descending) +12. System outputs table to stdout with columns: String | Tags | Score | Section +13. Each string shown in full (terminal handles wrapping) +14. Tags column shows primary tag only +15. Demangled symbols shown in readable form (original mangled form discarded) + +**Output Format (TTY):** + +``` +String Tags Score Section +https://malicious-c2.example.com/api url 95 .rdata +C:\Windows\System32\kernel32.dll filepath 88 .rdata +core::fmt::Display::fmt export 85 .text +192.168.1.100 ipv4 82 .rodata +{A1B2C3D4-E5F6-7890-ABCD-EF1234567890} guid 80 .data +``` + +**Exit:** Returns exit code 0 on success + +--- + +## Flow 2: Filtered Analysis + +**Description:** User applies filters to focus on specific string types, encodings, or characteristics. + +**Trigger:** `stringy binary.exe --only-tags url,ipv4 --min-len 10 --enc utf16` + +**Note:** The --enc flag accepts both specific encodings (utf16le, utf16be) and grouped values (utf16 matches both LE and BE variants). + +**Steps:** + +1. User invokes Stringy with filtering flags +2. System performs standard analysis pipeline (Parsing... Extracting... Classifying... Ranking...) +3. System applies filters with AND logic: + - String must have tag "url" OR "ipv4" + - String length must be >= 10 characters + - String encoding must be UTF-16 +4. System outputs filtered results in table format +5. If no strings match filters, system displays to stderr: "Analyzed 1,234 strings, 0 matched filters" +6. If strings match, system outputs table with matching strings only + +**Filter Combination Rules:** + +- All filter types are AND-ed together +- Within --only-tags, tags are OR-ed (match any specified tag) +- --notags excludes strings with any of the specified tags +- --top applies after all other filters + +**Exit:** Returns exit code 0 (even if no matches) + +**Conflicting Flags:** +If multiple output format flags are specified (e.g., --json and --yara), the system displays an error: + +- Output to stderr: "Error: Cannot specify multiple output formats (--json, --yara)" +- Exit code: 1 + +--- + +## Flow 3: Top N Results + +**Description:** User limits output to the highest-ranked strings for quick triage. + +**Trigger:** `stringy binary.exe --top 50` + +**Steps:** + +1. User invokes Stringy with --top flag +2. System performs standard analysis pipeline +3. System ranks all strings by score +4. System selects top 50 highest-scoring strings +5. System outputs table with only top 50 results + +**With Filters:** `stringy binary.exe --only-tags url --top 20` + +- Filters applied first (select all URLs) +- Then top 20 URLs by score selected +- Result: "Top 20 URLs" not "Top 20 overall that happen to be URLs" + +**Exit:** Returns exit code 0 + +--- + +## Flow 4: Automated Integration (JSON) + +**Description:** User generates machine-readable output for integration into analysis pipelines or scripts. + +**Trigger:** `stringy binary.exe --json` + +**Steps:** + +1. User invokes Stringy with --json flag +2. System performs standard analysis pipeline +3. Progress indicators go to stderr (not stdout) +4. System outputs JSONL (JSON Lines) to stdout +5. Each line is a complete JSON object representing one string +6. JSON includes all metadata: text, encoding, offset, rva, section, length, tags (full list), score, source, confidence + +**JSON Format:** + +```json +{"text":"https://example.com","encoding":"Ascii","offset":4096,"rva":8192,"section":".rdata","length":19,"tags":["url","domain"],"score":95,"source":"SectionData","confidence":1.0} +{"text":"C:\\Windows\\System32","encoding":"Utf16Le","offset":8192,"rva":12288,"section":".data","length":38,"tags":["filepath"],"score":88,"source":"SectionData","confidence":0.95} +``` + +**Use Cases:** + +- Piping to jq for filtering: `stringy binary.exe --json | jq 'select(.tags[] == "url")'` +- Importing into analysis tools +- Batch processing with scripts + +**Exit:** Returns exit code 0 + +--- + +## Flow 5: YARA Rule Generation + +**Description:** User generates YARA-friendly output for creating detection rules. + +**Trigger:** `stringy binary.exe --yara` + +**Steps:** + +1. User invokes Stringy with --yara flag +2. System performs standard analysis pipeline +3. Progress indicators go to stderr +4. System generates complete YARA rule template to stdout +5. Rule includes: + - Rule name (derived from binary filename) + - Metadata section (file hash, analysis date, tool version) + - Strings section with properly escaped strings + - Condition section (basic template) +6. Strings are escaped according to YARA syntax rules +7. Very long strings (>200 chars) are truncated with comment + +**YARA Output Format:** + +``` +rule binary_strings { + meta: + description = "Strings extracted from binary.exe" + tool = "Stringy v1.0" + date = "2024-01-15" + + strings: + $s1 = "https://malicious-c2.example.com/api" ascii wide + $s2 = "C:\\Windows\\System32\\kernel32.dll" ascii wide + $s3 = "core::fmt::Display::fmt" ascii + $s4 = "192.168.1.100" ascii + // Skipped: too long (245 chars) + + condition: + any of them +} +``` + +**YARA Rule Naming:** + +- Rule name derived from binary filename +- Non-alphanumeric characters replaced with underscore +- File extension removed +- Suffix "_strings" added +- Example: "binary.exe" becomes "binary_strings", "my-app.dll" becomes "my_app_strings" + +**YARA String Handling:** + +- Strings over 200 characters are skipped with comment: "// Skipped: too long (N chars)" +- Strings properly escaped according to YARA syntax +- Both ascii and wide modifiers included for compatibility + +**Exit:** Returns exit code 0 + +--- + +## Flow 6: Non-TTY Piping + +**Description:** User pipes Stringy output to other Unix tools for further processing. + +**Trigger:** `stringy binary.exe | grep "http"` + +**Steps:** + +1. User invokes Stringy with output piped to another command +2. System detects non-TTY output (stdout is not a terminal) +3. System performs standard analysis pipeline +4. Progress indicators go to stderr (visible to user) +5. System outputs plain strings to stdout (no table formatting) +6. Each string on its own line +7. No headers, no columns, no decorations +8. Pipe-friendly format for grep, awk, sed, etc. + +**Output Format (Non-TTY):** + +``` +https://malicious-c2.example.com/api +C:\Windows\System32\kernel32.dll +core::fmt::Display::fmt +192.168.1.100 +{A1B2C3D4-E5F6-7890-ABCD-EF1234567890} +``` + +**Exit:** Returns exit code 0 + +--- + +## Flow 7: Summary Statistics + +**Description:** User requests summary information about the analysis. + +**Trigger:** `stringy binary.exe --summary` + +**Steps:** + +1. User invokes Stringy with --summary flag +2. System performs standard analysis pipeline +3. System outputs results in standard format (table or JSON) +4. After results, system outputs summary to stdout: + - Binary format detected + - Total strings extracted + - Strings after filtering (if filters applied) + - Top tags found + - Analysis time + +**Summary Format:** + +``` +[Results table here] + +Summary: + Binary: binary.exe (PE, 2.4 MB) + Format: PE (Windows executable) + Strings extracted: 1,234 + Strings shown: 1,234 + Top tags: url (45), filepath (123), import (234), export (89) + Analysis time: 0.8s +``` + +**Note:** The --summary flag is ignored when --json is specified, as JSON output is intended for automation and doesn't need human-readable summaries. + +**Exit:** Returns exit code 0 + +--- + +## Flow 8: Error Handling + +**Description:** System handles errors and edge cases gracefully. + +**Common Error Scenarios:** + +**Unsupported Format:** + +- Trigger: `stringy unknown_format.bin` +- Output to stderr: "Error: Unsupported file format" +- Exit code: 1 + +**File Not Found:** + +- Trigger: `stringy nonexistent.exe` +- Output to stderr: "Error: File not found: nonexistent.exe" +- Exit code: 1 + +**Invalid Filter:** + +- Trigger: `stringy binary.exe --only-tags invalid_tag` +- Output to stderr: "Error: Unknown tag 'invalid_tag'. Use --help to see available tags." +- Exit code: 1 + +**Corrupted Binary:** + +- Trigger: `stringy corrupted.exe` +- Output to stderr: "Error: Binary parsing failed: invalid PE header" +- Exit code: 1 + +**No Strings Found:** + +- Trigger: `stringy empty.bin` +- Output to stderr: "Analyzed empty.bin: found 0 strings" +- Exit code: 0 (not an error, just no results) + +--- + +## CLI Flag Reference + + +| Flag | Description | Example | +| ------------------ | ------------------------------------------------------------------------------ | ---------------------- | +| `--min-len N` | Minimum string length | `--min-len 10` | +| `--enc ENCODING` | Filter by encoding. Accepts: ascii, utf8, utf16 (both LE/BE), utf16le, utf16be | `--enc utf16` | +| `--only-tags TAGS` | Include only specified tags (comma-separated) | `--only-tags url,ipv4` | +| `--notags TAGS` | Exclude specified tags | `--notags debug,test` | +| `--top N` | Show only top N results | `--top 50` | +| `--json` | Output JSONL format | `--json` | +| `--yara` | Output YARA rule template | `--yara` | +| `--summary` | Show summary statistics | `--summary` | +| `--debug` | Include score breakdown in output (section_weight, semantic_boost, noise_penalty) | `--debug` | +| `--help` | Show help including available tags | `--help` | +| `--version` | Show version information | `--version` | + + +**Available Tags (shown in --help):** + +- `url` - HTTP/HTTPS URLs +- `domain` - Domain names +- `ipv4` - IPv4 addresses +- `ipv6` - IPv6 addresses +- `filepath` - File paths (Windows/POSIX) +- `regpath` - Windows registry paths +- `guid` - GUIDs/UUIDs +- `email` - Email addresses +- `b64` - Base64-encoded data +- `fmt` - Printf-style format strings +- `user-agent-ish` - User agent strings +- `import` - Import symbols +- `export` - Export symbols +- `version` - Version strings +- `manifest` - Manifest data +- `resource` - Resource strings + +--- + +## Output Format Details + +**Table Column Widths:** + +- String: Flexible (shows full string, terminal wraps if needed) +- Tags: 15 characters (shows primary tag only) +- Score: 6 characters (right-aligned integer) +- Section: 20 characters (section name) + +**Tag Priority (for display):** +When multiple tags exist, show tags from the highest priority level. If multiple tags exist at the same priority level, show them comma-separated (e.g., "url,ipv4"). + +Priority order: + +1. url, ipv4, ipv6 (network indicators) +2. filepath, regpath (file system) +3. guid, email (identifiers) +4. import, export (symbols) +5. Other tags + +**Score Range:** + +- 0-100 scale +- Higher = more relevant +- Typical ranges: + - 90-100: High-value strings (URLs, GUIDs, imports) + - 70-89: Meaningful strings (file paths, exports) + - 50-69: Moderate relevance (general strings) + - Below 50: Low relevance (potential noise) + diff --git a/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md b/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md new file mode 100644 index 0000000..c83fd8f --- /dev/null +++ b/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md @@ -0,0 +1,72 @@ +# Epic Brief: Stringy v1.0 Completion + +## Summary + +Stringy v1.0 completion transforms a promising foundation into a production-ready binary analysis tool that solves a fundamental problem: existing tools like GNU strings produce overwhelming noise without intelligence. Users across security research, malware analysis, and reverse engineering need format-aware string extraction that automatically identifies meaningful data, ranks results by relevance, and integrates into automated workflows. The current incomplete state forces users to manually filter thousands of strings and decipher cryptic mangled symbols, wasting valuable analysis time. Completing v1.0 delivers semantic classification, intelligent ranking, symbol demangling, and flexible output formats - enabling users to immediately see the most important strings (URLs, file paths, IOCs) ranked by relevance, understand binary functionality through demangled symbols, and integrate Stringy into production analysis pipelines. + +## Context & Problem + +### Who's Affected + +**Primary Users:** +- **Security researchers and malware analysts** who need to quickly identify indicators of compromise (IOCs), behavioral patterns, and malicious infrastructure in binaries +- **Reverse engineers** who need to understand binary functionality, dependencies, and internal structure through string analysis +- **Open-source community** seeking a modern, intelligent alternative to decades-old tools like GNU strings +- **DevOps and security teams** who need to integrate string extraction into automated analysis pipelines + +These users share a common need: efficient, intelligent binary analysis that surfaces meaningful information without manual noise filtering. + +### Current Pain Points + +**Problem 1: Signal vs Noise** +Traditional tools like GNU strings extract every printable character sequence, producing thousands of results where 90%+ are meaningless - padding bytes, binary tables, random data that happens to be printable. Users must manually scan through this noise to find the 10% that matters: URLs, file paths, registry keys, function names. This manual filtering is time-consuming, error-prone, and doesn't scale. + +**Problem 2: Cryptic Symbols** +Modern binaries contain mangled symbols (especially Rust, C++) that appear as cryptic strings like `_ZN4core3fmt3num52_$LT$impl$`. Without demangling, users cannot understand what functions are called, what libraries are used, or what the binary actually does. This forces users to copy-paste symbols into external demangling tools, breaking their workflow. + +**Problem 3: No Prioritization** +Even when users find potentially interesting strings, they have no way to know which ones are most important. Is this URL critical infrastructure or a help link? Is this file path a configuration file or a debug artifact? Without ranking and context, users waste time investigating low-value strings while missing critical indicators. + +**Problem 4: Integration Barriers** +Users cannot integrate the current incomplete tool into automated workflows because it lacks: +- Structured output formats (JSON) for programmatic consumption +- Filtering capabilities to focus on specific string types +- Reliable, production-ready behavior + +### Where in the Product + +The gaps exist across the entire analysis pipeline: + +1. **Classification Layer** (file:src/classification/): Types are defined but semantic pattern matching is not implemented. Users cannot automatically identify URLs, IPs, domains, file paths, GUIDs, emails, Base64, format strings, or user agents. + +2. **Symbol Processing** (file:src/classification/): No demangling capability exists. Mangled Rust/C++ symbols remain cryptic and unusable. + +3. **Ranking System** (file:src/classification/): No scoring algorithm exists to prioritize strings. All strings are treated equally regardless of their source section, semantic meaning, or likelihood of being meaningful. + +4. **Output Formatting** (file:src/output/): Only basic interfaces exist. Users cannot get JSONL for automation, human-readable tables for manual review, or YARA-friendly output for rule creation. + +5. **CLI Interface** (file:src/main.rs): Missing filtering options (--min-len, --enc, --only-tags, --notags, --top) and output format selection (--json). + +6. **Performance** (file:src/extraction/): No memory mapping for large files, no regex caching for classification patterns. + +### The Gap + +**Current State:** Stringy has a solid foundation with format detection, container parsing, and basic string extraction working. Users can extract strings from ELF, PE, and Mach-O binaries with encoding awareness (ASCII, UTF-16). However, the output is raw and unprocessed - similar to GNU strings but with better encoding support. + +**Desired State:** Users run Stringy on any binary and immediately see: +- All meaningful strings ranked by importance (format-aware filtering removes noise) +- Automatic semantic tags (URL, filepath, ipv4, guid, etc.) highlighting what each string represents +- Demangled symbols showing actual function and type names +- Flexible output formats for both manual analysis and automated integration +- Production-ready reliability for daily use + +**User Feedback:** "I want to use Stringy but it's not ready yet. I need it to tell me what's important, not just dump everything." + +### Success Criteria + +When v1.0 is complete, users will be able to: +1. Run Stringy on a binary and immediately see the most relevant strings ranked by importance +2. Quickly identify IOCs, file paths, URLs, and other semantic patterns without manual searching +3. Understand binary functionality through demangled symbols and import/export analysis +4. Integrate Stringy into automated analysis pipelines with proper filtering and output formats +5. Rely on Stringy as a production-ready tool for daily binary analysis work diff --git a/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md b/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md new file mode 100644 index 0000000..8cbf585 --- /dev/null +++ b/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md @@ -0,0 +1,510 @@ +# Technical Plan: Stringy v1.0 Implementation + +## Overview + +This technical plan defines the architecture for completing Stringy v1.0, building on the existing foundation of format detection, container parsing, and string extraction. The implementation adds semantic classification, intelligent ranking, symbol demangling, flexible output formatting, and CLI orchestration. + +## Architectural Approach + +### Core Design Decisions + +**1. Pipeline Architecture** + +The main execution flow will be encapsulated in a `Pipeline` struct that orchestrates the entire analysis process. This provides: + +- Clear entry point for the analysis workflow +- Centralized error handling and recovery +- Testability through dependency injection +- Progress tracking integration + +**Trade-off**: Struct-based approach adds a layer of abstraction but provides better testability and maintainability compared to functional composition in main(). + +**2. Enum-Based Output Formatting** + +Output formatters will use an enum-based approach with a single format() function that matches on the output type. This provides: + +- Simplicity and directness for the 3 required formats +- Easy to understand and maintain +- No trait abstraction overhead +- Sufficient for current requirements + +**Trade-off**: Less extensible than trait-based approach, but simpler and more appropriate for the limited number of formatters. Future formats can be added as enum variants. + +**3. Memory-Mapped File I/O with Fallback** + +File reading will attempt memory mapping first, with automatic fallback to regular file reading on failure. This provides: + +- Efficient memory-mapped access for most cases +- Robustness for edge cases (network filesystems, locked files, platform limitations) +- Consistent behavior across all file sizes +- Zero-copy access when possible + +**Trade-off**: Slightly more complex than always-on mmap, but handles real-world failure scenarios gracefully. + +**4. Modern Regex Caching** + +Migrate from `lazy_static` to `once_cell` for regex pattern caching. This provides: + +- More modern, ergonomic API +- Better compile-time guarantees +- Consistent with Rust ecosystem trends +- Simpler initialization patterns + +**Trade-off**: Requires dependency migration but improves code quality and maintainability. + +**5. Rich Progress Feedback** + +Use `indicatif` library for progress indicators. This provides: + +- Professional progress bars and spinners +- Automatic TTY detection +- Minimal code for rich feedback +- Consistent user experience + +**Trade-off**: Adds external dependency but provides significantly better UX than manual eprintln! calls. + +### Technical Constraints + +**Codebase Standards:** + +- No `unsafe` code (`#![forbid(unsafe_code)]` enforced) +- Zero warnings (`cargo clippy -- -D warnings` must pass) +- ASCII-only text (no Unicode punctuation or emojis) +- File size limit: 500 lines per file (split larger files) +- No blanket `#[allow]` attributes + +**Module Organization:** + +- file:src/classification/ - Semantic analysis and ranking +- file:src/output/ - Output formatters +- file:src/main.rs - CLI and pipeline orchestration +- file:src/lib.rs - Public API and re-exports + +**Error Handling:** + +- Use `thiserror` for all error types +- Include context (offsets, section names, file paths) +- Graceful degradation where possible +- Clear error messages for user-facing errors + +### Integration Strategy + +The new components integrate with existing infrastructure: + +1. **Classification Integration**: New semantic patterns and symbol demangling extend existing `SemanticClassifier` in file:src/classification/semantic.rs +2. **Ranking Integration**: New `RankingEngine` in src/classification/ranking.rs consumes `FoundString` objects with section weights from container parsers +3. **Output Integration**: New formatters in file:src/output/ consume ranked `Vec` from pipeline +4. **CLI Integration**: file:src/main.rs orchestrates all components through `Pipeline` struct + +```mermaid +sequenceDiagram + participant CLI as main.rs + participant Pipeline + participant Container as container/* + participant Extractor as extraction/* + participant Classifier as classification/* + participant Ranker as ranking.rs + participant Formatter as output/* + + CLI->>Pipeline: new(config) + CLI->>Pipeline: run(file_path) + Pipeline->>Container: detect_format() & parse() + Container-->>Pipeline: ContainerInfo + Pipeline->>Extractor: extract(data, container_info) + Extractor-->>Pipeline: Vec + Pipeline->>Classifier: classify(strings) + Classifier->>Classifier: demangle_symbols() + Classifier-->>Pipeline: Vec with tags + Pipeline->>Ranker: calculate_scores(strings) + Ranker-->>Pipeline: Vec with scores + Pipeline->>Pipeline: apply_filters() & sort() + Pipeline->>Formatter: format(strings) + Formatter-->>CLI: Output to stdout +``` + +--- + +## Data Model + +### FoundString Enhancement + +Extend the existing `FoundString` struct in file:src/types.rs to include score breakdown for transparency and debugging: + +```rust +pub struct FoundString { + // Existing fields + pub text: String, + pub encoding: Encoding, + pub offset: u64, + pub rva: Option, + pub section: Option, + pub length: u32, + pub tags: Vec, + pub score: i32, // Final calculated score + pub source: StringSource, + pub confidence: f32, + + // New fields for symbol demangling + pub original_text: Option, // Original mangled form (if demangled) + + // Optional debug fields (only populated with --debug flag) + pub section_weight: Option, // Score from section type + pub semantic_boost: Option, // Bonus from semantic tags + pub noise_penalty: Option, // Penalty from noise detection +} +``` + +**Rationale**: + +- `original_text` preserves the mangled symbol for cross-referencing and recovery +- Breakdown fields (section_weight, semantic_boost, noise_penalty) are optional to avoid exposing internal implementation details in the public API +- With --debug flag, breakdown fields are populated for debugging and validation +- Without --debug, breakdown fields remain None, keeping the API simple and flexible + +### Ranking Configuration + +New configuration struct for ranking parameters in src/classification/ranking.rs: + +```rust +pub struct RankingConfig { + pub section_weights: HashMap, + pub tag_boosts: HashMap, + pub noise_penalty_config: NoisePenaltyConfig, +} + +pub struct NoisePenaltyConfig { + pub high_entropy_penalty: i32, + pub excessive_length_penalty: i32, + pub repeated_pattern_penalty: i32, +} +``` + +**Integration**: `RankingConfig` uses hardcoded sensible defaults. No user configuration is provided - the defaults are designed to work well across all use cases (malware analysis, reverse engineering, general analysis). + +### Filter Configuration + +New configuration struct for CLI filtering in file:src/main.rs: + +```rust +pub struct FilterConfig { + pub min_length: Option, + pub encodings: Option>, + pub include_tags: Option>, + pub exclude_tags: Option>, + pub top_n: Option, +} +``` + +**Integration**: Built from CLI arguments, passed to Pipeline. Pipeline applies filters after ranking using iterator adapters. + +### Output Formatter Interface + +Trait definition for output formatters in file:src/output/mod.rs: + +```rust +pub enum OutputFormat { + Table, + Json, + Yara, +} + +pub struct OutputMetadata { + pub binary_name: String, + pub binary_format: BinaryFormat, + pub total_strings: usize, + pub filtered_strings: usize, +} + +pub fn format_output( + format: OutputFormat, + strings: &[FoundString], + metadata: &OutputMetadata, +) -> Result; +``` + +**Rationale**: Enum-based design is simpler and more direct for the three required formats. The format_output function matches on the enum and delegates to format-specific logic. Metadata provides context for formatters to include summary information. + +### Tag Enum Extensions + +Add new variants to existing `Tag` enum in file:src/types.rs with specificity levels: + +```rust +pub enum Tag { + // Existing tags (specific) + Url, Domain, IPv4, IPv6, FilePath, RegistryPath, + Import, Export, Version, Manifest, Resource, + DylibPath, Rpath, RpathVariable, FrameworkPath, + + // New specific tags for v1.0 + Guid, // GUIDs/UUIDs (specific) + Email, // Email addresses (specific) + FormatString, // Printf-style format strings (specific) + UserAgent, // User agent strings (specific) + DemangledSymbol, // Demangled Rust/C++ symbols (specific) + + // Broad/ambiguous tags + Base64, // Base64-encoded data (broad - many false positives) +} +``` + +**Tag Specificity**: Tags are categorized as specific (high confidence, low false positives) or broad (lower confidence, higher false positives). A string can have multiple tags. Specific tags like Email are prioritized over broad tags like Base64 in display and ranking. + +**Integration**: New tags follow existing pattern. Classification logic in file:src/classification/semantic.rs will be extended to detect these patterns. False negatives are worse than false positives - we prefer to tag liberally. + +--- + +## Component Architecture + +### 1. Ranking Engine (src/classification/ranking.rs) + +**Purpose**: Calculate relevance scores for strings based on multiple factors. + +**Interface**: + +```rust +pub struct RankingEngine { + config: RankingConfig, +} + +impl RankingEngine { + pub fn new(config: RankingConfig) -> Self; + pub fn calculate_score(&self, string: &mut FoundString); + pub fn rank_strings(&self, strings: &mut [FoundString]); +} +``` + +**Responsibilities**: + +- Apply section weight scoring based on `SectionType` +- Apply semantic boost scoring based on tags +- Calculate noise penalties from confidence scores +- Populate score breakdown fields +- Sort strings by final score + +**Integration**: Consumes `FoundString` objects after classification, uses section weights from `ContainerInfo`, applies tag-based boosts. + +**File Size**: Keep under 500 lines. If scoring logic exceeds limit, split into src/classification/ranking/mod.rs with submodules for section_weights.rs, semantic_boosts.rs, noise_penalties.rs. + +### 2. Symbol Demangling (src/classification/symbols.rs) + +**Purpose**: Demangle Rust and C++ symbols to human-readable form while preserving original. + +**Interface**: + +```rust +pub struct SymbolDemangler { + // Uses rustc-demangle crate +} + +impl SymbolDemangler { + pub fn new() -> Self; + pub fn demangle(&self, string: &mut FoundString); + pub fn is_mangled(&self, symbol: &str) -> bool; +} +``` + +**Responsibilities**: + +- Detect mangled Rust symbols (starts with `_ZN` or `_R`) +- Demangle using `rustc-demangle` crate +- Store original mangled form in `FoundString.original_text` +- Replace `FoundString.text` with demangled version +- Tag demangled symbols with `DemangledSymbol` tag +- Handle demangling failures gracefully (keep original text, no tag) + +**Integration**: Called during classification phase. Processes strings with `Import` or `Export` tags. Modifies FoundString in-place, preserving original in original_text field. + +**Dependency**: Add `rustc-demangle` to Cargo.toml. + +### 3. Semantic Classification Extensions (file:src/classification/semantic.rs) + +**Purpose**: Extend existing classifier with new pattern detection. + +**New Patterns**: + +- **GUID**: Regex for standard GUID format `{XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX}` (specific) +- **Email**: Regex with basic validation ([user@domain.tld](mailto:user@domain.tld)) (specific) +- **Base64**: Pattern detection for Base64-encoded data (length, character set) (broad - many false positives) +- **Format String**: Detection of printf-style format specifiers (%s, %d, %x, etc.) (specific) +- **User Agent**: Pattern matching for common user agent strings (specific) + +**Tag Specificity Strategy**: + +- Apply all matching patterns - a string can have multiple tags +- Prefer false positives over false negatives (better to tag liberally) +- Specific tags (Email, GUID, FormatString) have higher confidence +- Broad tags (Base64) are applied with lower confidence but still useful +- Tag priority for display handles showing most relevant tags first + +**Integration**: Extend existing `SemanticClassifier::classify()` method with new pattern checks. Use `once_cell` for regex caching. + +**File Size**: Current file is approaching 500 lines. If additions exceed limit, split into src/classification/semantic/mod.rs with submodules for network.rs, filesystem.rs, identifiers.rs, encoding.rs. + +### 4. Output Formatters (file:src/output/) + +**Module Structure**: + +- file:src/output/mod.rs - OutputFormat enum and format_output() function +- src/output/table.rs - Table formatting logic +- src/output/json.rs - JSONL formatting logic +- src/output/yara.rs - YARA rule generation logic + +**Table Formatter** (src/output/table.rs): + +- Detect TTY vs non-TTY output using `atty` or `std::io::IsTerminal` +- TTY: Format as aligned table with columns (String | Tags | Score | Section) +- Non-TTY: Output plain strings, one per line +- Handle long strings (show in full, terminal wraps) +- Show primary tags (comma-separated if multiple at same priority) + +**JSON Formatter** (src/output/json.rs): + +- Output JSONL (one JSON object per line) +- Include all `FoundString` fields (text, encoding, offset, rva, section, length, tags, score, source, confidence) +- Include original_text if present (demangled symbols) +- Include score breakdown fields only if populated (--debug mode) +- Proper escaping for JSON strings via serde_json + +**YARA Formatter** (src/output/yara.rs): + +- Generate complete YARA rule template +- Sanitize binary filename for rule name (replace non-alphanumeric with underscore, remove extension, add "_strings" suffix) +- Include metadata section (description, tool, date, file hash) +- Escape strings according to YARA syntax (backslashes, quotes, newlines) +- Skip strings over 200 characters with comment: "// Skipped: too long (N chars)" +- Include both `ascii` and `wide` modifiers for compatibility + +**Integration**: Pipeline calls format_output() with selected OutputFormat enum variant. The function matches on the enum and delegates to the appropriate formatting module. + +### 5. Pipeline Orchestration (file:src/main.rs) + +**Purpose**: Coordinate the entire analysis workflow including filtering. + +**Structure**: + +```rust +pub struct Pipeline { + config: PipelineConfig, + progress: ProgressBar, // from indicatif +} + +pub struct PipelineConfig { + extraction_config: ExtractionConfig, + ranking_config: RankingConfig, + filter_config: FilterConfig, + debug_mode: bool, +} + +impl Pipeline { + pub fn new(config: PipelineConfig) -> Self; + pub fn run(&mut self, file_path: &Path) -> Result>; +} +``` + +**Workflow**: + +1. Display "Parsing..." progress indicator +2. Attempt memory-map file using `memmap2`, fall back to `std::fs::read()` on failure +3. Detect format and parse container (fail fast on error) +4. Display "Extracting..." progress indicator +5. Extract strings using `BasicExtractor` (fail fast on critical errors) +6. Display "Classifying..." progress indicator +7. Apply semantic classification (graceful degradation - skip failed strings) +8. Apply symbol demangling (graceful degradation - keep original on failure) +9. Display "Ranking..." progress indicator +10. Calculate scores using `RankingEngine` (populate breakdown fields if debug_mode) +11. Apply filters from FilterConfig (min-len, encoding, tags) +12. Sort by score and apply --top limit +13. Format output using selected OutputFormat enum +14. Write to stdout + +**Error Handling Strategy**: + +*Critical Stages (fail fast):* + +- File access: Exit with error if file not found or cannot be read +- Format detection: Exit with error if format unsupported +- Container parsing: Exit with error if binary is corrupted or invalid + +*Optional Stages (graceful degradation):* + +- Classification: If classification fails on individual strings, skip those strings and continue +- Symbol demangling: If demangling fails, keep original text and continue +- Ranking: If ranking fails, output unranked strings with warning +- Formatting: If primary formatter fails, attempt plain text fallback + +*Recovery Strategy*: + +- Memory mapping failure: Automatically fall back to regular file reading +- Partial results: If some strings are processed successfully, output them with warning about failures +- No strings found: Display informational message to stderr, exit 0 (not an error) + +**Progress Feedback**: Use `indicatif::ProgressBar` with spinner style. Progress messages go to stderr, results to stdout. + +**CLI Filtering**: Filter logic is part of `Pipeline::run()` to keep main.rs under 500-line limit: + +- Pipeline receives filter configuration from CLI args +- Filters applied after ranking, before output formatting +- Uses iterator adapters: `filter()` for criteria, `take()` for --top +- Filter validation happens during Pipeline initialization + +**Integration**: Pipeline owns the entire process including filtering. CLI argument parsing uses `clap` derive macros. Main function is minimal: parse args, create Pipeline, call run(), handle output. + +### 6. Performance Optimizations + +**Memory Mapping** (file:src/main.rs): + +- Attempt `memmap2::Mmap` first for efficient access +- On mmap failure (network filesystem, locked file, platform limitations), fall back to `std::fs::read()` +- Pass byte slice to container parsers (works with both mmap and regular read) +- Log fallback to regular file reading for user awareness + +**Regex Caching** (file:src/classification/semantic.rs): + +- Migrate from `lazy_static` to `once_cell::sync::Lazy` +- Pre-compile all regex patterns at first use +- Share compiled patterns across all classification calls + +**Dependency Additions**: + +- `memmap2` - Memory-mapped file I/O with fallback +- `once_cell` - Modern lazy initialization (migrate from lazy_static) +- `indicatif` - Progress bars and spinners +- `rustc-demangle` - Rust symbol demangling +- `atty` or use `std::io::IsTerminal` - TTY detection for output formatting + +### Integration Points Summary + + +| Component | Consumes | Produces | Integration Point | +| ------------------ | ------------------------------------ | ----------------------------- | ----------------------------------------------- | +| Pipeline | CLI args, file path | Formatted output | Orchestrates all components + filtering | +| RankingEngine | Vec<FoundString>, debug flag | Scored Vec<FoundString> | Called after classification | +| SymbolDemangler | &mut FoundString | () | Called during classification, modifies in-place | +| SemanticClassifier | FoundString | Vec<Tag> | Extended with new patterns | +| format_output() | OutputFormat, Vec<FoundString> | String | Enum-based dispatch to formatters | + + +### Testing Strategy + +**Unit Tests**: + +- Ranking: Test score calculation with known inputs +- Symbol demangling: Test with mangled/unmangled symbols +- Semantic patterns: Test each new pattern with positive/negative cases +- Output formatters: Test with sample data, verify format correctness + +**Integration Tests**: + +- End-to-end pipeline tests with fixture binaries +- CLI argument parsing and filtering +- Output format validation with `insta` snapshots +- Error handling scenarios + +**Benchmarks**: + +- Ranking performance with large string sets +- Regex pattern matching performance +- Memory mapping vs regular file I/O +- Overall pipeline throughput + diff --git a/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md b/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md new file mode 100644 index 0000000..aea53d6 --- /dev/null +++ b/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md @@ -0,0 +1,47 @@ +# Add performance optimizations and dependency updates + +## Objective + +Integrate performance optimizations including memory mapping with fallback, regex caching migration, and progress feedback library. + +## Scope + +**In Scope:** +- Add dependencies to Cargo.toml: + - memmap2 for memory-mapped file I/O + - once_cell for modern lazy initialization + - indicatif for progress bars + - rustc-demangle for symbol demangling +- Implement memory mapping with fallback in Pipeline: + - Attempt memmap2::Mmap first + - Fall back to std::fs::read() on failure + - Log fallback for user awareness +- Migrate all lazy_static usage to once_cell::sync::Lazy +- Integrate indicatif progress indicators in Pipeline +- Add benchmarks for performance validation +- Update documentation with performance characteristics + +**Out of Scope:** +- Pipeline implementation (separate ticket, but this ticket provides the tools) +- Classification implementation (separate ticket, but this ticket provides regex caching) + +## Acceptance Criteria + +- [ ] All dependencies added to Cargo.toml with appropriate versions +- [ ] Memory mapping with fallback implemented and tested +- [ ] All lazy_static migrated to once_cell +- [ ] indicatif integrated for progress feedback +- [ ] Benchmarks added for memory mapping and regex performance +- [ ] Documentation updated with performance notes +- [ ] Zero clippy warnings +- [ ] All tests passing + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Performance Optimizations section) +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Task 14) +- file:Cargo.toml + +## Dependencies + +None - this ticket provides infrastructure for other tickets but can be implemented independently. \ No newline at end of file diff --git a/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md b/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md new file mode 100644 index 0000000..50b2cff --- /dev/null +++ b/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md @@ -0,0 +1,60 @@ +# Build Pipeline orchestration with filtering and progress feedback + +## Objective + +Create the Pipeline struct that orchestrates the entire analysis workflow, including filtering, error recovery, and progress feedback. + +## Scope + +**In Scope:** +- Create Pipeline struct in file:src/main.rs with PipelineConfig +- Implement 14-step workflow: + 1. Progress indicator setup (indicatif) + 2. Memory-mapped file reading with fallback to regular read + 3. Format detection and container parsing (fail fast) + 4. String extraction (fail fast on critical errors) + 5. Semantic classification (graceful degradation) + 6. Symbol demangling (graceful degradation) + 7. Ranking with optional debug breakdown + 8. Filtering (min-len, encoding, tags) + 9. Sorting and top-N limiting + 10. Output formatting +- Implement FilterConfig struct for CLI filter parameters +- Implement stage-specific error recovery: + - Critical stages: fail fast with clear errors + - Optional stages: graceful degradation with warnings +- Integrate indicatif for progress feedback (Parsing... Extracting... Classifying... Ranking...) +- Add memory mapping with fallback logic +- Add comprehensive integration tests +- Keep main.rs under 500 lines + +**Out of Scope:** +- CLI argument parsing (handled in same file but separate concern) +- Individual component implementations (separate tickets) +- Output formatter implementations (separate ticket) + +## Acceptance Criteria + +- [ ] Pipeline struct with new() and run() methods +- [ ] PipelineConfig with all necessary configuration +- [ ] FilterConfig for CLI filter parameters +- [ ] 14-step workflow implemented with proper error handling +- [ ] Stage-specific error recovery (fail fast vs graceful degradation) +- [ ] Memory mapping with automatic fallback to regular file reading +- [ ] Progress feedback using indicatif (messages to stderr) +- [ ] Filtering logic using iterator adapters +- [ ] Integration tests covering success and failure scenarios +- [ ] main.rs under 500 lines +- [ ] Zero clippy warnings + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/dbfae449-b832-46a9-8fe7-748d7c5f5a20 (Core Flows - All flows) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Component 5) +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Task 16) + +## Dependencies + +- Ticket: "Complete semantic classification" (needs classification logic) +- Ticket: "Implement ranking system" (needs RankingEngine) +- Ticket: "Implement output formatters" (needs formatters) \ No newline at end of file diff --git a/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md b/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md new file mode 100644 index 0000000..67f3631 --- /dev/null +++ b/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md @@ -0,0 +1,52 @@ +# Complete semantic classification with new patterns and symbol demangling + +## Objective + +Extend the semantic classification system to detect all required patterns (GUIDs, emails, Base64, format strings, user agents) and integrate Rust symbol demangling. + +## Scope + +**In Scope:** +- Extend file:src/classification/semantic.rs with new pattern detection: + - GUID pattern matching + - Email address validation + - Base64 pattern detection (marked as broad/ambiguous) + - Printf-style format string detection + - User agent pattern matching +- Create file:src/classification/symbols.rs for symbol demangling: + - Integrate rustc-demangle crate + - Detect mangled Rust symbols + - Demangle and preserve original in original_text field + - Tag demangled symbols appropriately +- Migrate regex caching from lazy_static to once_cell +- Add comprehensive unit tests for all new patterns +- Handle tag specificity (specific vs broad tags) +- Split file:src/classification/semantic.rs if it exceeds 500 lines + +**Out of Scope:** +- Ranking/scoring logic (separate ticket) +- CLI integration (separate ticket) +- Output formatting (separate ticket) + +## Acceptance Criteria + +- [ ] All new semantic patterns implemented with regex matching +- [ ] Symbol demangling working for Rust symbols (rustc-demangle) +- [ ] Original mangled form preserved in original_text field +- [ ] Demangled text replaces FoundString.text +- [ ] All regex patterns migrated to once_cell +- [ ] Tag specificity documented (specific vs broad) +- [ ] Comprehensive unit tests for each pattern (positive and negative cases) +- [ ] File size under 500 lines (split into submodules if needed) +- [ ] Zero clippy warnings + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/24940fed-1cc7-4d17-bc4b-fb5558c6f827 (Epic Brief - Problem 2: Cryptic Symbols) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Components 2 & 3) +- file:src/classification/semantic.rs +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Tasks 9.1-9.4, 10) + +## Dependencies + +- Ticket: "Extend FoundString data model" (needs original_text field) \ No newline at end of file diff --git a/project_plan/tickets/End-to-end_integration_testing_and_validation.md b/project_plan/tickets/End-to-end_integration_testing_and_validation.md new file mode 100644 index 0000000..223f998 --- /dev/null +++ b/project_plan/tickets/End-to-end_integration_testing_and_validation.md @@ -0,0 +1,51 @@ +# End-to-end integration testing and validation + +## Objective + +Create comprehensive integration tests that validate the complete pipeline with real binaries and all output formats. + +## Scope + +**In Scope:** +- Expand tests/fixtures/ with diverse test binaries: + - ELF binaries with various string types + - PE binaries with resources and imports + - Mach-O binaries with load commands + - Binaries with mangled symbols +- Create end-to-end integration tests: + - Test complete pipeline with each binary format + - Test all output formats (table, JSON, YARA) + - Test filtering combinations + - Test error scenarios (corrupted binaries, unsupported formats) + - Test edge cases (no strings, all strings filtered out) +- Add insta snapshot tests for output validation +- Add CLI integration tests +- Validate against all requirements from file:.kiro/specs/stringy-binary-analyzer/requirements.md +- Add performance benchmarks for complete pipeline + +**Out of Scope:** +- Unit tests for individual components (handled in component tickets) +- Implementation of components (separate tickets) + +## Acceptance Criteria + +- [ ] Comprehensive test fixtures for all binary formats +- [ ] End-to-end integration tests covering all user flows +- [ ] Snapshot tests for all output formats +- [ ] Error scenario tests (corrupted binaries, invalid inputs) +- [ ] Edge case tests (empty results, filter mismatches) +- [ ] Performance benchmarks for complete pipeline +- [ ] All requirements validated with tests +- [ ] All tests passing +- [ ] Zero clippy warnings + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/dbfae449-b832-46a9-8fe7-748d7c5f5a20 (Core Flows - All flows) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Testing Strategy) +- file:.kiro/specs/stringy-binary-analyzer/requirements.md +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Task 15) + +## Dependencies + +- All other tickets (this validates the complete implementation) \ No newline at end of file diff --git a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md new file mode 100644 index 0000000..7311e59 --- /dev/null +++ b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md @@ -0,0 +1,37 @@ +# Extend FoundString data model for demangling and debug support + +## Objective + +Update the `FoundString` struct in file:src/types.rs to support symbol demangling preservation and optional score breakdown for debugging. + +## Scope + +**In Scope:** +- Add `original_text: Option` field to preserve mangled symbols +- Add optional breakdown fields: `section_weight: Option`, `semantic_boost: Option`, `noise_penalty: Option` +- Update serde serialization to handle new fields correctly +- Update existing tests to account for new fields +- Update documentation with field descriptions + +**Out of Scope:** +- Actual demangling logic (handled in separate ticket) +- Score calculation logic (handled in separate ticket) +- CLI --debug flag implementation (handled in pipeline ticket) + +## Acceptance Criteria + +- [ ] FoundString struct includes original_text field +- [ ] FoundString struct includes optional breakdown fields (section_weight, semantic_boost, noise_penalty) +- [ ] All fields properly serialize/deserialize with serde +- [ ] Existing tests updated and passing +- [ ] Documentation updated with field descriptions and usage examples +- [ ] No breaking changes to existing code that creates FoundString instances + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Data Model section) +- file:src/types.rs (FoundString definition) + +## Dependencies + +None - this is the foundational ticket that other work depends on. \ No newline at end of file diff --git a/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md b/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md new file mode 100644 index 0000000..c719c10 --- /dev/null +++ b/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md @@ -0,0 +1,53 @@ +# Implement CLI argument parsing and validation + +## Objective + +Complete the CLI interface with all filtering flags, output format selection, and proper validation. + +## Scope + +**In Scope:** +- Extend Cli struct in file:src/main.rs with all flags: + - --min-len N + - --enc ENCODING (accept: ascii, utf8, utf16, utf16le, utf16be) + - --only-tags TAGS (comma-separated) + - --notags TAGS (comma-separated) + - --top N + - --json + - --yara + - --summary + - --debug +- Implement flag validation: + - Validate encoding values + - Validate tag names (show suggestions for invalid tags) + - Detect conflicting output format flags (--json + --yara) +- Build FilterConfig from CLI arguments +- Update --help text to include available tags +- Add CLI argument parsing tests +- Wire CLI to Pipeline + +**Out of Scope:** +- Pipeline implementation (separate ticket) +- Filter execution logic (handled in Pipeline) +- Output formatting (separate ticket) + +## Acceptance Criteria + +- [ ] All CLI flags implemented using clap derive macros +- [ ] Flag validation with helpful error messages +- [ ] Conflicting output format detection (exit with error) +- [ ] Invalid tag names show suggestions +- [ ] --help includes complete list of available tags +- [ ] FilterConfig correctly built from CLI arguments +- [ ] CLI tests covering valid and invalid argument combinations +- [ ] Zero clippy warnings + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/dbfae449-b832-46a9-8fe7-748d7c5f5a20 (Core Flows - CLI Flag Reference, Flow 8: Error Handling) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Component 5) +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Task 13) + +## Dependencies + +- Ticket: "Extend FoundString data model" (needs to know about new fields) \ No newline at end of file diff --git a/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md b/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md new file mode 100644 index 0000000..638c8fb --- /dev/null +++ b/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md @@ -0,0 +1,58 @@ +# Implement enum-based output formatters (Table, JSON, YARA) + +## Objective + +Create output formatters for human-readable tables, JSONL, and YARA rules using an enum-based approach. + +## Scope + +**In Scope:** +- Create file:src/output/mod.rs with OutputFormat enum and format_output() function +- Create file:src/output/table.rs: + - TTY detection using std::io::IsTerminal + - Table formatting with columns: String | Tags | Score | Section + - Non-TTY plain text output (one string per line) + - Primary tag display with comma-separation for same-priority tags +- Create file:src/output/json.rs: + - JSONL output (one JSON object per line) + - Include all FoundString fields + - Include original_text if present + - Include breakdown fields only if populated +- Create file:src/output/yara.rs: + - Complete YARA rule template generation + - Binary filename sanitization for rule names + - String escaping for YARA syntax + - Skip strings over 200 chars with comment + - Include metadata section (description, tool, date) +- Add comprehensive unit tests for each formatter +- Add integration tests with insta snapshots + +**Out of Scope:** +- CLI integration (separate ticket) +- Summary statistics formatting (handled in pipeline ticket) +- Progress feedback (separate ticket) + +## Acceptance Criteria + +- [ ] OutputFormat enum with Table, Json, Yara variants +- [ ] format_output() function with enum-based dispatch +- [ ] Table formatter with TTY detection and proper column alignment +- [ ] JSON formatter with complete field serialization +- [ ] YARA formatter with proper escaping and rule template +- [ ] All formatters handle edge cases (empty results, very long strings, special characters) +- [ ] Comprehensive unit tests for each formatter +- [ ] Integration tests with insta snapshots +- [ ] Each file under 500 lines +- [ ] Zero clippy warnings + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/24940fed-1cc7-4d17-bc4b-fb5558c6f827 (Epic Brief - Problem 4: Integration Barriers) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/dbfae449-b832-46a9-8fe7-748d7c5f5a20 (Core Flows - Flows 4, 5, 6) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Component 4) +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Task 12) + +## Dependencies + +- Ticket: "Extend FoundString data model" (needs all fields for serialization) +- Ticket: "Implement ranking system" (needs scores for output) \ No newline at end of file diff --git a/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md b/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md new file mode 100644 index 0000000..d9ffb2b --- /dev/null +++ b/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md @@ -0,0 +1,50 @@ +# Implement ranking system with configurable scoring + +## Objective + +Create the ranking system that calculates relevance scores for strings based on section weights, semantic tags, and noise penalties. + +## Scope + +**In Scope:** +- Create file:src/classification/ranking.rs with RankingEngine +- Define RankingConfig with hardcoded defaults: + - Section weight mappings (SectionType -> i32) + - Tag boost mappings (Tag -> i32) + - Noise penalty configuration +- Implement score calculation algorithm: + - Apply section weights from ContainerInfo + - Apply semantic boosts based on tags + - Calculate noise penalties from confidence scores + - Populate score breakdown fields when debug mode enabled +- Implement rank_strings() to sort by score +- Add comprehensive unit tests with known scoring scenarios +- Split into submodules if exceeds 500 lines (section_weights.rs, semantic_boosts.rs, noise_penalties.rs) + +**Out of Scope:** +- User-configurable ranking (hardcoded defaults only) +- CLI integration (separate ticket) +- Output formatting (separate ticket) + +## Acceptance Criteria + +- [ ] RankingEngine struct implemented with new() and rank_strings() methods +- [ ] RankingConfig with sensible hardcoded defaults +- [ ] Score calculation populates final score and breakdown fields +- [ ] Strings sorted by score in descending order +- [ ] Breakdown fields populated only when debug mode enabled +- [ ] Comprehensive unit tests covering various scoring scenarios +- [ ] File size under 500 lines (split if needed) +- [ ] Zero clippy warnings + +## References + +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/24940fed-1cc7-4d17-bc4b-fb5558c6f827 (Epic Brief - Problem 3: No Prioritization) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/dbfae449-b832-46a9-8fe7-748d7c5f5a20 (Core Flows - Score Range) +- spec:f7d1261c-26d8-423a-8211-2cead3688bb0/04e2c976-db88-4de2-b59f-72f841ef2767 (Technical Plan - Component 1) +- file:.kiro/specs/stringy-binary-analyzer/tasks.md (Task 11) + +## Dependencies + +- Ticket: "Extend FoundString data model" (needs breakdown fields) +- Ticket: "Complete semantic classification" (needs tags for semantic boosts) \ No newline at end of file From 4d30cbf2a6ed4134ca36d6018f0d295eb08cd292 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:30:47 -0500 Subject: [PATCH 15/19] chore: add MSRV check to CI workflow Signed-off-by: UncleSp1d3r --- .github/workflows/ci.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 418c817..3175728 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,6 +68,21 @@ jobs: - name: Run clippy (all features) run: cargo clippy --all-targets --all-features -- -D warnings + # MSRV (Minimum Supported Rust Version) check + msrv: + runs-on: ubuntu-latest + needs: changes + if: needs.changes.outputs.rust == 'true' + steps: + - uses: actions/checkout@v6 + - uses: dtolnay/rust-toolchain@1.91.0 + + - name: Cache Rust dependencies + uses: Swatinem/rust-cache@v2 + + - name: Check MSRV compliance + run: cargo check --all-features + test: runs-on: ubuntu-latest needs: changes From 3bdbf530294230e36b9e851af1b50c17f5110f11 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:30:58 -0500 Subject: [PATCH 16/19] chore: update character restrictions in copilot instructions Signed-off-by: UncleSp1d3r --- .github/copilot-instructions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index ae6aad9..861cb51 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -7,7 +7,7 @@ Stringy is a **smarter strings tool** for extracting meaningful strings from ELF ## Architecture & Data Flow ```text -Binary → Format Detection (goblin) → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output +Binary -> Format Detection (goblin) -> Container Parsing -> String Extraction -> Deduplication -> Classification -> Ranking -> Output ``` ### Module Organization @@ -27,7 +27,7 @@ Binary → Format Detection (goblin) → Container Parsing → String Extraction - **Rust 2024 Edition**: MSRV 1.85+, always use latest edition features - **File size limit**: Keep files \<=500-600 lines; split larger files into focused modules - **No blanket `#[allow]`**: Any `allow` attribute requires inline justification and cannot apply to entire files/modules -- **Character restrictions**: Never use emojis, em-dashes (—), or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) +- **Character restrictions**: Never use emojis, em-dashes, or other non-Latin characters in code or documentation. Use standard ASCII punctuation (hyphens, quotes, etc.) ### Error Handling with `thiserror` From d22a553d7484a982f64c4af21fba758a3a38b6c4 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:33:51 -0500 Subject: [PATCH 17/19] chore: update documentation and improve formatting - Adjusted numbering format in user interaction flows for consistency. - Enhanced clarity in the Epic Brief by refining problem statements. - Improved formatting in the Technical Plan for better readability. - Updated performance optimization ticket to reflect new dependencies. - Enhanced clarity in the pipeline orchestration ticket scope. - Refined semantic classification ticket to include additional patterns. - Improved integration testing ticket to cover more binary types. - Updated FoundString data model ticket for clarity on new fields. - Enhanced CLI argument parsing ticket to include all flags. - Improved output formatters ticket to clarify scope and dependencies. - Refined ranking system ticket to ensure clarity on scoring logic. Signed-off-by: UncleSp1d3r --- ...e_Flows__Stringy_v1.0_User_Interactions.md | 79 +++++++++-------- .../Epic_Brief__Stringy_v1.0_Completion.md | 16 ++-- ...nical_Plan__Stringy_v1.0_Implementation.md | 87 ++++++++++--------- ...ce_optimizations_and_dependency_updates.md | 4 +- ...on_with_filtering_and_progress_feedback.md | 22 ++--- ...with_new_patterns_and_symbol_demangling.md | 4 +- ...-end_integration_testing_and_validation.md | 4 +- ..._model_for_demangling_and_debug_support.md | 4 +- ...ent_CLI_argument_parsing_and_validation.md | 4 +- ...d_output_formatters_(Table,_JSON,_YARA).md | 4 +- ...anking_system_with_configurable_scoring.md | 4 +- 11 files changed, 128 insertions(+), 104 deletions(-) diff --git a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md index 09e77db..5da1ef7 100644 --- a/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md +++ b/project_plan/specs/Core_Flows__Stringy_v1.0_User_Interactions.md @@ -64,15 +64,15 @@ sequenceDiagram **Steps:** -1. User invokes Stringy with binary file path -2. System displays "Parsing..." to stderr -3. System detects format (ELF/PE/Mach-O) and parses container structure -4. System displays "Extracting..." to stderr -5. System extracts strings from appropriate sections using format knowledge -6. System displays "Classifying..." to stderr -7. System applies semantic classification (URLs, IPs, paths, GUIDs, etc.) -8. System demangles Rust/C++ symbols to readable form -9. System displays "Ranking..." to stderr +01. User invokes Stringy with binary file path +02. System displays "Parsing..." to stderr +03. System detects format (ELF/PE/Mach-O) and parses container structure +04. System displays "Extracting..." to stderr +05. System extracts strings from appropriate sections using format knowledge +06. System displays "Classifying..." to stderr +07. System applies semantic classification (URLs, IPs, paths, GUIDs, etc.) +08. System demangles Rust/C++ symbols to readable form +09. System displays "Ranking..." to stderr 10. System calculates scores based on section weights, semantic boosts, and noise penalties 11. System sorts strings by score (descending) 12. System outputs table to stdout with columns: String | Tags | Score | Section @@ -108,9 +108,10 @@ core::fmt::Display::fmt export 85 .text 1. User invokes Stringy with filtering flags 2. System performs standard analysis pipeline (Parsing... Extracting... Classifying... Ranking...) 3. System applies filters with AND logic: - - String must have tag "url" OR "ipv4" - - String length must be >= 10 characters - - String encoding must be UTF-16 + - String must have tag "url" OR "ipv4" + - String length must be >= 10 characters + - String encoding must be UTF-16 + 4. System outputs filtered results in table format 5. If no strings match filters, system displays to stderr: "Analyzed 1,234 strings, 0 matched filters" 6. If strings match, system outputs table with matching strings only @@ -124,7 +125,7 @@ core::fmt::Display::fmt export 85 .text **Exit:** Returns exit code 0 (even if no matches) -**Conflicting Flags:** +**Conflicting Flags:**\ If multiple output format flags are specified (e.g., --json and --yara), the system displays an error: - Output to stderr: "Error: Cannot specify multiple output formats (--json, --yara)" @@ -201,10 +202,11 @@ If multiple output format flags are specified (e.g., --json and --yara), the sys 3. Progress indicators go to stderr 4. System generates complete YARA rule template to stdout 5. Rule includes: - - Rule name (derived from binary filename) - - Metadata section (file hash, analysis date, tool version) - - Strings section with properly escaped strings - - Condition section (basic template) + - Rule name (derived from binary filename) + - Metadata section (file hash, analysis date, tool version) + - Strings section with properly escaped strings + - Condition section (basic template) + 6. Strings are escaped according to YARA syntax rules 7. Very long strings (>200 chars) are truncated with comment @@ -234,7 +236,7 @@ rule binary_strings { - Rule name derived from binary filename - Non-alphanumeric characters replaced with underscore - File extension removed -- Suffix "_strings" added +- Suffix `_strings` added - Example: "binary.exe" becomes "binary_strings", "my-app.dll" becomes "my_app_strings" **YARA String Handling:** @@ -290,15 +292,15 @@ core::fmt::Display::fmt 2. System performs standard analysis pipeline 3. System outputs results in standard format (table or JSON) 4. After results, system outputs summary to stdout: - - Binary format detected - - Total strings extracted - - Strings after filtering (if filters applied) - - Top tags found - - Analysis time + - Binary format detected + - Total strings extracted + - Strings after filtering (if filters applied) + - Top tags found + - Analysis time **Summary Format:** -``` +```text [Results table here] Summary: @@ -356,21 +358,19 @@ Summary: ## CLI Flag Reference - -| Flag | Description | Example | -| ------------------ | ------------------------------------------------------------------------------ | ---------------------- | -| `--min-len N` | Minimum string length | `--min-len 10` | -| `--enc ENCODING` | Filter by encoding. Accepts: ascii, utf8, utf16 (both LE/BE), utf16le, utf16be | `--enc utf16` | -| `--only-tags TAGS` | Include only specified tags (comma-separated) | `--only-tags url,ipv4` | -| `--notags TAGS` | Exclude specified tags | `--notags debug,test` | -| `--top N` | Show only top N results | `--top 50` | -| `--json` | Output JSONL format | `--json` | -| `--yara` | Output YARA rule template | `--yara` | -| `--summary` | Show summary statistics | `--summary` | +| Flag | Description | Example | +| ------------------ | --------------------------------------------------------------------------------- | ---------------------- | +| `--min-len N` | Minimum string length | `--min-len 10` | +| `--enc ENCODING` | Filter by encoding. Accepts: ascii, utf8, utf16 (both LE/BE), utf16le, utf16be | `--enc utf16` | +| `--only-tags TAGS` | Include only specified tags (comma-separated) | `--only-tags url,ipv4` | +| `--notags TAGS` | Exclude specified tags | `--notags debug,test` | +| `--top N` | Show only top N results | `--top 50` | +| `--json` | Output JSONL format | `--json` | +| `--yara` | Output YARA rule template | `--yara` | +| `--summary` | Show summary statistics | `--summary` | | `--debug` | Include score breakdown in output (section_weight, semantic_boost, noise_penalty) | `--debug` | -| `--help` | Show help including available tags | `--help` | -| `--version` | Show version information | `--version` | - +| `--help` | Show help including available tags | `--help` | +| `--version` | Show version information | `--version` | **Available Tags (shown in --help):** @@ -402,7 +402,7 @@ Summary: - Score: 6 characters (right-aligned integer) - Section: 20 characters (section name) -**Tag Priority (for display):** +**Tag Priority (for display):**\ When multiple tags exist, show tags from the highest priority level. If multiple tags exist at the same priority level, show them comma-separated (e.g., "url,ipv4"). Priority order: @@ -422,4 +422,3 @@ Priority order: - 70-89: Meaningful strings (file paths, exports) - 50-69: Moderate relevance (general strings) - Below 50: Low relevance (potential noise) - diff --git a/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md b/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md index c83fd8f..448cf02 100644 --- a/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md +++ b/project_plan/specs/Epic_Brief__Stringy_v1.0_Completion.md @@ -9,6 +9,7 @@ Stringy v1.0 completion transforms a promising foundation into a production-read ### Who's Affected **Primary Users:** + - **Security researchers and malware analysts** who need to quickly identify indicators of compromise (IOCs), behavioral patterns, and malicious infrastructure in binaries - **Reverse engineers** who need to understand binary functionality, dependencies, and internal structure through string analysis - **Open-source community** seeking a modern, intelligent alternative to decades-old tools like GNU strings @@ -18,17 +19,14 @@ These users share a common need: efficient, intelligent binary analysis that sur ### Current Pain Points -**Problem 1: Signal vs Noise** -Traditional tools like GNU strings extract every printable character sequence, producing thousands of results where 90%+ are meaningless - padding bytes, binary tables, random data that happens to be printable. Users must manually scan through this noise to find the 10% that matters: URLs, file paths, registry keys, function names. This manual filtering is time-consuming, error-prone, and doesn't scale. +**Problem 1: Signal vs Noise** Traditional tools like GNU strings extract every printable character sequence, producing thousands of results where 90%+ are meaningless - padding bytes, binary tables, random data that happens to be printable. Users must manually scan through this noise to find the 10% that matters: URLs, file paths, registry keys, function names. This manual filtering is time-consuming, error-prone, and doesn't scale. + +**Problem 2: Cryptic Symbols** Modern binaries contain mangled symbols (especially Rust, C++) that appear as cryptic strings like `_ZN4core3fmt3num52_$LT$impl$`. Without demangling, users cannot understand what functions are called, what libraries are used, or what the binary actually does. This forces users to copy-paste symbols into external demangling tools, breaking their workflow. -**Problem 2: Cryptic Symbols** -Modern binaries contain mangled symbols (especially Rust, C++) that appear as cryptic strings like `_ZN4core3fmt3num52_$LT$impl$`. Without demangling, users cannot understand what functions are called, what libraries are used, or what the binary actually does. This forces users to copy-paste symbols into external demangling tools, breaking their workflow. +**Problem 3: No Prioritization** Even when users find potentially interesting strings, they have no way to know which ones are most important. Is this URL critical infrastructure or a help link? Is this file path a configuration file or a debug artifact? Without ranking and context, users waste time investigating low-value strings while missing critical indicators. -**Problem 3: No Prioritization** -Even when users find potentially interesting strings, they have no way to know which ones are most important. Is this URL critical infrastructure or a help link? Is this file path a configuration file or a debug artifact? Without ranking and context, users waste time investigating low-value strings while missing critical indicators. +**Problem 4: Integration Barriers** Users cannot integrate the current incomplete tool into automated workflows because it lacks: -**Problem 4: Integration Barriers** -Users cannot integrate the current incomplete tool into automated workflows because it lacks: - Structured output formats (JSON) for programmatic consumption - Filtering capabilities to focus on specific string types - Reliable, production-ready behavior @@ -54,6 +52,7 @@ The gaps exist across the entire analysis pipeline: **Current State:** Stringy has a solid foundation with format detection, container parsing, and basic string extraction working. Users can extract strings from ELF, PE, and Mach-O binaries with encoding awareness (ASCII, UTF-16). However, the output is raw and unprocessed - similar to GNU strings but with better encoding support. **Desired State:** Users run Stringy on any binary and immediately see: + - All meaningful strings ranked by importance (format-aware filtering removes noise) - Automatic semantic tags (URL, filepath, ipv4, guid, etc.) highlighting what each string represents - Demangled symbols showing actual function and type names @@ -65,6 +64,7 @@ The gaps exist across the entire analysis pipeline: ### Success Criteria When v1.0 is complete, users will be able to: + 1. Run Stringy on a binary and immediately see the most relevant strings ranked by importance 2. Quickly identify IOCs, file paths, URLs, and other semantic patterns without manual searching 3. Understand binary functionality through demangled symbols and import/export analysis diff --git a/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md b/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md index 8cbf585..a6360b8 100644 --- a/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md +++ b/project_plan/specs/Technical_Plan__Stringy_v1.0_Implementation.md @@ -140,21 +140,21 @@ pub struct FoundString { pub section: Option, pub length: u32, pub tags: Vec, - pub score: i32, // Final calculated score + pub score: i32, // Final calculated score pub source: StringSource, pub confidence: f32, - + // New fields for symbol demangling - pub original_text: Option, // Original mangled form (if demangled) - + pub original_text: Option, // Original mangled form (if demangled) + // Optional debug fields (only populated with --debug flag) - pub section_weight: Option, // Score from section type - pub semantic_boost: Option, // Bonus from semantic tags - pub noise_penalty: Option, // Penalty from noise detection + pub section_weight: Option, // Score from section type + pub semantic_boost: Option, // Bonus from semantic tags + pub noise_penalty: Option, // Penalty from noise detection } ``` -**Rationale**: +**Rationale**: - `original_text` preserves the mangled symbol for cross-referencing and recovery - Breakdown fields (section_weight, semantic_boost, noise_penalty) are optional to avoid exposing internal implementation details in the public API @@ -231,19 +231,31 @@ Add new variants to existing `Tag` enum in file:src/types.rs with specificity le ```rust pub enum Tag { // Existing tags (specific) - Url, Domain, IPv4, IPv6, FilePath, RegistryPath, - Import, Export, Version, Manifest, Resource, - DylibPath, Rpath, RpathVariable, FrameworkPath, - + Url, + Domain, + IPv4, + IPv6, + FilePath, + RegistryPath, + Import, + Export, + Version, + Manifest, + Resource, + DylibPath, + Rpath, + RpathVariable, + FrameworkPath, + // New specific tags for v1.0 - Guid, // GUIDs/UUIDs (specific) - Email, // Email addresses (specific) - FormatString, // Printf-style format strings (specific) - UserAgent, // User agent strings (specific) + Guid, // GUIDs/UUIDs (specific) + Email, // Email addresses (specific) + FormatString, // Printf-style format strings (specific) + UserAgent, // User agent strings (specific) DemangledSymbol, // Demangled Rust/C++ symbols (specific) - + // Broad/ambiguous tags - Base64, // Base64-encoded data (broad - many false positives) + Base64, // Base64-encoded data (broad - many false positives) } ``` @@ -368,7 +380,7 @@ impl SymbolDemangler { **YARA Formatter** (src/output/yara.rs): - Generate complete YARA rule template -- Sanitize binary filename for rule name (replace non-alphanumeric with underscore, remove extension, add "_strings" suffix) +- Sanitize binary filename for rule name (replace non-alphanumeric with underscore, remove extension, add `_strings` suffix) - Include metadata section (description, tool, date, file hash) - Escape strings according to YARA syntax (backslashes, quotes, newlines) - Skip strings over 200 characters with comment: "// Skipped: too long (N chars)" @@ -385,7 +397,7 @@ impl SymbolDemangler { ```rust pub struct Pipeline { config: PipelineConfig, - progress: ProgressBar, // from indicatif + progress: ProgressBar, // from indicatif } pub struct PipelineConfig { @@ -403,15 +415,15 @@ impl Pipeline { **Workflow**: -1. Display "Parsing..." progress indicator -2. Attempt memory-map file using `memmap2`, fall back to `std::fs::read()` on failure -3. Detect format and parse container (fail fast on error) -4. Display "Extracting..." progress indicator -5. Extract strings using `BasicExtractor` (fail fast on critical errors) -6. Display "Classifying..." progress indicator -7. Apply semantic classification (graceful degradation - skip failed strings) -8. Apply symbol demangling (graceful degradation - keep original on failure) -9. Display "Ranking..." progress indicator +01. Display "Parsing..." progress indicator +02. Attempt memory-map file using `memmap2`, fall back to `std::fs::read()` on failure +03. Detect format and parse container (fail fast on error) +04. Display "Extracting..." progress indicator +05. Extract strings using `BasicExtractor` (fail fast on critical errors) +06. Display "Classifying..." progress indicator +07. Apply semantic classification (graceful degradation - skip failed strings) +08. Apply symbol demangling (graceful degradation - keep original on failure) +09. Display "Ranking..." progress indicator 10. Calculate scores using `RankingEngine` (populate breakdown fields if debug_mode) 11. Apply filters from FilterConfig (min-len, encoding, tags) 12. Sort by score and apply --top limit @@ -475,15 +487,13 @@ impl Pipeline { ### Integration Points Summary - -| Component | Consumes | Produces | Integration Point | -| ------------------ | ------------------------------------ | ----------------------------- | ----------------------------------------------- | -| Pipeline | CLI args, file path | Formatted output | Orchestrates all components + filtering | -| RankingEngine | Vec<FoundString>, debug flag | Scored Vec<FoundString> | Called after classification | -| SymbolDemangler | &mut FoundString | () | Called during classification, modifies in-place | -| SemanticClassifier | FoundString | Vec<Tag> | Extended with new patterns | -| format_output() | OutputFormat, Vec<FoundString> | String | Enum-based dispatch to formatters | - +| Component | Consumes | Produces | Integration Point | +| ------------------ | ------------------------------- | ------------------------ | ----------------------------------------------- | +| Pipeline | CLI args, file path | Formatted output | Orchestrates all components + filtering | +| RankingEngine | Vec\, debug flag | Scored Vec\ | Called after classification | +| SymbolDemangler | &mut FoundString | () | Called during classification, modifies in-place | +| SemanticClassifier | FoundString | Vec\ | Extended with new patterns | +| format_output() | OutputFormat, Vec\ | String | Enum-based dispatch to formatters | ### Testing Strategy @@ -507,4 +517,3 @@ impl Pipeline { - Regex pattern matching performance - Memory mapping vs regular file I/O - Overall pipeline throughput - diff --git a/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md b/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md index aea53d6..1465b49 100644 --- a/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md +++ b/project_plan/tickets/Add_performance_optimizations_and_dependency_updates.md @@ -7,6 +7,7 @@ Integrate performance optimizations including memory mapping with fallback, rege ## Scope **In Scope:** + - Add dependencies to Cargo.toml: - memmap2 for memory-mapped file I/O - once_cell for modern lazy initialization @@ -22,6 +23,7 @@ Integrate performance optimizations including memory mapping with fallback, rege - Update documentation with performance characteristics **Out of Scope:** + - Pipeline implementation (separate ticket, but this ticket provides the tools) - Classification implementation (separate ticket, but this ticket provides regex caching) @@ -44,4 +46,4 @@ Integrate performance optimizations including memory mapping with fallback, rege ## Dependencies -None - this ticket provides infrastructure for other tickets but can be implemented independently. \ No newline at end of file +None - this ticket provides infrastructure for other tickets but can be implemented independently. diff --git a/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md b/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md index 50b2cff..d02b187 100644 --- a/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md +++ b/project_plan/tickets/Build_Pipeline_orchestration_with_filtering_and_progress_feedback.md @@ -7,17 +7,18 @@ Create the Pipeline struct that orchestrates the entire analysis workflow, inclu ## Scope **In Scope:** + - Create Pipeline struct in file:src/main.rs with PipelineConfig - Implement 14-step workflow: - 1. Progress indicator setup (indicatif) - 2. Memory-mapped file reading with fallback to regular read - 3. Format detection and container parsing (fail fast) - 4. String extraction (fail fast on critical errors) - 5. Semantic classification (graceful degradation) - 6. Symbol demangling (graceful degradation) - 7. Ranking with optional debug breakdown - 8. Filtering (min-len, encoding, tags) - 9. Sorting and top-N limiting + 01. Progress indicator setup (indicatif) + 02. Memory-mapped file reading with fallback to regular read + 03. Format detection and container parsing (fail fast) + 04. String extraction (fail fast on critical errors) + 05. Semantic classification (graceful degradation) + 06. Symbol demangling (graceful degradation) + 07. Ranking with optional debug breakdown + 08. Filtering (min-len, encoding, tags) + 09. Sorting and top-N limiting 10. Output formatting - Implement FilterConfig struct for CLI filter parameters - Implement stage-specific error recovery: @@ -29,6 +30,7 @@ Create the Pipeline struct that orchestrates the entire analysis workflow, inclu - Keep main.rs under 500 lines **Out of Scope:** + - CLI argument parsing (handled in same file but separate concern) - Individual component implementations (separate tickets) - Output formatter implementations (separate ticket) @@ -57,4 +59,4 @@ Create the Pipeline struct that orchestrates the entire analysis workflow, inclu - Ticket: "Complete semantic classification" (needs classification logic) - Ticket: "Implement ranking system" (needs RankingEngine) -- Ticket: "Implement output formatters" (needs formatters) \ No newline at end of file +- Ticket: "Implement output formatters" (needs formatters) diff --git a/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md b/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md index 67f3631..a25a6b0 100644 --- a/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md +++ b/project_plan/tickets/Complete_semantic_classification_with_new_patterns_and_symbol_demangling.md @@ -7,6 +7,7 @@ Extend the semantic classification system to detect all required patterns (GUIDs ## Scope **In Scope:** + - Extend file:src/classification/semantic.rs with new pattern detection: - GUID pattern matching - Email address validation @@ -24,6 +25,7 @@ Extend the semantic classification system to detect all required patterns (GUIDs - Split file:src/classification/semantic.rs if it exceeds 500 lines **Out of Scope:** + - Ranking/scoring logic (separate ticket) - CLI integration (separate ticket) - Output formatting (separate ticket) @@ -49,4 +51,4 @@ Extend the semantic classification system to detect all required patterns (GUIDs ## Dependencies -- Ticket: "Extend FoundString data model" (needs original_text field) \ No newline at end of file +- Ticket: "Extend FoundString data model" (needs original_text field) diff --git a/project_plan/tickets/End-to-end_integration_testing_and_validation.md b/project_plan/tickets/End-to-end_integration_testing_and_validation.md index 223f998..6fd806c 100644 --- a/project_plan/tickets/End-to-end_integration_testing_and_validation.md +++ b/project_plan/tickets/End-to-end_integration_testing_and_validation.md @@ -7,6 +7,7 @@ Create comprehensive integration tests that validate the complete pipeline with ## Scope **In Scope:** + - Expand tests/fixtures/ with diverse test binaries: - ELF binaries with various string types - PE binaries with resources and imports @@ -24,6 +25,7 @@ Create comprehensive integration tests that validate the complete pipeline with - Add performance benchmarks for complete pipeline **Out of Scope:** + - Unit tests for individual components (handled in component tickets) - Implementation of components (separate tickets) @@ -48,4 +50,4 @@ Create comprehensive integration tests that validate the complete pipeline with ## Dependencies -- All other tickets (this validates the complete implementation) \ No newline at end of file +- All other tickets (this validates the complete implementation) diff --git a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md index 7311e59..3a2da39 100644 --- a/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md +++ b/project_plan/tickets/Extend_FoundString_data_model_for_demangling_and_debug_support.md @@ -7,6 +7,7 @@ Update the `FoundString` struct in file:src/types.rs to support symbol demanglin ## Scope **In Scope:** + - Add `original_text: Option` field to preserve mangled symbols - Add optional breakdown fields: `section_weight: Option`, `semantic_boost: Option`, `noise_penalty: Option` - Update serde serialization to handle new fields correctly @@ -14,6 +15,7 @@ Update the `FoundString` struct in file:src/types.rs to support symbol demanglin - Update documentation with field descriptions **Out of Scope:** + - Actual demangling logic (handled in separate ticket) - Score calculation logic (handled in separate ticket) - CLI --debug flag implementation (handled in pipeline ticket) @@ -34,4 +36,4 @@ Update the `FoundString` struct in file:src/types.rs to support symbol demanglin ## Dependencies -None - this is the foundational ticket that other work depends on. \ No newline at end of file +None - this is the foundational ticket that other work depends on. diff --git a/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md b/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md index c719c10..f468910 100644 --- a/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md +++ b/project_plan/tickets/Implement_CLI_argument_parsing_and_validation.md @@ -7,6 +7,7 @@ Complete the CLI interface with all filtering flags, output format selection, an ## Scope **In Scope:** + - Extend Cli struct in file:src/main.rs with all flags: - --min-len N - --enc ENCODING (accept: ascii, utf8, utf16, utf16le, utf16be) @@ -27,6 +28,7 @@ Complete the CLI interface with all filtering flags, output format selection, an - Wire CLI to Pipeline **Out of Scope:** + - Pipeline implementation (separate ticket) - Filter execution logic (handled in Pipeline) - Output formatting (separate ticket) @@ -50,4 +52,4 @@ Complete the CLI interface with all filtering flags, output format selection, an ## Dependencies -- Ticket: "Extend FoundString data model" (needs to know about new fields) \ No newline at end of file +- Ticket: "Extend FoundString data model" (needs to know about new fields) diff --git a/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md b/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md index 638c8fb..f034756 100644 --- a/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md +++ b/project_plan/tickets/Implement_enum-based_output_formatters_(Table,_JSON,_YARA).md @@ -7,6 +7,7 @@ Create output formatters for human-readable tables, JSONL, and YARA rules using ## Scope **In Scope:** + - Create file:src/output/mod.rs with OutputFormat enum and format_output() function - Create file:src/output/table.rs: - TTY detection using std::io::IsTerminal @@ -28,6 +29,7 @@ Create output formatters for human-readable tables, JSONL, and YARA rules using - Add integration tests with insta snapshots **Out of Scope:** + - CLI integration (separate ticket) - Summary statistics formatting (handled in pipeline ticket) - Progress feedback (separate ticket) @@ -55,4 +57,4 @@ Create output formatters for human-readable tables, JSONL, and YARA rules using ## Dependencies - Ticket: "Extend FoundString data model" (needs all fields for serialization) -- Ticket: "Implement ranking system" (needs scores for output) \ No newline at end of file +- Ticket: "Implement ranking system" (needs scores for output) diff --git a/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md b/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md index d9ffb2b..cb82b65 100644 --- a/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md +++ b/project_plan/tickets/Implement_ranking_system_with_configurable_scoring.md @@ -7,6 +7,7 @@ Create the ranking system that calculates relevance scores for strings based on ## Scope **In Scope:** + - Create file:src/classification/ranking.rs with RankingEngine - Define RankingConfig with hardcoded defaults: - Section weight mappings (SectionType -> i32) @@ -22,6 +23,7 @@ Create the ranking system that calculates relevance scores for strings based on - Split into submodules if exceeds 500 lines (section_weights.rs, semantic_boosts.rs, noise_penalties.rs) **Out of Scope:** + - User-configurable ranking (hardcoded defaults only) - CLI integration (separate ticket) - Output formatting (separate ticket) @@ -47,4 +49,4 @@ Create the ranking system that calculates relevance scores for strings based on ## Dependencies - Ticket: "Extend FoundString data model" (needs breakdown fields) -- Ticket: "Complete semantic classification" (needs tags for semantic boosts) \ No newline at end of file +- Ticket: "Complete semantic classification" (needs tags for semantic boosts) From 4113f60838f5f36987a014ba3c541bcb00b4c87b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:34:08 -0500 Subject: [PATCH 18/19] chore: update directory structure path in analysis Signed-off-by: UncleSp1d3r --- codebase_analysis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codebase_analysis.md b/codebase_analysis.md index 382d293..7af0617 100644 --- a/codebase_analysis.md +++ b/codebase_analysis.md @@ -28,7 +28,7 @@ ## 2. Directory Structure Analysis ```text -D:\Stringy\ +./ |-- .github/ | |-- copilot-instructions.md # AI agent guidelines | |-- dependabot.yml # Dependency updates From 8b9765721f2c7035393ab28ef9cb9fe01365ea2a Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sat, 17 Jan 2026 21:35:02 -0500 Subject: [PATCH 19/19] chore: update Cargo.toml and rust-toolchain for Rust 1.91 - Bump rust-version in Cargo.toml to 1.91 - Update rust-toolchain.toml to use channel 1.91.0 - Enhance semantic classification for Windows paths with case-insensitivity and additional validation checks - Add new test file for let chains example Signed-off-by: UncleSp1d3r --- Cargo.toml | 50 +++++++-------- rust-toolchain.toml | 4 +- src/classification/semantic.rs | 108 +++++++++++++++++++++++++++++---- test_let_chains.rs | 6 ++ 4 files changed, 130 insertions(+), 38 deletions(-) create mode 100644 test_let_chains.rs diff --git a/Cargo.toml b/Cargo.toml index c1a7703..fcd24a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,15 @@ [package] -name = "stringy" -version = "0.1.0" -edition = "2024" -rust-version = "1.85" -authors = ["UncleSp1d3r "] -description = "A smarter alternative to the strings command that leverages format-specific knowledge" -license = "Apache-2.0" -repository = "https://github.com/EvilBit-Labs/Stringy" -homepage = "http://evilbitlabs.io/Stringy/" -keywords = ["binary", "strings", "analysis", "reverse-engineering", "malware"] -categories = ["command-line-utilities", "development-tools"] +name = "stringy" +version = "0.1.0" +edition = "2024" +rust-version = "1.91" +authors = [ "UncleSp1d3r " ] +description = "A smarter alternative to the strings command that leverages format-specific knowledge" +license = "Apache-2.0" +repository = "https://github.com/EvilBit-Labs/Stringy" +homepage = "http://evilbitlabs.io/Stringy/" +keywords = [ "binary", "strings", "analysis", "reverse-engineering", "malware" ] +categories = [ "command-line-utilities", "development-tools" ] [lib] name = "stringy" @@ -20,34 +20,34 @@ name = "stringy" path = "src/main.rs" [dependencies] -clap = { version = "4.5.54", features = ["derive"] } -entropy = "0.4.2" -goblin = "0.10.4" +clap = { version = "4.5.54", features = [ "derive" ] } +entropy = "0.4.2" +goblin = "0.10.4" lazy_static = "1.5" -pelite = "0.10.0" -regex = "1.12.2" -serde = { version = "1.0.228", features = ["derive"] } -serde_json = "1.0.148" -thiserror = "2.0.17" +pelite = "0.10.0" +regex = "1.12.2" +serde = { version = "1.0.228", features = [ "derive" ] } +serde_json = "1.0.148" +thiserror = "2.0.17" [dev-dependencies] criterion = "0.8.1" -insta = "1.46.0" -tempfile = "3.24.0" +insta = "1.46.0" +tempfile = "3.24.0" # The profile that 'dist' will build with [profile.dist] inherits = "release" -lto = "thin" +lto = "thin" [[bench]] -name = "elf" +name = "elf" harness = false [[bench]] -name = "pe" +name = "pe" harness = false [[bench]] -name = "ascii_extraction" +name = "ascii_extraction" harness = false diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 02d090f..d7b41d3 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "stable" -components = ["rustfmt", "clippy"] \ No newline at end of file +channel = "1.91.0" +components = [ "rustfmt", "clippy" ] diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 23ab9eb..2740f2f 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -112,13 +112,13 @@ lazy_static! { /// Regular expression for matching full Windows registry paths /// - /// Pattern matches registry paths starting with HKEY_ root keys. - static ref REGISTRY_PATH_REGEX: Regex = Regex::new(r"^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap(); + /// Pattern matches registry paths starting with HKEY_ root keys (case-insensitive). + static ref REGISTRY_PATH_REGEX: Regex = Regex::new(r"(?i)^HKEY_[A-Z_]+\\[^\x00\n\r]*").unwrap(); /// Regular expression for matching abbreviated registry paths /// - /// Pattern matches abbreviated registry forms like HKLM, HKCU, etc. - static ref REGISTRY_ABBREV_REGEX: Regex = Regex::new(r"^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap(); + /// Pattern matches abbreviated registry forms like HKLM, HKCU, etc. (case-insensitive). + static ref REGISTRY_ABBREV_REGEX: Regex = Regex::new(r"(?i)^HK(LM|CU|CR|U|CC)\\[^\x00\n\r]*").unwrap(); } lazy_static! { @@ -684,20 +684,84 @@ impl SemanticClassifier { /// Detects UNC network paths in the given text /// /// Returns `Some(Tag::FilePath)` if a UNC path is detected and valid. + /// Performs robust validation including: + /// - Maximum overall length (4096) and component length (255) + /// - Control character rejection + /// - Forward slash and printf placeholder rejection + /// - Reserved name and dots-only component rejection + /// - Empty segment detection pub fn classify_unc_path(&self, text: &str) -> Option { if !UNC_PATH_REGEX.is_match(text) { return None; } - let trimmed = text.trim_start_matches('\\'); - let mut parts = trimmed.split('\\'); - let server = parts.next().unwrap_or(""); - let share = parts.next().unwrap_or(""); + // Maximum overall length check + if text.len() > 4096 { + return None; + } + + // Reject control characters + if self.contains_control_chars(text) { + return None; + } + + // Reject forward slashes anywhere in the path + if text.contains('/') { + return None; + } + + let trimmed = text.trim_start_matches('\\').trim_end_matches('\\'); + let parts: Vec<&str> = trimmed.split('\\').collect(); + + // Must have at least server and share + if parts.len() < 2 { + return None; + } + + let server = parts[0]; + let share = parts[1]; if server.is_empty() || share.is_empty() { return None; } + // Validate all segments (no empty segments from double backslashes) + for segment in &parts { + // Reject empty segments (from consecutive backslashes like \\\\server\\\\share) + if segment.is_empty() { + return None; + } + + // Enforce max component length (255 bytes) + if segment.len() > 255 { + return None; + } + + // Reject components consisting solely of dots (but allow dots in domain names) + // Only reject if the segment is exactly "." or ".." + if *segment == "." || *segment == ".." { + return None; + } + } + + // Reject printf-style placeholders in server or share + if self.contains_printf_placeholder(server) || self.contains_printf_placeholder(share) { + return None; + } + + // Reject reserved Windows device names in server or share + let reserved_names = [ + "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", + "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", + ]; + let server_upper = server.to_ascii_uppercase(); + let share_upper = share.to_ascii_uppercase(); + for reserved in &reserved_names { + if server_upper == *reserved || share_upper == *reserved { + return None; + } + } + Some(Tag::FilePath) } @@ -723,13 +787,15 @@ impl SemanticClassifier { .any(|prefix| text.starts_with(prefix)) } - /// Checks if the Windows path matches known suspicious locations + /// Checks if the Windows path matches known suspicious locations (case-insensitive) pub fn is_suspicious_windows_path(&self, text: &str) -> bool { + let lowered_text = text.to_ascii_lowercase(); SUSPICIOUS_WINDOWS_PATHS.iter().any(|prefix| { + let lowered_prefix = prefix.to_ascii_lowercase(); if prefix.starts_with('\\') { - text.contains(prefix) + lowered_text.contains(&lowered_prefix) } else { - text.starts_with(prefix) + lowered_text.starts_with(&lowered_prefix) } }) } @@ -775,6 +841,11 @@ impl SemanticClassifier { false } + /// Checks if text contains ASCII control characters (C0 controls: 0x00-0x1F and DEL: 0x7F) + fn contains_control_chars(&self, text: &str) -> bool { + text.bytes().any(|b| b <= 0x1F || b == 0x7F) + } + /// Validates POSIX path structure pub fn is_valid_posix_path(&self, text: &str) -> bool { if text.len() > 4096 { @@ -811,6 +882,11 @@ impl SemanticClassifier { /// Validates Windows path structure pub fn is_valid_windows_path(&self, text: &str) -> bool { + // Reject control characters early to prevent regex/prefix matching from being fooled + if self.contains_control_chars(text) { + return false; + } + if text.len() > 4096 { return false; } @@ -841,6 +917,16 @@ impl SemanticClassifier { /// Validates Windows registry path structure pub fn is_valid_registry_path(&self, text: &str) -> bool { + // Reject control characters early to prevent regex/prefix matching from being fooled + if self.contains_control_chars(text) { + return false; + } + + // Maximum length check (4096 bytes) + if text.len() > 4096 { + return false; + } + if text.contains('/') { return false; } diff --git a/test_let_chains.rs b/test_let_chains.rs new file mode 100644 index 0000000..306ad86 --- /dev/null +++ b/test_let_chains.rs @@ -0,0 +1,6 @@ +fn main() { + let x = Some(5); + if true && let Some(y) = x { + println!("y = {}", y); + } +}