diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b05b9c1..6c36d5f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -66,7 +66,7 @@ jobs: shell: bash run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh" - name: Cache dist - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: cargo-dist-cache path: ~/.cargo/bin/dist @@ -82,7 +82,7 @@ jobs: cat plan-dist-manifest.json echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: artifacts-plan-dist-manifest path: plan-dist-manifest.json @@ -135,7 +135,7 @@ jobs: run: ${{ matrix.install_dist.run }} # Get the dist-manifest - name: Fetch local artifacts - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: pattern: artifacts-* path: target/distrib/ @@ -151,7 +151,7 @@ jobs: dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json echo "dist ran successfully" - name: Attest - uses: actions/attest-build-provenance@v2 + uses: actions/attest-build-provenance@v3 with: subject-path: "target/distrib/*${{ join(matrix.targets, ', ') }}*" - id: cargo-dist @@ -168,7 +168,7 @@ jobs: cp dist-manifest.json "$BUILD_MANIFEST_NAME" - name: "Upload artifacts" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: artifacts-build-local-${{ join(matrix.targets, '_') }} path: | @@ -190,7 +190,7 @@ jobs: persist-credentials: false submodules: recursive - name: Install cached dist - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: name: cargo-dist-cache path: ~/.cargo/bin/ @@ -202,7 +202,7 @@ jobs: shell: bash # Get all the local artifacts for the global tasks to use (for e.g. checksums) - name: Fetch local artifacts - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: pattern: artifacts-* path: target/distrib/ @@ -233,7 +233,7 @@ jobs: find . -name '*.cdx.xml' | tee -a "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" - name: "Upload artifacts" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: artifacts-build-global path: | @@ -259,14 +259,14 @@ jobs: persist-credentials: false submodules: recursive - name: Install cached dist - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: name: cargo-dist-cache path: ~/.cargo/bin/ - run: chmod +x ~/.cargo/bin/dist # Fetch artifacts from scratch-storage - name: Fetch artifacts - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: pattern: artifacts-* path: target/distrib/ @@ -279,14 +279,14 @@ jobs: cat dist-manifest.json echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: # Overwrite the previous copy name: artifacts-dist-manifest path: dist-manifest.json # Create a GitHub Release while uploading all files to it - name: "Download GitHub Artifacts" - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: pattern: artifacts-* path: artifacts @@ -326,7 +326,7 @@ jobs: token: ${{ secrets.HOMEBREW_TAP_TOKEN }} # So we have access to the formula - name: Fetch homebrew formulae - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: pattern: artifacts-* path: Formula/ diff --git a/Cargo.toml b/Cargo.toml index 02a19b9..75eafad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,16 +20,17 @@ path = "src/main.rs" [dependencies] clap = { version = "4.5.51", features = ["derive"] } +entropy = "0.4.2" goblin = "0.10.3" -pelite = "0.10" +pelite = "0.10.0" serde = { version = "1.0.228", features = ["derive"] } -serde_json = "1.0" +serde_json = "1.0.145" thiserror = "2.0.17" [dev-dependencies] criterion = "0.7.0" -insta = "1.43" -tempfile = "3.23" +insta = "1.43.2" +tempfile = "3.23.0" # The profile that 'dist' will build with [profile.dist] @@ -43,3 +44,7 @@ harness = false [[bench]] name = "pe" harness = false + +[[bench]] +name = "ascii_extraction" +harness = false diff --git a/benches/ascii_extraction.rs b/benches/ascii_extraction.rs new file mode 100644 index 0000000..31710c7 --- /dev/null +++ b/benches/ascii_extraction.rs @@ -0,0 +1,203 @@ +use criterion::{Criterion, criterion_group, criterion_main}; +use std::hint::black_box; +use stringy::extraction::ascii::{AsciiExtractionConfig, extract_ascii_strings}; +use stringy::extraction::config::NoiseFilterConfig; +use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; + +fn bench_basic_extraction(c: &mut Criterion) { + // Create test data with various string patterns + let test_data = + b"Hello World\0Test String\0Another String\0Binary\x00\x01\x02Data\0More Strings\0" + .repeat(100); + let config = AsciiExtractionConfig::default(); + + c.bench_function("ascii_extraction_basic", |b| { + b.iter(|| { + let _ = extract_ascii_strings(black_box(&test_data), black_box(&config)); + }); + }); +} + +fn bench_filtered_extraction(c: &mut Criterion) { + let test_data = + b"Hello World\0Test String\0Another String\0Binary\x00\x01\x02Data\0More Strings\0" + .repeat(100); + let config = AsciiExtractionConfig::default(); + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + let context = FilterContext::default(); + + c.bench_function("ascii_extraction_with_filtering", |b| { + b.iter(|| { + let strings = extract_ascii_strings(black_box(&test_data), black_box(&config)); + for string in &strings { + let _ = filter.calculate_confidence(black_box(&string.text), black_box(&context)); + } + }); + }); +} + +fn bench_individual_filters(c: &mut Criterion) { + use stringy::extraction::filters::{ + CharDistributionFilter, ContextFilter, EntropyFilter, LengthFilter, LinguisticFilter, + NoiseFilter, RepetitionFilter, + }; + + let test_strings = vec![ + "Hello, World!", + "AAAA", + "Error: file not found", + "!!!@@@###", + "C:\\Windows\\System32", + ]; + + let char_filter = CharDistributionFilter; + let entropy_filter = EntropyFilter::new(1.5, 7.5); + let linguistic_filter = LinguisticFilter::new(0.1, 0.9); + let length_filter = LengthFilter::new(200); + let repetition_filter = RepetitionFilter::new(0.7); + let context_filter = ContextFilter; + let context = FilterContext::default(); + + c.bench_function("filter_char_distribution", |b| { + b.iter(|| { + for text in &test_strings { + let _ = char_filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); + + c.bench_function("filter_entropy", |b| { + b.iter(|| { + for text in &test_strings { + let _ = entropy_filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); + + c.bench_function("filter_linguistic", |b| { + b.iter(|| { + for text in &test_strings { + let _ = + linguistic_filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); + + c.bench_function("filter_length", |b| { + b.iter(|| { + for text in &test_strings { + let _ = length_filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); + + c.bench_function("filter_repetition", |b| { + b.iter(|| { + for text in &test_strings { + let _ = + repetition_filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); + + c.bench_function("filter_context", |b| { + b.iter(|| { + for text in &test_strings { + let _ = context_filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); +} + +fn bench_composite_filter(c: &mut Criterion) { + let test_strings = vec![ + "Hello, World!", + "AAAA", + "Error: file not found", + "!!!@@@###", + "C:\\Windows\\System32", + "https://example.com", + ]; + + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + let context = FilterContext::default(); + + c.bench_function("composite_filter_all_enabled", |b| { + b.iter(|| { + for text in &test_strings { + let _ = filter.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); + + // Test with some filters disabled + // Note: CompositeNoiseFilter doesn't expose a builder pattern, so we create a new one + // with modified enable flags. For this benchmark, we'll just use the default filter. + let filter_partial = CompositeNoiseFilter::new(&filter_config); + + c.bench_function("composite_filter_partial", |b| { + b.iter(|| { + for text in &test_strings { + let _ = filter_partial.calculate_confidence(black_box(text), black_box(&context)); + } + }); + }); +} + +fn bench_entropy_calculation(c: &mut Criterion) { + use entropy::shannon_entropy; + + let test_strings = vec![ + "Hello, World!", + "AAAA", + "Error: file not found", + "!!!@@@###", + ]; + + c.bench_function("entropy_shannon_calculation", |b| { + b.iter(|| { + for text in &test_strings { + let _ = shannon_entropy(black_box(text.as_bytes())); + } + }); + }); +} + +fn bench_large_binary(c: &mut Criterion) { + // Create a large binary-like data with embedded strings + let mut large_data = Vec::new(); + for i in 0..10000 { + if i % 100 == 0 { + large_data.extend_from_slice(b"Hello World\0"); + } else { + large_data.push((i % 256) as u8); + } + } + + let config = AsciiExtractionConfig::default(); + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + let context = FilterContext::default(); + + c.bench_function("large_binary_extraction", |b| { + b.iter(|| { + let strings = extract_ascii_strings(black_box(&large_data), black_box(&config)); + for string in &strings { + let _ = filter.calculate_confidence(black_box(&string.text), black_box(&context)); + } + }); + }); +} + +criterion_group!( + ascii_extraction_benches, + bench_basic_extraction, + bench_filtered_extraction, + bench_individual_filters, + bench_composite_filter, + bench_entropy_calculation, + bench_large_binary +); +criterion_main!(ascii_extraction_benches); diff --git a/dist-workspace.toml b/dist-workspace.toml index aafdbfe..9f3c862 100644 --- a/dist-workspace.toml +++ b/dist-workspace.toml @@ -10,7 +10,13 @@ ci = "github" # The installers to generate for each app installers = ["shell", "powershell", "homebrew"] # Target platforms to build apps for (Rust target-triple syntax) -targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl", "x86_64-pc-windows-msvc"] +targets = [ + "aarch64-apple-darwin", + "aarch64-unknown-linux-gnu", + "x86_64-unknown-linux-gnu", + "x86_64-unknown-linux-musl", + "x86_64-pc-windows-msvc", +] # A GitHub repo to push Homebrew formulas to tap = "EvilBit-Labs/homebrew-tap" # Customize the Homebrew formula name @@ -49,4 +55,6 @@ install-success-msg = "Successfully installed Stringy! Ready to start looking at repository = "EvilBit-Labs/StringyMcStringFace" [dist.github-action-commits] "actions/checkout" = "v5" -"actions/download-artifact" = "v5" +"actions/download-artifact" = "v6" +"actions/upload-artifact" = "v5" +"actions/attest-build-provenance" = "v3" diff --git a/docs/src/string-extraction.md b/docs/src/string-extraction.md index 1764d35..b18f5a0 100644 --- a/docs/src/string-extraction.md +++ b/docs/src/string-extraction.md @@ -10,16 +10,50 @@ Binary Data → Section Analysis → Encoding Detection → String Scanning → ## Encoding Support -### ASCII/UTF-8 Extraction +### ASCII Extraction -The most common encoding in most binaries. +The most common encoding in most binaries. ASCII extraction provides foundational string extraction with configurable minimum length thresholds. #### Algorithm -1. **Scan for printable sequences**: Characters in range 0x20-0x7E plus common whitespace +1. **Scan for printable sequences**: Characters in range 0x20-0x7E (strict printable ASCII) 2. **Length filtering**: Configurable minimum length (default: 4 characters) 3. **Null termination**: Respect null terminators but don't require them -4. **Context awareness**: Consider section type for validation +4. **Section awareness**: Integrate with section metadata for context-aware filtering + +#### Basic Extraction + +```rust +use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; + +let data = b"Hello\0World\0Test123"; +let config = AsciiExtractionConfig::default(); +let strings = extract_ascii_strings(data, &config); + +for string in strings { + println!("Found: {} at offset {}", string.text, string.offset); +} +``` + +#### Configuration + +```rust +use stringy::extraction::ascii::AsciiExtractionConfig; + +// Default configuration (min_length: 4, no max_length) +let config = AsciiExtractionConfig::default(); + +// Custom minimum length +let config = AsciiExtractionConfig::new(8); + +// Custom minimum and maximum length +let mut config = AsciiExtractionConfig::default(); +config.max_length = Some(256); +``` + +### UTF-8 Extraction + +UTF-8 extraction builds on ASCII extraction and handles multi-byte characters. See the main extraction module for UTF-8 support. #### Implementation Details @@ -51,11 +85,129 @@ fn extract_ascii_strings(data: &[u8], min_len: usize) -> Vec { } ``` -#### Noise Filtering +## Noise Filtering + +Stringy implements a multi-layered heuristic filtering system to reduce false positives and identify noise in extracted strings. The filtering system uses a combination of entropy analysis, character distribution, linguistic patterns, length checks, repetition detection, and context-aware filtering. + +### Filter Architecture + +The noise filtering system consists of multiple independent filters that can be combined with configurable weights: + +1. **Character Distribution Filter**: Detects abnormal character frequency distributions +2. **Entropy Filter**: Uses Shannon entropy to detect padding/repetition and random binary +3. **Linguistic Pattern Filter**: Analyzes vowel-to-consonant ratios and common bigrams +4. **Length Filter**: Penalizes excessively long strings and very short strings in low-weight sections +5. **Repetition Filter**: Detects repeated character patterns and repeated substrings +6. **Context-Aware Filter**: Boosts confidence for strings in high-weight sections + +### Character Distribution Analysis + +Detects strings with abnormal character distributions: + +- **Excessive punctuation** (>80%): Low confidence (0.2) +- **Excessive repetition** (>90% same character): Very low confidence (0.1) +- **Excessive non-alphanumeric** (>70%): Low confidence (0.3) +- **Reasonable distribution**: High confidence (1.0) + +### Entropy-Based Filtering + +Uses Shannon entropy (bits per byte) to classify strings: + +- **Very low entropy** (\<1.5 bits/byte): Likely padding or repetition (confidence: 0.1) +- **Very high entropy** (>7.5 bits/byte): Likely random binary (confidence: 0.2) +- **Optimal range** (3.5-6.0 bits/byte): High confidence (1.0) +- **Acceptable range** (2.0-7.0 bits/byte): Moderate confidence (0.4-0.7) + +### Linguistic Pattern Detection + +Analyzes text for word-like patterns: + +- **Vowel-to-consonant ratio**: Reasonable range 0.2-0.8 for English +- **Common bigrams**: Detects common English patterns (th, he, in, er, an, re, on, at, en, nd) +- **Handles non-English**: Gracefully handles non-English strings without over-penalizing + +### Length-Based Filtering + +Applies penalties based on string length: + +- **Excessively long** (>200 characters): Low confidence (0.3) - likely table data +- **Very short in low-weight sections** (\<4 chars, weight \<0.5): Moderate confidence (0.5) +- **Normal length** (4-100 characters): High confidence (1.0) + +### Repetition Detection + +Identifies repetitive patterns: + +- **Repeated characters** (e.g., "AAAA", "0000"): Very low confidence (0.1) +- **Repeated substrings** (e.g., "abcabcabc"): Low confidence (0.2) +- **Normal strings**: High confidence (1.0) + +### Context-Aware Filtering + +Boosts or reduces confidence based on section context: + +- **String data sections** (.rodata, .rdata, \_\_cstring): High confidence (0.9-1.0) +- **Read-only data sections**: High confidence (0.9) +- **Resource sections**: Maximum confidence (1.0) - known-good sources +- **Code sections**: Lower confidence (0.3-0.5) +- **Writable data sections**: Moderate confidence (0.6) + +### Configuration + +```rust +use stringy::extraction::config::{NoiseFilterConfig, FilterWeights}; + +// Default configuration +let config = NoiseFilterConfig::default(); + +// Customize thresholds +let mut config = NoiseFilterConfig::default(); +config.entropy_min = 2.0; +config.entropy_max = 7.0; +config.max_length = 150; + +// Customize filter weights +config.filter_weights = FilterWeights { + entropy_weight: 0.3, + char_distribution_weight: 0.25, + linguistic_weight: 0.2, + length_weight: 0.15, + repetition_weight: 0.05, + context_weight: 0.05, +}; +``` + +### Using Noise Filters + +```rust +use stringy::extraction::config::NoiseFilterConfig; +use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; +use stringy::types::SectionType; + +let filter_config = NoiseFilterConfig::default(); +let filter = CompositeNoiseFilter::new(&filter_config); +let context = FilterContext::default(); + +let confidence = filter.calculate_confidence("Hello, World!", &context); +if confidence >= 0.5 { + // String passed filtering threshold +} +``` + +### Confidence Scoring -- **Padding detection**: Skip sequences of repeated characters -- **Table data**: Avoid extracting from obvious data tables -- **Binary interleaving**: Skip strings with excessive binary data +Each string is assigned a confidence score (0.0-1.0) indicating how likely it is to be legitimate: + +- **1.0**: Maximum confidence (strings from known-good sources like imports, exports, resources) +- **0.7-0.9**: High confidence (likely legitimate strings) +- **0.5-0.7**: Moderate confidence (may need review) +- **0.0-0.5**: Low confidence (likely noise, filtered out by default) + +The confidence score is separate from the `score` field used for final ranking. Confidence specifically represents the noise filtering assessment. + +### Performance + +Noise filtering is designed to add minimal overhead (\<10% per acceptance criteria). Individual filters are optimized for performance, and the composite filter allows enabling/disabling specific filters to balance accuracy and speed. ### UTF-16 Extraction @@ -251,16 +403,53 @@ fn deduplicate_strings(strings: Vec) -> Vec { ## Configuration Options -### Length Filtering +### Extraction Configuration ```rust +use stringy::extraction::config::ExtractionConfig; + pub struct ExtractionConfig { - pub min_ascii_len: usize, // Default: 4 - pub min_utf16_len: usize, // Default: 3 - pub max_string_len: usize, // Default: 1024 + pub min_ascii_length: usize, // Default: 4 + pub min_wide_length: usize, // Default: 3 (for UTF-16) + pub enabled_encodings: Vec, // Default: ASCII, UTF-8 + pub noise_filtering_enabled: bool, // Default: true + pub min_confidence_threshold: f32, // Default: 0.5 +} +``` + +### Noise Filter Configuration + +```rust +use stringy::extraction::config::NoiseFilterConfig; + +pub struct NoiseFilterConfig { + pub entropy_min: f32, // Default: 1.5 + pub entropy_max: f32, // Default: 7.5 + pub max_length: usize, // Default: 200 + pub max_repetition_ratio: f32, // Default: 0.7 + pub min_vowel_ratio: f32, // Default: 0.1 + pub max_vowel_ratio: f32, // Default: 0.9 + pub filter_weights: FilterWeights, // Default: balanced weights } ``` +### Filter Weights + +```rust +use stringy::extraction::config::FilterWeights; + +pub struct FilterWeights { + pub entropy_weight: f32, // Default: 0.25 + pub char_distribution_weight: f32, // Default: 0.20 + pub linguistic_weight: f32, // Default: 0.20 + pub length_weight: f32, // Default: 0.15 + pub repetition_weight: f32, // Default: 0.10 + pub context_weight: f32, // Default: 0.10 +} +``` + +All weights must sum to 1.0. The configuration validates this automatically. + ### Encoding Selection ```rust @@ -330,14 +519,73 @@ lazy_static! { ### Validation Heuristics -- **Entropy checking**: Skip high-entropy strings likely to be binary data -- **Language detection**: Prefer strings with common English patterns -- **Context validation**: Consider surrounding bytes for legitimacy +The noise filtering system implements comprehensive validation: + +- **Entropy checking**: Uses Shannon entropy to detect padding/repetition and random binary data +- **Language detection**: Analyzes vowel-to-consonant ratios and common bigrams +- **Context validation**: Considers section type, weight, and permissions +- **Character distribution**: Detects abnormal frequency distributions +- **Repetition detection**: Identifies repeated patterns and padding ### False Positive Reduction -- **Padding detection**: Skip repeated character sequences -- **Table data**: Avoid structured binary data -- **Alignment checking**: Consider memory alignment patterns +The multi-layered filtering system targets common sources of false positives: + +- **Padding detection**: Identifies repeated character sequences (e.g., "AAAA", "\\x00\\x00\\x00\\x00") +- **Table data**: Filters excessively long strings likely to be structured data +- **Binary noise**: High-entropy strings are flagged as likely random binary +- **Context awareness**: Strings in code sections receive lower confidence scores + +### Performance Characteristics + +Noise filtering is designed for minimal overhead: + +- **Target overhead**: \<10% compared to extraction without filtering +- **Optimized filters**: Each filter is independently optimized +- **Configurable**: Can enable/disable individual filters to balance accuracy and speed +- **Scalable**: Handles large binaries efficiently + +### Examples + +#### Basic Extraction with Filtering + +```rust +use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; +use stringy::extraction::config::NoiseFilterConfig; +use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; + +let data = b"Hello World\0AAAA\0Test123"; +let config = AsciiExtractionConfig::default(); +let strings = extract_ascii_strings(data, &config); + +let filter_config = NoiseFilterConfig::default(); +let filter = CompositeNoiseFilter::new(&filter_config); +let context = FilterContext::default(); + +let filtered: Vec<_> = strings + .into_iter() + .filter(|s| filter.calculate_confidence(&s.text, &context) >= 0.5) + .collect(); +``` + +#### Custom Filter Configuration + +```rust +use stringy::extraction::config::{NoiseFilterConfig, FilterWeights}; + +let mut config = NoiseFilterConfig::default(); +config.entropy_min = 2.0; +config.entropy_max = 7.0; +config.max_length = 150; + +config.filter_weights = FilterWeights { + entropy_weight: 0.4, + char_distribution_weight: 0.3, + linguistic_weight: 0.15, + length_weight: 0.1, + repetition_weight: 0.03, + context_weight: 0.02, +}; +``` -This comprehensive extraction system ensures high-quality string extraction while maintaining performance and minimizing false positives. +This comprehensive extraction system ensures high-quality string extraction while maintaining performance and minimizing false positives through multi-layered noise filtering. diff --git a/src/extraction/ascii.rs b/src/extraction/ascii.rs new file mode 100644 index 0000000..9340229 --- /dev/null +++ b/src/extraction/ascii.rs @@ -0,0 +1,820 @@ +//! ASCII String Extraction Module +//! +//! This module provides foundational ASCII string extraction for StringyMcStringFace. +//! It implements byte-level scanning for contiguous printable ASCII sequences and serves +//! as the reference implementation for future UTF-8, UTF-16LE, and UTF-16BE extractors. +//! +//! # Examples +//! +//! ```rust +//! use stringy::extraction::ascii::{extract_ascii_strings, extract_from_section, AsciiExtractionConfig}; +//! use stringy::types::{SectionInfo, SectionType}; +//! +//! // Basic extraction from raw data +//! let data = b"Hello\0World\0Test123"; +//! let config = AsciiExtractionConfig::default(); +//! let strings = extract_ascii_strings(data, &config); +//! +//! // Section-aware extraction +//! let section = SectionInfo { +//! name: ".rodata".to_string(), +//! offset: 0, +//! size: 20, +//! rva: Some(0x1000), +//! section_type: SectionType::StringData, +//! is_executable: false, +//! is_writable: false, +//! weight: 1.0, +//! }; +//! let strings = extract_from_section(§ion, data, &config); +//! ``` + +use crate::extraction::config::NoiseFilterConfig; +use crate::extraction::filters::{CompositeNoiseFilter, FilterContext}; +use crate::types::{Encoding, FoundString, SectionInfo, StringSource}; + +/// Configuration for ASCII string extraction +/// +/// Controls minimum and maximum string length filtering. This structure serves as the +/// foundation for future configuration expansion, including encoding preferences and +/// tag filters as mentioned in the issue. +/// +/// # Default Values +/// +/// - `min_length`: 4 (standard minimum to reduce noise) +/// - `max_length`: None (no upper limit by default) +/// +/// # Examples +/// +/// ```rust +/// use stringy::extraction::ascii::AsciiExtractionConfig; +/// +/// // Use default configuration +/// let config = AsciiExtractionConfig::default(); +/// +/// // Custom minimum length +/// let config = AsciiExtractionConfig::new(8); +/// +/// // Custom minimum and maximum length +/// let mut config = AsciiExtractionConfig::default(); +/// config.max_length = Some(256); +/// ``` +#[derive(Debug, Clone)] +pub struct AsciiExtractionConfig { + /// Minimum string length in bytes (default: 4) + pub min_length: usize, + /// Maximum string length in bytes (default: None, no limit) + pub max_length: Option, +} + +impl Default for AsciiExtractionConfig { + fn default() -> Self { + Self { + min_length: 4, + max_length: None, + } + } +} + +impl AsciiExtractionConfig { + /// Create a new AsciiExtractionConfig with custom minimum length + /// + /// # Arguments + /// + /// * `min_length` - Minimum string length in bytes + /// + /// # Returns + /// + /// New AsciiExtractionConfig with specified minimum length and default max_length (None) + /// + /// # Example + /// + /// ```rust + /// use stringy::extraction::ascii::AsciiExtractionConfig; + /// + /// let config = AsciiExtractionConfig::new(8); + /// assert_eq!(config.min_length, 8); + /// assert_eq!(config.max_length, None); + /// ``` + pub fn new(min_length: usize) -> Self { + Self { + min_length, + max_length: None, + } + } +} + +/// Check if a byte is in the printable ASCII range +/// +/// Printable ASCII includes characters from 0x20 (space) through 0x7E (tilde). +/// This range covers all standard printable ASCII characters. +/// +/// **Note on printable character definitions**: This function uses a strict definition +/// of printable ASCII (0x20-0x7E only), excluding whitespace control characters like +/// tab, newline, and carriage return. This differs from `is_printable_text_byte` in +/// `extraction::mod`, which includes common whitespace characters (0x09, 0x0A, 0x0D) +/// to handle formatted text. This strict definition ensures ASCII-only extraction +/// produces predictable, consistent results. +/// +/// # Arguments +/// +/// * `byte` - Byte to check +/// +/// # Returns +/// +/// `true` if the byte is printable ASCII, `false` otherwise +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::ascii::is_printable_ascii; +/// +/// assert!(is_printable_ascii(b' ')); +/// assert!(is_printable_ascii(b'A')); +/// assert!(is_printable_ascii(b'z')); +/// assert!(is_printable_ascii(b'0')); +/// assert!(is_printable_ascii(b'~')); +/// assert!(!is_printable_ascii(0x00)); +/// assert!(!is_printable_ascii(0x1F)); +/// assert!(!is_printable_ascii(0x7F)); +/// ``` +#[inline] +pub fn is_printable_ascii(byte: u8) -> bool { + (0x20..=0x7E).contains(&byte) +} + +/// Extract ASCII strings from a byte slice +/// +/// Scans through the byte slice looking for contiguous sequences of printable ASCII +/// characters. When a non-printable byte is encountered, checks if the accumulated +/// sequence meets the minimum length threshold and creates a FoundString entry. +/// +/// # Algorithm +/// +/// 1. Iterate through the byte slice tracking current string start position and accumulated bytes +/// 2. When encountering a printable ASCII byte, accumulate it in the current string buffer +/// 3. When encountering a non-printable byte, check if accumulated length meets minimum threshold +/// 4. If threshold met, create a `FoundString` with proper metadata +/// 5. Handle end-of-buffer edge case by checking accumulated string after loop completes +/// 6. Apply max_length filtering if configured +/// +/// # Arguments +/// +/// * `data` - Byte slice to scan for ASCII strings +/// * `config` - Extraction configuration +/// +/// # Returns +/// +/// Vector of FoundString entries with the following metadata: +/// - `text`: UTF-8 string from accumulated bytes +/// - `encoding`: `Encoding::Ascii` +/// - `offset`: Start position in the data slice +/// - `length`: Byte count +/// - `source`: `StringSource::SectionData` +/// - `tags`: Empty vector +/// - `score`: 0 +/// - `section`: None +/// - `rva`: None +/// +/// # Edge Cases +/// +/// - Empty input data returns empty vector +/// - Data smaller than minimum length returns empty vector +/// - String at buffer start (start_offset = 0) +/// - String at buffer end (checked after loop) +/// - Very long strings are filtered by max_length if configured +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; +/// +/// let data = b"Hello\0World\0Test123"; +/// let config = AsciiExtractionConfig::default(); +/// let strings = extract_ascii_strings(data, &config); +/// +/// assert_eq!(strings.len(), 3); +/// assert_eq!(strings[0].text, "Hello"); +/// assert_eq!(strings[0].offset, 0); +/// assert_eq!(strings[1].text, "World"); +/// assert_eq!(strings[1].offset, 6); +/// ``` +pub fn extract_ascii_strings(data: &[u8], config: &AsciiExtractionConfig) -> Vec { + let mut strings = Vec::new(); + let mut current_string_start: Option = None; + let mut current_string_bytes = Vec::new(); + + for (i, &byte) in data.iter().enumerate() { + if is_printable_ascii(byte) { + if current_string_start.is_none() { + current_string_start = Some(i); + } + current_string_bytes.push(byte); + } else { + // End of current string candidate + if let Some(start) = current_string_start { + let len = current_string_bytes.len(); + // Check minimum length + if len >= config.min_length { + // Check maximum length if configured + if let Some(max_len) = config.max_length + && len > max_len + { + // Skip this string, reset accumulator + current_string_start = None; + current_string_bytes.clear(); + continue; + } + // Convert bytes to UTF-8 string (ASCII is valid UTF-8) + let bytes = std::mem::take(&mut current_string_bytes); + let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); + strings.push(FoundString { + text, + encoding: Encoding::Ascii, + offset: start as u64, + rva: None, + section: None, + length: len as u32, + tags: Vec::new(), + score: 0, + source: StringSource::SectionData, + confidence: 1.0, + }); + } + } + current_string_start = None; + current_string_bytes.clear(); + } + } + + // Handle string at end of buffer + if let Some(start) = current_string_start { + let len = current_string_bytes.len(); + if len >= config.min_length { + // Check maximum length if configured + if let Some(max_len) = config.max_length { + if len > max_len { + // Skip this string + } else { + let bytes = std::mem::take(&mut current_string_bytes); + let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); + strings.push(FoundString { + text, + encoding: Encoding::Ascii, + offset: start as u64, + rva: None, + section: None, + length: len as u32, + tags: Vec::new(), + score: 0, + source: StringSource::SectionData, + confidence: 1.0, + }); + } + } else { + let bytes = std::mem::take(&mut current_string_bytes); + let text = String::from_utf8(bytes).expect("ASCII bytes should be valid UTF-8"); + strings.push(FoundString { + text, + encoding: Encoding::Ascii, + offset: start as u64, + rva: None, + section: None, + length: len as u32, + tags: Vec::new(), + score: 0, + source: StringSource::SectionData, + confidence: 1.0, + }); + } + } + } + + strings +} + +/// Extract ASCII strings from a specific section with proper metadata population +/// +/// This function extracts strings from a section of the binary, adjusting offsets +/// and populating section-specific metadata (section name, RVA). It also applies +/// noise filtering if enabled in the extraction configuration. +/// +/// # Implementation +/// +/// 1. Calculate section data slice using section.offset and section.size, with bounds checking +/// 2. Call `extract_ascii_strings` on the section data slice +/// 3. For each candidate string, compute confidence using noise filters if enabled +/// 4. Apply confidence threshold filtering if noise filtering is enabled +/// 5. Post-process each FoundString to adjust offsets (add section.offset to relative offsets) +/// 6. Populate section field with section.name.clone() +/// 7. Populate rva field with calculated value (section.rva + relative_offset) if section.rva is Some +/// 8. Return the adjusted vector of FoundStrings +/// +/// # Arguments +/// +/// * `section` - Section metadata +/// * `data` - Raw binary data +/// * `config` - Extraction configuration +/// * `noise_filter_config` - Optional noise filter configuration (if None, filtering is skipped) +/// * `noise_filtering_enabled` - Whether to apply noise filtering +/// * `min_confidence_threshold` - Minimum confidence threshold for filtering +/// +/// # Returns +/// +/// Vector of FoundString entries with complete metadata including: +/// - Adjusted absolute offsets (section.offset + relative_offset) +/// - Section name populated +/// - RVA calculated if section.rva is available +/// - Confidence scores computed from noise filters +/// +/// # Edge Cases +/// +/// - Section boundaries: ensures slice doesn't exceed data.len() +/// - Section offset + size overflow: uses checked arithmetic +/// - Empty sections return empty vector +/// - Sections beyond data bounds return empty vector +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::ascii::{extract_from_section, AsciiExtractionConfig}; +/// use stringy::extraction::config::NoiseFilterConfig; +/// use stringy::types::{SectionInfo, SectionType}; +/// +/// let section = SectionInfo { +/// name: ".rodata".to_string(), +/// offset: 10, +/// size: 20, +/// rva: Some(0x1000), +/// section_type: SectionType::StringData, +/// is_executable: false, +/// is_writable: false, +/// weight: 1.0, +/// }; +/// +/// let data = b"prefix\0Hello World\0suffix"; +/// let config = AsciiExtractionConfig::default(); +/// let noise_config = Some(NoiseFilterConfig::default()); +/// let strings = extract_from_section(§ion, data, &config, noise_config.as_ref(), true, 0.5); +/// +/// // Strings will have adjusted offsets and section metadata +/// for string in strings { +/// assert_eq!(string.section, Some(".rodata".to_string())); +/// assert!(string.offset >= 10); +/// } +/// ``` +pub fn extract_from_section( + section: &SectionInfo, + data: &[u8], + config: &AsciiExtractionConfig, + noise_filter_config: Option<&NoiseFilterConfig>, + noise_filtering_enabled: bool, + min_confidence_threshold: f32, +) -> Vec { + // Calculate section data slice with bounds checking + let section_offset = section.offset as usize; + let section_size = section.size as usize; + + // Check if section is out of bounds + if section_offset >= data.len() { + return Vec::new(); + } + + // Calculate end offset with checked arithmetic + let end_offset = section_offset + .checked_add(section_size) + .unwrap_or(data.len()) + .min(data.len()); + + // Extract section data slice + let section_data = &data[section_offset..end_offset]; + + // Extract strings from section data + let strings = extract_ascii_strings(section_data, config); + + // Build filter context from section + let filter_context = FilterContext::from_section(section); + + // Create composite noise filter if filtering is enabled and config is provided + let filter = if noise_filtering_enabled { + noise_filter_config.map(CompositeNoiseFilter::new) + } else { + None + }; + + // Post-process: compute confidence, apply threshold, adjust offsets and populate metadata + let mut filtered_strings = Vec::new(); + for mut string in strings { + // Compute confidence if filtering is enabled + if let Some(ref noise_filter) = filter { + string.confidence = noise_filter.calculate_confidence(&string.text, &filter_context); + // Apply threshold filtering + if noise_filtering_enabled && string.confidence < min_confidence_threshold { + continue; + } + } else { + // If filtering is disabled, keep default confidence of 1.0 + string.confidence = 1.0; + } + + // Adjust offset: add section.offset to relative offset + // string.offset is relative to section_data (starts at 0), so add section.offset + let relative_offset = string.offset; + string.offset = section.offset + relative_offset; + + // Populate section name + string.section = Some(section.name.clone()); + + // Calculate and populate RVA if section.rva is available + if let Some(base_rva) = section.rva { + // relative_offset is the offset within the section + string.rva = Some(base_rva + relative_offset); + } + + filtered_strings.push(string); + } + + filtered_strings +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{SectionInfo, SectionType}; + + // Helper to create test section + fn create_test_section(name: &str, offset: u64, size: u64, rva: Option) -> SectionInfo { + SectionInfo { + name: name.to_string(), + offset, + size, + rva, + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + } + } + + #[test] + fn test_is_printable_ascii() { + // Printable ASCII range (0x20-0x7E) + assert!(is_printable_ascii(0x20)); // space + assert!(is_printable_ascii(0x21)); // ! + assert!(is_printable_ascii(0x41)); // A + assert!(is_printable_ascii(0x5A)); // Z + assert!(is_printable_ascii(0x61)); // a + assert!(is_printable_ascii(0x7A)); // z + assert!(is_printable_ascii(0x30)); // 0 + assert!(is_printable_ascii(0x39)); // 9 + assert!(is_printable_ascii(0x7E)); // ~ + + // Non-printable + assert!(!is_printable_ascii(0x00)); + assert!(!is_printable_ascii(0x1F)); + assert!(!is_printable_ascii(0x7F)); + assert!(!is_printable_ascii(0xFF)); + } + + #[test] + fn test_extract_ascii_strings_basic() { + // Basic extraction with default minimum length (4) + let data = b"Hello\0World\0Test"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[0].offset, 0); + assert_eq!(strings[0].encoding, Encoding::Ascii); + assert_eq!(strings[0].source, StringSource::SectionData); + assert_eq!(strings[1].text, "World"); + assert_eq!(strings[1].offset, 6); + assert_eq!(strings[2].text, "Test"); + assert_eq!(strings[2].offset, 12); + } + + #[test] + fn test_extract_ascii_strings_custom_min_length() { + // Custom minimum length filtering + let data = b"Hi\0Test\0AB\0LongString"; + let config = AsciiExtractionConfig::new(3); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Test"); + assert_eq!(strings[1].text, "LongString"); + // "Hi" and "AB" should be filtered out (length < 3) + } + + #[test] + fn test_extract_ascii_strings_min_length_5() { + let data = b"Test\0Hello\0World"; + let config = AsciiExtractionConfig::new(5); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[1].text, "World"); + // "Test" should be filtered out (length < 5) + } + + #[test] + fn test_extract_ascii_strings_min_length_10() { + let data = b"Short\0VeryLongStringHere"; + let config = AsciiExtractionConfig::new(10); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "VeryLongStringHere"); + } + + #[test] + fn test_extract_ascii_strings_empty_input() { + // Empty input edge case + let data = b""; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert!(strings.is_empty()); + } + + #[test] + fn test_extract_ascii_strings_no_strings_found() { + // No strings found (all binary data) + let data = &[0x00, 0xFF, 0x01, 0x02, 0x03]; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert!(strings.is_empty()); + } + + #[test] + fn test_extract_ascii_strings_string_at_start() { + // String at buffer start + let data = b"Start\0Middle\0End"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + // "End" is only 3 characters, below min_length=4, so filtered out + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Start"); + assert_eq!(strings[0].offset, 0); + assert_eq!(strings[1].text, "Middle"); + } + + #[test] + fn test_extract_ascii_strings_string_at_end() { + // String at buffer end + let data = b"Start\0Middle\0EndTest"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[2].text, "EndTest"); + assert_eq!(strings[2].offset, 13); + } + + #[test] + fn test_extract_ascii_strings_single_char_below_minimum() { + // Single character below minimum + let data = b"A\0Test\0B\0C"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Test"); + // Single characters should be filtered out + } + + #[test] + fn test_extract_ascii_strings_exact_minimum_length() { + // Exact minimum length string + let data = b"Test\0Hello"; + let config = AsciiExtractionConfig::default(); // min_length = 4 + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Test"); + assert_eq!(strings[0].length, 4); + assert_eq!(strings[1].text, "Hello"); + } + + #[test] + fn test_extract_ascii_strings_offset_calculation() { + // Offset calculation correctness + let data = b"prefix\0Hello\0World\0suffix"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + // All strings are >= 4 characters, so all should be extracted + assert_eq!(strings.len(), 4); + assert_eq!(strings[0].text, "prefix"); + assert_eq!(strings[0].offset, 0); + assert_eq!(strings[1].text, "Hello"); + assert_eq!(strings[1].offset, 7); // "prefix\0" = 7 bytes + assert_eq!(strings[2].text, "World"); + assert_eq!(strings[2].offset, 13); // "prefix\0Hello\0" = 13 bytes + assert_eq!(strings[3].text, "suffix"); + assert_eq!(strings[3].offset, 19); // "prefix\0Hello\0World\0" = 19 bytes + } + + #[test] + fn test_extract_ascii_strings_multiple_strings_sequence() { + // Multiple strings in sequence + let data = b"First\0Second\0Third\0Fourth"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 4); + assert_eq!(strings[0].text, "First"); + assert_eq!(strings[1].text, "Second"); + assert_eq!(strings[2].text, "Third"); + assert_eq!(strings[3].text, "Fourth"); + } + + #[test] + fn test_extract_ascii_strings_separated_by_single_byte() { + // Strings separated by single non-printable byte + let data = b"Hello\x00World\x01Test"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[1].text, "World"); + assert_eq!(strings[2].text, "Test"); + } + + #[test] + fn test_extract_ascii_strings_max_length_filtering() { + // Max length filtering if configured + let data = b"Short\0VeryLongStringHere"; + let config = AsciiExtractionConfig { + max_length: Some(10), + ..Default::default() + }; + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Short"); + // "VeryLongStringHere" should be filtered out (length > 10) + } + + #[test] + fn test_extract_ascii_strings_very_long_string() { + // Very long strings (test max_length enforcement) + let long_string = "A".repeat(1000); + let data = format!("{}\0Short", long_string).into_bytes(); + let config = AsciiExtractionConfig { + max_length: Some(100), + ..Default::default() + }; + let strings = extract_ascii_strings(&data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Short"); + // Very long string should be filtered out + } + + #[test] + fn test_extract_from_section_basic() { + // Basic section extraction + let section = create_test_section(".rodata", 0, 20, Some(0x1000)); + let data = b"Hello World\0Test"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Hello World"); + assert_eq!(strings[0].offset, 0); + assert_eq!(strings[0].rva, Some(0x1000)); + assert_eq!(strings[0].section, Some(".rodata".to_string())); + assert_eq!(strings[1].text, "Test"); + assert_eq!(strings[1].offset, 12); + assert_eq!(strings[1].rva, Some(0x100C)); + } + + #[test] + fn test_extract_from_section_offset_adjustment() { + // Section metadata population (verify section name and RVA) + // data = b"prefix\0Hello World\0suffix" + // "prefix\0" = 7 bytes, so "Hello World" starts at offset 7 + // Section should start at 7 to include "Hello World" + let section = create_test_section(".data", 7, 12, Some(0x2000)); + let data = b"prefix\0Hello World\0suffix"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Hello World"); + // Section starts at 7, "Hello World" is at relative offset 0 within section + // Absolute offset = section.offset (7) + relative_offset (0) = 7 + assert_eq!(strings[0].offset, 7); + assert_eq!(strings[0].rva, Some(0x2000)); + assert_eq!(strings[0].section, Some(".data".to_string())); + } + + #[test] + fn test_extract_from_section_rva_calculation() { + // RVA calculation with section offset + let section = create_test_section(".text", 5, 10, Some(0x1000)); + let data = b"pre\0Hello\0suf"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + if !strings.is_empty() { + // Section data is data[5..15] = "Hello\0suf" + // "Hello" is at relative offset 0 + // Absolute offset = 5 + 0 = 5 + // RVA = 0x1000 + 0 = 0x1000 + assert_eq!(strings[0].offset, 5); + assert_eq!(strings[0].rva, Some(0x1000)); + } + } + + #[test] + fn test_extract_from_section_no_rva() { + // Section without RVA + let section = create_test_section(".data", 0, 20, None); + let data = b"Hello World\0Test"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].rva, None); + assert_eq!(strings[1].rva, None); + } + + #[test] + fn test_extract_from_section_section_name() { + // Verify section name is populated + let section = create_test_section(".custom", 0, 20, Some(0x3000)); + let data = b"Test String\0Another"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + for string in &strings { + assert_eq!(string.section, Some(".custom".to_string())); + } + } + + #[test] + fn test_extract_from_section_bounds_checking() { + // Section boundaries (ensure slice doesn't exceed data.len()) + let section = create_test_section(".data", 0, 1000, None); + let data = b"Short data"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + // Should only extract from available data, not panic + assert!(strings.len() <= 1); + } + + #[test] + fn test_extract_from_section_out_of_bounds() { + // Section offset + size overflow (use checked arithmetic) + let section = create_test_section(".data", 1000, 100, None); + let data = b"Short data"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + // Should return empty vector, not panic + assert!(strings.is_empty()); + } + + #[test] + fn test_extract_from_section_empty_section() { + // Empty section + let section = create_test_section(".empty", 0, 0, None); + let data = b"Some data"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + assert!(strings.is_empty()); + } + + #[test] + fn test_extraction_config_default() { + let config = AsciiExtractionConfig::default(); + assert_eq!(config.min_length, 4); + assert_eq!(config.max_length, None); + } + + #[test] + fn test_extraction_config_new() { + let config = AsciiExtractionConfig::new(8); + assert_eq!(config.min_length, 8); + assert_eq!(config.max_length, None); + } + + #[test] + fn test_extraction_config_custom_max_length() { + let config = AsciiExtractionConfig { + max_length: Some(256), + ..Default::default() + }; + assert_eq!(config.min_length, 4); + assert_eq!(config.max_length, Some(256)); + } +} diff --git a/src/extraction/config.rs b/src/extraction/config.rs new file mode 100644 index 0000000..af2f678 --- /dev/null +++ b/src/extraction/config.rs @@ -0,0 +1,221 @@ +//! Extraction Configuration Module +//! +//! This module provides configuration structures for controlling string extraction +//! and noise filtering behavior. It allows fine-tuning of thresholds, filter weights, +//! and extraction parameters. + +/// Configuration for noise filtering heuristics +/// +/// Controls thresholds and parameters for the various noise detection filters. +/// All thresholds are configurable to allow fine-tuning for different use cases. +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::config::NoiseFilterConfig; +/// +/// // Use default configuration +/// let config = NoiseFilterConfig::default(); +/// +/// // Customize thresholds +/// let mut config = NoiseFilterConfig::default(); +/// config.entropy_min = 2.0; +/// config.entropy_max = 7.0; +/// ``` +#[derive(Debug, Clone)] +pub struct NoiseFilterConfig { + /// Minimum entropy threshold in bits per byte (default: 1.5) + /// + /// Strings with entropy below this are likely padding or repetition. + pub entropy_min: f32, + /// Maximum entropy threshold in bits per byte (default: 7.5) + /// + /// Strings with entropy above this are likely random binary data. + pub entropy_max: f32, + /// Maximum string length before applying penalty (default: 200) + /// + /// Very long strings are often table data or other structured content. + pub max_length: usize, + /// Maximum ratio of repeated characters (default: 0.7) + /// + /// Strings with higher repetition ratios are likely padding or noise. + pub max_repetition_ratio: f32, + /// Minimum vowel ratio for linguistic filter (default: 0.1) + /// + /// Used to detect consonant-heavy strings that may be noise. + pub min_vowel_ratio: f32, + /// Maximum vowel ratio for linguistic filter (default: 0.9) + /// + /// Used to detect vowel-heavy strings that may be noise. + pub max_vowel_ratio: f32, + /// Weights for combining filter scores (default: balanced weights) + pub filter_weights: FilterWeights, +} + +impl Default for NoiseFilterConfig { + fn default() -> Self { + Self { + entropy_min: 1.5, + entropy_max: 7.5, + max_length: 200, + max_repetition_ratio: 0.7, + min_vowel_ratio: 0.1, + max_vowel_ratio: 0.9, + filter_weights: FilterWeights::default(), + } + } +} + +impl NoiseFilterConfig { + /// Validate the configuration + /// + /// Returns an error if any thresholds are invalid. + pub fn validate(&self) -> Result<(), String> { + if self.entropy_min < 0.0 || self.entropy_min > 8.0 { + return Err("entropy_min must be between 0.0 and 8.0".to_string()); + } + if self.entropy_max < 0.0 || self.entropy_max > 8.0 { + return Err("entropy_max must be between 0.0 and 8.0".to_string()); + } + if self.entropy_min >= self.entropy_max { + return Err("entropy_min must be less than entropy_max".to_string()); + } + if self.max_length == 0 { + return Err("max_length must be greater than 0".to_string()); + } + if !(0.0..=1.0).contains(&self.max_repetition_ratio) { + return Err("max_repetition_ratio must be between 0.0 and 1.0".to_string()); + } + if !(0.0..=1.0).contains(&self.min_vowel_ratio) { + return Err("min_vowel_ratio must be between 0.0 and 1.0".to_string()); + } + if !(0.0..=1.0).contains(&self.max_vowel_ratio) { + return Err("max_vowel_ratio must be between 0.0 and 1.0".to_string()); + } + if self.min_vowel_ratio >= self.max_vowel_ratio { + return Err("min_vowel_ratio must be less than max_vowel_ratio".to_string()); + } + self.filter_weights.validate()?; + Ok(()) + } +} + +/// Weights for combining multiple filter confidence scores +/// +/// These weights control how individual filter scores are combined into +/// an overall confidence assessment. All weights must sum to 1.0. +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::config::FilterWeights; +/// +/// // Use default weights +/// let weights = FilterWeights::default(); +/// +/// // Customize weights (must sum to 1.0) +/// let weights = FilterWeights { +/// entropy_weight: 0.3, +/// char_distribution_weight: 0.25, +/// linguistic_weight: 0.2, +/// length_weight: 0.15, +/// repetition_weight: 0.05, +/// context_weight: 0.05, +/// }; +/// ``` +#[derive(Debug, Clone)] +pub struct FilterWeights { + /// Weight for entropy filter (default: 0.25) + pub entropy_weight: f32, + /// Weight for character distribution filter (default: 0.20) + pub char_distribution_weight: f32, + /// Weight for linguistic pattern filter (default: 0.20) + pub linguistic_weight: f32, + /// Weight for length filter (default: 0.15) + pub length_weight: f32, + /// Weight for repetition filter (default: 0.10) + pub repetition_weight: f32, + /// Weight for context-aware filter (default: 0.10) + pub context_weight: f32, +} + +impl Default for FilterWeights { + fn default() -> Self { + Self { + entropy_weight: 0.25, + char_distribution_weight: 0.20, + linguistic_weight: 0.20, + length_weight: 0.15, + repetition_weight: 0.10, + context_weight: 0.10, + } + } +} + +impl FilterWeights { + /// Validate that weights sum to 1.0 + /// + /// Returns an error if the sum is not approximately 1.0 (within 0.01 tolerance). + pub fn validate(&self) -> Result<(), String> { + let sum = self.entropy_weight + + self.char_distribution_weight + + self.linguistic_weight + + self.length_weight + + self.repetition_weight + + self.context_weight; + if (sum - 1.0).abs() > 0.01 { + return Err(format!("Filter weights must sum to 1.0, got {}", sum)); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_noise_filter_config_default() { + let config = NoiseFilterConfig::default(); + assert_eq!(config.entropy_min, 1.5); + assert_eq!(config.entropy_max, 7.5); + assert_eq!(config.max_length, 200); + assert_eq!(config.max_repetition_ratio, 0.7); + } + + #[test] + fn test_noise_filter_config_validate() { + let mut config = NoiseFilterConfig::default(); + assert!(config.validate().is_ok()); + + config.entropy_min = 8.0; + assert!(config.validate().is_err()); + + config.entropy_min = 1.5; + config.entropy_max = 1.0; + assert!(config.validate().is_err()); + } + + #[test] + fn test_filter_weights_default() { + let weights = FilterWeights::default(); + assert_eq!(weights.entropy_weight, 0.25); + assert_eq!(weights.char_distribution_weight, 0.20); + assert_eq!(weights.linguistic_weight, 0.20); + assert_eq!(weights.length_weight, 0.15); + assert_eq!(weights.repetition_weight, 0.10); + assert_eq!(weights.context_weight, 0.10); + } + + #[test] + fn test_filter_weights_validate() { + let weights = FilterWeights::default(); + assert!(weights.validate().is_ok()); + + let bad_weights = FilterWeights { + entropy_weight: 0.5, + ..Default::default() + }; + assert!(bad_weights.validate().is_err()); + } +} diff --git a/src/extraction/filters.rs b/src/extraction/filters.rs new file mode 100644 index 0000000..a7e7ab1 --- /dev/null +++ b/src/extraction/filters.rs @@ -0,0 +1,702 @@ +//! Noise Filtering Module +//! +//! This module provides multi-layered heuristic filters for detecting and filtering +//! noise in extracted strings. It uses a combination of entropy analysis, character +//! distribution, linguistic patterns, length checks, repetition detection, and +//! context-aware filtering to assign confidence scores to strings. + +use crate::extraction::config::{FilterWeights, NoiseFilterConfig}; +use crate::types::{SectionInfo, SectionType}; + +/// Context information for noise filtering +/// +/// Provides section metadata and surrounding context to help filters make +/// informed decisions about string legitimacy. +#[derive(Debug, Clone)] +pub struct FilterContext { + /// Section type where the string was found + pub section_type: SectionType, + /// Section name + pub section_name: Option, + /// Section weight (higher = more likely to contain strings) + pub section_weight: f32, + /// Whether the section is executable + pub is_executable: bool, + /// Whether the section is writable + pub is_writable: bool, + /// Surrounding bytes for context (optional, for future use) + pub surrounding_bytes: Option>, +} + +impl Default for FilterContext { + fn default() -> Self { + Self { + section_type: SectionType::Other, + section_name: None, + section_weight: 0.5, + is_executable: false, + is_writable: false, + surrounding_bytes: None, + } + } +} + +impl FilterContext { + /// Create a new FilterContext from a SectionInfo + pub fn from_section(section: &SectionInfo) -> Self { + Self { + section_type: section.section_type, + section_name: Some(section.name.clone()), + section_weight: section.weight, + is_executable: section.is_executable, + is_writable: section.is_writable, + surrounding_bytes: None, + } + } +} + +/// Trait for noise filters that calculate confidence scores +/// +/// Each filter implements this trait to provide a confidence score (0.0-1.0) +/// indicating how likely a string is to be legitimate vs noise. +pub trait NoiseFilter { + /// Calculate confidence score for a string + /// + /// Returns a value between 0.0 (definitely noise) and 1.0 (definitely legitimate). + /// + /// # Arguments + /// + /// * `text` - The string text to analyze + /// * `context` - Context information about where the string was found + /// + /// # Returns + /// + /// Confidence score between 0.0 and 1.0 + fn calculate_confidence(&self, text: &str, context: &FilterContext) -> f32; +} + +/// Character distribution filter +/// +/// Detects abnormal character frequency distributions that indicate noise: +/// - Excessive punctuation (>80%) +/// - Excessive repetition of same character (>90%) +/// - Excessive non-alphanumeric characters (>70%) +pub struct CharDistributionFilter; + +impl NoiseFilter for CharDistributionFilter { + fn calculate_confidence(&self, text: &str, _context: &FilterContext) -> f32 { + if text.is_empty() { + return 0.0; + } + + let chars: Vec = text.chars().collect(); + let total = chars.len() as f32; + + // Count character types + let mut punctuation_count = 0; + let mut alphanumeric_count = 0; + let mut char_counts = std::collections::HashMap::new(); + + for &ch in &chars { + if ch.is_ascii_punctuation() { + punctuation_count += 1; + } + if ch.is_alphanumeric() { + alphanumeric_count += 1; + } + *char_counts.entry(ch).or_insert(0) += 1; + } + + // Check for excessive punctuation + let punctuation_ratio = punctuation_count as f32 / total; + if punctuation_ratio > 0.8 { + return 0.2; // Very low confidence + } + + // Check for excessive repetition of same character + let max_char_count = char_counts.values().max().copied().unwrap_or(0) as f32; + let max_char_ratio = max_char_count / total; + if max_char_ratio > 0.9 { + return 0.1; // Very low confidence (likely padding) + } + + // Check for excessive non-alphanumeric + let non_alphanumeric_ratio = 1.0 - (alphanumeric_count as f32 / total); + if non_alphanumeric_ratio > 0.7 { + return 0.3; // Low confidence + } + + // Reasonable distribution + if punctuation_ratio < 0.3 && max_char_ratio < 0.5 && non_alphanumeric_ratio < 0.4 { + 1.0 // High confidence + } else { + 0.7 // Moderate confidence + } + } +} + +/// Entropy-based filter +/// +/// Uses Shannon entropy to detect low-entropy (padding/repetition) and +/// high-entropy (random binary) strings. Optimal range for text is 3.5-6.0 bits/byte. +pub struct EntropyFilter { + /// Minimum entropy threshold + pub entropy_min: f32, + /// Maximum entropy threshold + pub entropy_max: f32, +} + +impl EntropyFilter { + /// Create a new EntropyFilter with custom thresholds + pub fn new(entropy_min: f32, entropy_max: f32) -> Self { + Self { + entropy_min, + entropy_max, + } + } +} + +impl NoiseFilter for EntropyFilter { + fn calculate_confidence(&self, text: &str, _context: &FilterContext) -> f32 { + if text.is_empty() { + return 0.0; + } + + let bytes = text.as_bytes(); + let entropy = entropy::shannon_entropy(bytes); + + // Very low entropy (< 1.5) - likely padding or repetition + if entropy < self.entropy_min { + return 0.1; + } + + // Very high entropy (> 7.5) - likely random binary + if entropy > self.entropy_max { + return 0.2; + } + + // Optimal range for text: 3.5-6.0 bits/byte + if (3.5..=6.0).contains(&entropy) { + 1.0 // High confidence + } else if (2.0..3.5).contains(&entropy) { + 0.7 // Moderate confidence (low but acceptable) + } else if (6.0..=7.0).contains(&entropy) { + 0.6 // Moderate confidence (high but acceptable) + } else { + 0.4 // Lower confidence (outside optimal range) + } + } +} + +/// Linguistic pattern filter +/// +/// Detects word-like patterns by analyzing vowel-to-consonant ratios and +/// common bigrams. Handles non-English strings gracefully. +pub struct LinguisticFilter { + /// Minimum vowel ratio + pub min_vowel_ratio: f32, + /// Maximum vowel ratio + pub max_vowel_ratio: f32, +} + +impl LinguisticFilter { + /// Create a new LinguisticFilter with custom thresholds + pub fn new(min_vowel_ratio: f32, max_vowel_ratio: f32) -> Self { + Self { + min_vowel_ratio, + max_vowel_ratio, + } + } +} + +impl NoiseFilter for LinguisticFilter { + fn calculate_confidence(&self, text: &str, _context: &FilterContext) -> f32 { + if text.is_empty() { + return 0.0; + } + + let chars: Vec = text.chars().collect(); + let total = chars.len() as f32; + + if total == 0.0 { + return 0.0; + } + + // Count vowels and consonants (case-insensitive) + let mut vowel_count = 0; + let mut consonant_count = 0; + + for &ch in &chars { + let ch_lower = ch.to_ascii_lowercase(); + match ch_lower { + 'a' | 'e' | 'i' | 'o' | 'u' => vowel_count += 1, + 'b' | 'c' | 'd' | 'f' | 'g' | 'h' | 'j' | 'k' | 'l' | 'm' | 'n' | 'p' | 'q' + | 'r' | 's' | 't' | 'v' | 'w' | 'x' | 'y' | 'z' => consonant_count += 1, + _ => {} // Ignore non-letters + } + } + + let letter_count = vowel_count + consonant_count; + if letter_count == 0 { + // No letters, check for numbers/symbols + // Strings with only numbers/symbols might still be legitimate + return 0.6; + } + + let vowel_ratio = vowel_count as f32 / letter_count as f32; + + // Check vowel ratio + if vowel_ratio < self.min_vowel_ratio { + // Consonant-heavy (might be noise or non-English) + return 0.5; + } + if vowel_ratio > self.max_vowel_ratio { + // Vowel-heavy (likely noise) + return 0.3; + } + + // Check for common English bigrams + let common_bigrams = ["th", "he", "in", "er", "an", "re", "on", "at", "en", "nd"]; + let text_lower = text.to_ascii_lowercase(); + let mut bigram_count = 0; + for bigram in &common_bigrams { + if text_lower.contains(bigram) { + bigram_count += 1; + } + } + + // Good vowel ratio and some common bigrams + if (0.2..=0.8).contains(&vowel_ratio) && bigram_count > 0 { + 1.0 // High confidence + } else if (0.1..=0.9).contains(&vowel_ratio) { + 0.7 // Moderate confidence + } else { + 0.4 // Lower confidence + } + } +} + +/// Length-based filter +/// +/// Penalizes excessively long strings (likely table data) and very short +/// strings in low-weight sections. +pub struct LengthFilter { + /// Maximum length before penalty + pub max_length: usize, +} + +impl LengthFilter { + /// Create a new LengthFilter with custom threshold + pub fn new(max_length: usize) -> Self { + Self { max_length } + } +} + +impl NoiseFilter for LengthFilter { + fn calculate_confidence(&self, text: &str, context: &FilterContext) -> f32 { + let len = text.len(); + + // Excessively long strings are likely table data + if len > self.max_length { + return 0.3; // Low confidence + } + + // Very short strings in low-weight sections are suspicious + if len < 4 && context.section_weight < 0.5 { + return 0.5; // Moderate confidence + } + + // Normal length strings + if (4..=100).contains(&len) { + 1.0 // High confidence + } else if (100..=self.max_length).contains(&len) { + 0.7 // Moderate confidence (long but acceptable) + } else { + 0.6 // Lower confidence + } + } +} + +/// Repetition detection filter +/// +/// Detects repeated character patterns (e.g., "AAAA", "0000") and +/// repeated substrings (e.g., "abcabcabc"). +pub struct RepetitionFilter { + /// Maximum ratio of repeated characters + pub max_repetition_ratio: f32, +} + +impl RepetitionFilter { + /// Create a new RepetitionFilter with custom threshold + pub fn new(max_repetition_ratio: f32) -> Self { + Self { + max_repetition_ratio, + } + } +} + +impl NoiseFilter for RepetitionFilter { + fn calculate_confidence(&self, text: &str, _context: &FilterContext) -> f32 { + if text.is_empty() { + return 0.0; + } + + let chars: Vec = text.chars().collect(); + let total = chars.len() as f32; + + // Check for repeated characters + let mut char_counts = std::collections::HashMap::new(); + for &ch in &chars { + *char_counts.entry(ch).or_insert(0) += 1; + } + + let max_char_count = char_counts.values().max().copied().unwrap_or(0) as f32; + let max_char_ratio = max_char_count / total; + + if max_char_ratio > self.max_repetition_ratio { + return 0.1; // Very low confidence (likely padding) + } + + // Check for repeated substrings (optimized to avoid O(n^3)) + // Cap pattern_len to a small bound (8-16) to avoid excessive computation + let max_pattern_len = (total as usize / 3).min(16).min(chars.len()); + + if total >= 6.0 && max_pattern_len > 0 { + // Early exit optimization: if we can't possibly get 3 repetitions, skip + let min_pattern_len_for_3_reps = ((total as usize) as f32 / 3.0).ceil() as usize; + if min_pattern_len_for_3_reps > max_pattern_len { + return 1.0; // Can't have 3 repetitions, so no issue + } + + // Check patterns starting from length 1 up to max_pattern_len + for pattern_len in 1..=max_pattern_len { + // Early exit: if pattern_len is too large to repeat 3 times, skip + if pattern_len * 3 > chars.len() { + break; + } + + // Use slice comparison instead of constructing String + let pattern_slice = &chars[0..pattern_len]; + let mut count = 1; // First occurrence + let mut pos = pattern_len; + + // Check for repetitions + while pos + pattern_len <= chars.len() && count < 3 { + let candidate_slice = &chars[pos..pos + pattern_len]; + // Compare slices directly (char comparison) + if pattern_slice == candidate_slice { + count += 1; + pos += pattern_len; + } else { + break; // Pattern broken, try next pattern length + } + } + + if count >= 3 { + return 0.2; // Low confidence (repetitive pattern) + } + } + } + + // No significant repetition + 1.0 + } +} + +/// Context-aware filter +/// +/// Boosts confidence for strings in high-weight sections (.rodata, .rdata, __cstring) +/// and reduces confidence for strings in code sections. Considers section permissions. +pub struct ContextFilter; + +impl NoiseFilter for ContextFilter { + fn calculate_confidence(&self, _text: &str, context: &FilterContext) -> f32 { + // Boost confidence for high-weight sections + match context.section_type { + SectionType::StringData => { + // .rodata, .rdata, __cstring - very likely to contain strings + if !context.is_executable && !context.is_writable { + return 1.0; // Read-only string data section + } + 0.9 // String data section (even if writable) + } + SectionType::ReadOnlyData => { + // Read-only data sections + if !context.is_executable { + return 0.9; + } + 0.7 + } + SectionType::Resources => { + // PE resource sections + 1.0 // Resources are known-good sources + } + SectionType::Code => { + // Code sections - less likely to contain strings + if context.section_weight < 0.3 { + return 0.3; // Low-weight code section + } + 0.5 // Code section with some weight + } + SectionType::WritableData => { + // Writable data sections - moderate confidence + 0.6 + } + SectionType::Debug => { + // Debug sections - may contain strings but lower confidence + 0.5 + } + SectionType::Other => { + // Unknown sections - use section weight as guide + if context.section_weight > 0.7 { + 0.7 + } else if context.section_weight > 0.4 { + 0.5 + } else { + 0.3 + } + } + } + } +} + +/// Composite noise filter +/// +/// Combines multiple filters with configurable weights to produce an overall +/// confidence score. Allows enabling/disabling individual filters. +pub struct CompositeNoiseFilter { + /// Entropy filter + pub entropy_filter: EntropyFilter, + /// Character distribution filter + pub char_distribution_filter: CharDistributionFilter, + /// Linguistic filter + pub linguistic_filter: LinguisticFilter, + /// Length filter + pub length_filter: LengthFilter, + /// Repetition filter + pub repetition_filter: RepetitionFilter, + /// Context filter + pub context_filter: ContextFilter, + /// Filter weights + pub weights: FilterWeights, + /// Whether to enable entropy filter + pub enable_entropy: bool, + /// Whether to enable character distribution filter + pub enable_char_distribution: bool, + /// Whether to enable linguistic filter + pub enable_linguistic: bool, + /// Whether to enable length filter + pub enable_length: bool, + /// Whether to enable repetition filter + pub enable_repetition: bool, + /// Whether to enable context filter + pub enable_context: bool, +} + +impl CompositeNoiseFilter { + /// Create a new CompositeNoiseFilter with default configuration + pub fn new(config: &NoiseFilterConfig) -> Self { + Self { + entropy_filter: EntropyFilter::new(config.entropy_min, config.entropy_max), + char_distribution_filter: CharDistributionFilter, + linguistic_filter: LinguisticFilter::new( + config.min_vowel_ratio, + config.max_vowel_ratio, + ), + length_filter: LengthFilter::new(config.max_length), + repetition_filter: RepetitionFilter::new(config.max_repetition_ratio), + context_filter: ContextFilter, + weights: config.filter_weights.clone(), + enable_entropy: true, + enable_char_distribution: true, + enable_linguistic: true, + enable_length: true, + enable_repetition: true, + enable_context: true, + } + } + + /// Calculate overall confidence score by combining all enabled filters + pub fn calculate_confidence(&self, text: &str, context: &FilterContext) -> f32 { + let mut total_weight = 0.0; + let mut weighted_sum = 0.0; + + if self.enable_entropy { + let score = self.entropy_filter.calculate_confidence(text, context); + weighted_sum += score * self.weights.entropy_weight; + total_weight += self.weights.entropy_weight; + } + + if self.enable_char_distribution { + let score = self + .char_distribution_filter + .calculate_confidence(text, context); + weighted_sum += score * self.weights.char_distribution_weight; + total_weight += self.weights.char_distribution_weight; + } + + if self.enable_linguistic { + let score = self.linguistic_filter.calculate_confidence(text, context); + weighted_sum += score * self.weights.linguistic_weight; + total_weight += self.weights.linguistic_weight; + } + + if self.enable_length { + let score = self.length_filter.calculate_confidence(text, context); + weighted_sum += score * self.weights.length_weight; + total_weight += self.weights.length_weight; + } + + if self.enable_repetition { + let score = self.repetition_filter.calculate_confidence(text, context); + weighted_sum += score * self.weights.repetition_weight; + total_weight += self.weights.repetition_weight; + } + + if self.enable_context { + let score = self.context_filter.calculate_confidence(text, context); + weighted_sum += score * self.weights.context_weight; + total_weight += self.weights.context_weight; + } + + // Normalize by total weight (in case some filters are disabled) + if total_weight > 0.0 { + weighted_sum / total_weight + } else { + 0.5 // Default if all filters disabled + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_char_distribution_filter() { + let filter = CharDistributionFilter; + let context = FilterContext::default(); + + // Normal text + assert!(filter.calculate_confidence("Hello, World!", &context) > 0.7); + + // Excessive punctuation + assert!(filter.calculate_confidence("!!!@@@###$$$", &context) < 0.5); + + // Repeated character + assert!(filter.calculate_confidence("AAAA", &context) < 0.5); + } + + #[test] + fn test_entropy_filter() { + let filter = EntropyFilter::new(1.5, 7.5); + let context = FilterContext::default(); + + // Normal text + assert!(filter.calculate_confidence("Hello, World!", &context) > 0.5); + + // Low entropy (repetition) + assert!(filter.calculate_confidence("AAAA", &context) < 0.5); + + // High entropy (random-like string with many different characters) + // Note: This string may not always have entropy > 7.5 due to repetition of patterns + // The test verifies that very high entropy strings get lower confidence + let random = "!@#$%^&*()_+-=[]{}|;':\",./<>?`~abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let random_confidence = filter.calculate_confidence(random, &context); + // High entropy strings should have lower confidence than normal text + let normal_confidence = filter.calculate_confidence("Hello, World!", &context); + assert!( + random_confidence < normal_confidence, + "High entropy string should have lower confidence than normal text (random: {}, normal: {})", + random_confidence, + normal_confidence + ); + } + + #[test] + fn test_linguistic_filter() { + let filter = LinguisticFilter::new(0.1, 0.9); + let context = FilterContext::default(); + + // Normal English text + assert!(filter.calculate_confidence("Hello world", &context) > 0.7); + + // Consonant-heavy + assert!(filter.calculate_confidence("bcdfghjklmnpqrstvwxyz", &context) < 0.7); + + // Vowel-heavy + assert!(filter.calculate_confidence("aeiouaeiou", &context) < 0.7); + } + + #[test] + fn test_length_filter() { + let filter = LengthFilter::new(200); + let context = FilterContext::default(); + + // Normal length + assert!(filter.calculate_confidence("Hello", &context) > 0.7); + + // Very long + let long_string = "A".repeat(300); + assert!(filter.calculate_confidence(&long_string, &context) < 0.5); + + // Very short in low-weight section + let low_weight_context = FilterContext { + section_weight: 0.3, + ..Default::default() + }; + assert!(filter.calculate_confidence("Hi", &low_weight_context) < 0.7); + } + + #[test] + fn test_repetition_filter() { + let filter = RepetitionFilter::new(0.7); + let context = FilterContext::default(); + + // Normal text + assert!(filter.calculate_confidence("Hello", &context) > 0.7); + + // Repeated characters + assert!(filter.calculate_confidence("AAAA", &context) < 0.5); + + // Repeated pattern + assert!(filter.calculate_confidence("abcabcabc", &context) < 0.5); + } + + #[test] + fn test_context_filter() { + let filter = ContextFilter; + + // String data section + let context = FilterContext { + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + ..Default::default() + }; + assert!(filter.calculate_confidence("test", &context) > 0.8); + + // Code section + let context = FilterContext { + section_type: SectionType::Code, + section_weight: 0.1, + ..Default::default() + }; + assert!(filter.calculate_confidence("test", &context) < 0.5); + } + + #[test] + fn test_composite_filter() { + let config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&config); + let context = FilterContext::default(); + + // Normal text should have high confidence + let score = filter.calculate_confidence("Hello, World!", &context); + assert!(score > 0.5); + + // Noise should have low confidence + let noise_score = filter.calculate_confidence("AAAA", &context); + assert!(noise_score < score); + } +} diff --git a/src/extraction/macho_load_commands.rs b/src/extraction/macho_load_commands.rs index c344bbb..35a3254 100644 --- a/src/extraction/macho_load_commands.rs +++ b/src/extraction/macho_load_commands.rs @@ -108,6 +108,7 @@ fn extract_dylib_strings(macho: &MachO) -> Vec { rva: None, length, score: 0, + confidence: 1.0, }); } @@ -136,6 +137,7 @@ fn extract_rpath_strings(macho: &MachO) -> Vec { rva: None, length, score: 0, + confidence: 1.0, }); } diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs index 91c99cb..9769b75 100644 --- a/src/extraction/mod.rs +++ b/src/extraction/mod.rs @@ -4,6 +4,19 @@ //! Each extractor is designed to work with a specific binary format and leverage //! format-specific knowledge to extract meaningful strings. //! +//! ## Core String Extraction Framework +//! +//! The core extraction framework provides a trait-based architecture for extracting +//! strings from binary data: +//! +//! - `StringExtractor`: Trait defining extraction methods +//! - `ExtractionConfig`: Configuration for controlling extraction behavior +//! - `BasicExtractor`: Sequential ASCII/UTF-8 string scanner implementation +//! +//! **Note**: These types (`StringExtractor`, `ExtractionConfig`, `BasicExtractor`) are +//! defined locally in this module and should not be imported within `extraction/mod.rs`. +//! Downstream code should import them from `stringy::extraction` or `stringy` (via re-exports). +//! //! ## PE Resource String Extraction (Phase 2 Complete) //! //! The PE resource extraction module now provides comprehensive string extraction: @@ -11,6 +24,30 @@ //! - `extract_resources()`: Returns resource metadata (Phase 1) //! - `extract_resource_strings()`: Returns actual strings from resources (Phase 2) //! +//! ## ASCII String Extraction +//! +//! The ASCII extraction module provides foundational encoding extraction for StringyMcStringFace. +//! It implements byte-level scanning for contiguous printable ASCII sequences and serves as the +//! reference implementation for future UTF-8, UTF-16LE, and UTF-16BE extractors. +//! +//! - `extract_ascii_strings()`: Basic byte-level ASCII string scanning +//! - `extract_from_section()`: Section-aware extraction with proper metadata population +//! - `AsciiExtractionConfig`: Configuration for minimum/maximum length filtering +//! +//! # ASCII Extraction Example +//! +//! ```rust +//! use stringy::extraction::ascii::{extract_ascii_strings, AsciiExtractionConfig}; +//! +//! let data = b"Hello\0World\0Test123"; +//! let config = AsciiExtractionConfig::default(); +//! let strings = extract_ascii_strings(data, &config); +//! +//! for string in strings { +//! println!("Found: {} at offset {}", string.text, string.offset); +//! } +//! ``` +//! //! ## Mach-O Load Command String Extraction //! //! The Mach-O load command extraction module extracts library dependencies and runtime @@ -22,23 +59,1116 @@ //! # Example //! //! ```rust -//! use stringy::extraction::{extract_resources, extract_resource_strings, extract_load_command_strings}; +//! use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +//! use stringy::container::{detect_format, create_parser}; +//! +//! let data = std::fs::read("example.exe")?; +//! let format = detect_format(&data); +//! let parser = create_parser(format)?; +//! let container_info = parser.parse(&data)?; +//! +//! let extractor = BasicExtractor::new(); +//! let config = ExtractionConfig::default(); +//! let strings = extractor.extract(&data, &container_info, &config)?; //! -//! let pe_data = std::fs::read("example.exe")?; +//! // Format-specific extractors +//! use stringy::extraction::{ +//! extract_ascii_strings, extract_load_command_strings, extract_resources, +//! extract_resource_strings, AsciiExtractionConfig, +//! }; +//! +//! // ASCII extraction +//! let ascii_config = AsciiExtractionConfig::default(); +//! let ascii_strings = extract_ascii_strings(&data, &ascii_config); //! //! // Phase 1: Get resource metadata -//! let metadata = extract_resources(&pe_data); +//! let metadata = extract_resources(&data); //! //! // Phase 2: Extract actual strings from resources -//! let strings = extract_resource_strings(&pe_data); +//! let resource_strings = extract_resource_strings(&data); //! //! // Mach-O load command extraction //! let macho_data = std::fs::read("example.dylib")?; //! let load_command_strings = extract_load_command_strings(&macho_data); //! ``` +use crate::types::{ + ContainerInfo, Encoding, FoundString, Result, SectionInfo, SectionType, StringSource, +}; + +pub mod ascii; +pub mod config; +pub mod filters; pub mod macho_load_commands; pub mod pe_resources; +pub use ascii::{AsciiExtractionConfig, extract_ascii_strings, extract_from_section}; +pub use config::{FilterWeights, NoiseFilterConfig}; +pub use filters::{CompositeNoiseFilter, FilterContext, NoiseFilter}; pub use macho_load_commands::extract_load_command_strings; pub use pe_resources::{extract_resource_strings, extract_resources}; + +/// Configuration for string extraction +/// +/// Controls various aspects of the extraction process including minimum/maximum +/// string lengths, encoding selection, section filtering, and noise filtering. +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::ExtractionConfig; +/// +/// // Use default configuration +/// let config = ExtractionConfig::default(); +/// +/// // Customize configuration +/// let mut config = ExtractionConfig::default(); +/// config.min_length = 8; +/// config.max_length = 2048; +/// config.scan_code_sections = false; +/// config.noise_filtering_enabled = true; +/// config.min_confidence_threshold = 0.6; +/// ``` +#[derive(Debug, Clone)] +pub struct ExtractionConfig { + /// Minimum string length in bytes (default: 4) + pub min_length: usize, + /// Maximum string length in bytes (default: 4096) + pub max_length: usize, + /// Encodings to search for (default: ASCII, UTF-8) + pub encodings: Vec, + /// Whether to scan executable sections (default: true) + pub scan_code_sections: bool, + /// Whether to include debug sections (default: false) + pub include_debug: bool, + /// Section types to prioritize (default: StringData, ReadOnlyData, Resources) + pub section_priority: Vec, + /// Whether to include import/export names (default: true) + pub include_symbols: bool, + /// Minimum length for ASCII strings (default: 4, same as min_length) + pub min_ascii_length: usize, + /// Minimum length for UTF-16 strings (default: 3, for future use) + pub min_wide_length: usize, + /// Which encodings to extract (default: ASCII, UTF-8) + pub enabled_encodings: Vec, + /// Enable/disable noise filtering (default: true) + pub noise_filtering_enabled: bool, + /// Minimum confidence threshold to include string (default: 0.5) + /// + /// Strings with confidence below this threshold will be filtered out. + pub min_confidence_threshold: f32, +} + +impl Default for ExtractionConfig { + fn default() -> Self { + Self { + min_length: 4, + max_length: 4096, + encodings: vec![Encoding::Ascii, Encoding::Utf8], + scan_code_sections: true, + include_debug: false, + section_priority: vec![ + SectionType::StringData, + SectionType::ReadOnlyData, + SectionType::Resources, + ], + include_symbols: true, + min_ascii_length: 4, + min_wide_length: 3, + enabled_encodings: vec![Encoding::Ascii, Encoding::Utf8], + noise_filtering_enabled: true, + min_confidence_threshold: 0.5, + } + } +} + +impl ExtractionConfig { + /// Validate the configuration + /// + /// Returns an error if any thresholds are invalid. + pub fn validate(&self) -> Result<()> { + if self.min_length == 0 { + return Err(crate::types::StringyError::ConfigError( + "min_length must be greater than 0".to_string(), + )); + } + if self.min_ascii_length == 0 { + return Err(crate::types::StringyError::ConfigError( + "min_ascii_length must be greater than 0".to_string(), + )); + } + if self.min_wide_length == 0 { + return Err(crate::types::StringyError::ConfigError( + "min_wide_length must be greater than 0".to_string(), + )); + } + if !(0.0..=1.0).contains(&self.min_confidence_threshold) { + return Err(crate::types::StringyError::ConfigError( + "min_confidence_threshold must be between 0.0 and 1.0".to_string(), + )); + } + Ok(()) + } +} + +/// Trait for extracting strings from binary data +/// +/// Implementations of this trait provide different strategies for extracting +/// strings from binary files, ranging from simple sequential scanning to +/// format-specific extraction algorithms. +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +/// use stringy::container::{detect_format, create_parser}; +/// +/// let data = std::fs::read("binary_file")?; +/// let format = detect_format(&data); +/// let parser = create_parser(format)?; +/// let container_info = parser.parse(&data)?; +/// +/// let extractor = BasicExtractor::new(); +/// let config = ExtractionConfig::default(); +/// let strings = extractor.extract(&data, &container_info, &config)?; +/// ``` +pub trait StringExtractor { + /// Extract strings from entire binary using container metadata + /// + /// This method iterates through all sections in the container and extracts + /// strings from each section based on the provided configuration. + /// + /// # Arguments + /// + /// * `data` - Raw binary data + /// * `container_info` - Container metadata including sections + /// * `config` - Extraction configuration + /// + /// # Returns + /// + /// Vector of found strings with metadata + fn extract( + &self, + data: &[u8], + container_info: &ContainerInfo, + config: &ExtractionConfig, + ) -> Result>; + + /// Extract strings from a specific section + /// + /// This method extracts strings from a single section, useful for targeted + /// extraction or when working with individual sections. + /// + /// # Arguments + /// + /// * `data` - Raw binary data + /// * `section` - Section metadata + /// * `config` - Extraction configuration + /// + /// # Returns + /// + /// Vector of found strings from the section + fn extract_from_section( + &self, + data: &[u8], + section: &SectionInfo, + config: &ExtractionConfig, + ) -> Result>; +} + +/// Basic sequential string extractor +/// +/// Implements a simple sequential scanning algorithm for extracting ASCII and +/// UTF-8 strings from binary data. This extractor scans byte sequences looking +/// for printable characters and validates UTF-8 encoding. +/// +/// # Example +/// +/// ```rust +/// use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +/// use stringy::types::{ContainerInfo, SectionInfo, SectionType, BinaryFormat}; +/// +/// let extractor = BasicExtractor::new(); +/// let config = ExtractionConfig::default(); +/// +/// // Create a simple container info for testing +/// let section = SectionInfo { +/// name: ".rodata".to_string(), +/// offset: 0, +/// size: 100, +/// rva: Some(0x1000), +/// section_type: SectionType::StringData, +/// is_executable: false, +/// is_writable: false, +/// weight: 1.0, +/// }; +/// +/// let container_info = ContainerInfo::new( +/// BinaryFormat::Elf, +/// vec![section], +/// vec![], +/// vec![], +/// None, +/// ); +/// +/// let data = b"Hello World\0Test String\0"; +/// let strings = extractor.extract(data, &container_info, &config)?; +/// ``` +#[derive(Debug, Clone)] +pub struct BasicExtractor; + +impl BasicExtractor { + /// Create a new BasicExtractor instance + pub fn new() -> Self { + Self + } +} + +impl Default for BasicExtractor { + fn default() -> Self { + Self::new() + } +} + +impl StringExtractor for BasicExtractor { + fn extract( + &self, + data: &[u8], + container_info: &ContainerInfo, + config: &ExtractionConfig, + ) -> Result> { + let mut all_strings = Vec::new(); + + // Sort sections by priority from config.section_priority + let mut sections: Vec<_> = container_info.sections.iter().collect(); + sections.sort_by_key(|section| { + config + .section_priority + .iter() + .position(|&st| st == section.section_type) + .unwrap_or_else(|| { + // Fallback to section weight (higher weight = higher priority) + // Convert weight to usize for consistent key type + // Use a large offset to ensure fallback sections sort after prioritized ones + let weight_int = (section.weight * 1000.0) as usize; + config.section_priority.len() + (10000 - weight_int.min(10000)) + }) + }); + + for section in sections { + // Filter sections based on config + if section.section_type == SectionType::Debug && !config.include_debug { + continue; + } + + // Filter code sections by both type and executable flag + if (section.section_type == SectionType::Code || section.is_executable) + && !config.scan_code_sections + { + continue; + } + + // Extract strings from this section + let section_strings = self.extract_from_section(data, section, config)?; + all_strings.extend(section_strings); + } + + // Include import/export symbols if configured + if config.include_symbols { + // Add import names + for import in &container_info.imports { + let length = import.name.len() as u32; + all_strings.push(FoundString { + text: import.name.clone(), + encoding: Encoding::Utf8, + offset: 0, + rva: None, + section: None, + length, + tags: Vec::new(), + score: 0, + source: StringSource::ImportName, + confidence: 1.0, + }); + } + + // Add export names + for export in &container_info.exports { + let length = export.name.len() as u32; + all_strings.push(FoundString { + text: export.name.clone(), + encoding: Encoding::Utf8, + offset: 0, + rva: None, + section: None, + length, + tags: Vec::new(), + score: 0, + source: StringSource::ExportName, + confidence: 1.0, + }); + } + } + + Ok(all_strings) + } + + fn extract_from_section( + &self, + data: &[u8], + section: &SectionInfo, + config: &ExtractionConfig, + ) -> Result> { + // Early return for zero-sized sections + if section.size == 0 { + return Ok(Vec::new()); + } + + // Validate section bounds + let section_offset = section.offset as usize; + let section_size = section.size as usize; + + if section_offset >= data.len() { + return Ok(Vec::new()); + } + + let end_offset = section_offset + .checked_add(section_size) + .unwrap_or(data.len()) + .min(data.len()); + + let section_data = &data[section_offset..end_offset]; + + // Use ASCII extractor for ASCII strings + let ascii_config = ascii::AsciiExtractionConfig { + min_length: config.min_ascii_length.max(config.min_length), + max_length: Some(config.max_length), + }; + + // Build noise filter config from extraction config + let noise_filter_config = if config.noise_filtering_enabled { + Some(crate::extraction::config::NoiseFilterConfig::default()) + } else { + None + }; + + // Extract ASCII strings using the dedicated ASCII extractor with filtering + let mut found_strings = ascii::extract_from_section( + section, + data, + &ascii_config, + noise_filter_config.as_ref(), + config.noise_filtering_enabled, + config.min_confidence_threshold, + ); + + // For UTF-8 strings, use the existing helper (only if UTF-8 is enabled) + // Check both encodings and enabled_encodings fields + let utf8_enabled = config.encodings.contains(&Encoding::Utf8) + || config.enabled_encodings.contains(&Encoding::Utf8); + if utf8_enabled { + let raw_strings = + extract_ascii_utf8_strings(section_data, config.min_length, config.max_length); + + // Build filter context for UTF-8 strings + let filter_context = crate::extraction::filters::FilterContext::from_section(section); + let filter = if config.noise_filtering_enabled { + noise_filter_config + .as_ref() + .map(crate::extraction::filters::CompositeNoiseFilter::new) + } else { + None + }; + + for (text, relative_offset, length) in raw_strings { + // Skip if already extracted as ASCII + if text.is_ascii() { + continue; + } + + // Determine encoding + let encoding = Encoding::Utf8; + + // Filter by configured encodings (check both fields) + let encoding_allowed = config.encodings.contains(&encoding) + || config.enabled_encodings.contains(&encoding); + if !encoding_allowed { + continue; + } + + // Compute confidence if filtering is enabled + let confidence = if let Some(ref noise_filter) = filter { + noise_filter.calculate_confidence(&text, &filter_context) + } else { + 1.0 + }; + + // Apply threshold filtering + if config.noise_filtering_enabled && confidence < config.min_confidence_threshold { + continue; + } + + // Calculate absolute offset + let absolute_offset = section.offset + relative_offset as u64; + + // Calculate RVA if available + let rva = section + .rva + .map(|base_rva| base_rva + relative_offset as u64); + + let found_string = FoundString { + text, + encoding, + offset: absolute_offset, + rva, + section: Some(section.name.clone()), + length: length as u32, + tags: Vec::new(), + score: 0, + source: StringSource::SectionData, + confidence, + }; + + found_strings.push(found_string); + } + } + + Ok(found_strings) + } +} + +/// Check if a byte is printable text (ASCII or common whitespace) +/// +/// Printable text includes characters from 0x20 (space) to 0x7E (~), +/// plus common whitespace characters: tab (0x09), newline (0x0A), and +/// carriage return (0x0D). +/// +/// **Note on printable character definitions**: This function is used by the UTF-8-capable +/// extraction helpers and includes common whitespace characters (tab, newline, carriage return) +/// to handle text files and formatted data. This differs from the ASCII-only `is_printable_ascii` +/// function in `extraction::ascii`, which only considers the strict printable range (0x20-0x7E) +/// without whitespace control characters. This difference ensures that: +/// - ASCII-only extraction (`extraction::ascii`) produces strict, predictable results +/// - UTF-8-capable extraction (this module) can handle formatted text with line breaks +/// +/// When using both extractors on the same data, be aware that they may produce different +/// results due to this definitional difference. +fn is_printable_text_byte(byte: u8) -> bool { + matches!(byte, 0x09 | 0x0A | 0x0D | 0x20..=0x7E) +} + +/// Check if a byte could be part of a valid UTF-8 sequence +/// +/// This includes printable ASCII, UTF-8 continuation bytes (0x80-0xBF), +/// and UTF-8 start bytes (0xC2-0xF4 for valid UTF-8 sequences). +fn could_be_utf8_byte(byte: u8) -> bool { + is_printable_text_byte(byte) || matches!(byte, 0x80..=0xBF | 0xC2..=0xF4) +} + +/// Extract ASCII and UTF-8 strings from byte data +/// +/// Scans through the byte data looking for sequences of printable characters +/// and valid UTF-8 sequences. When a byte that cannot be part of a valid +/// string is encountered, checks if the accumulated sequence meets the minimum +/// length requirement and validates it as UTF-8. Strings exceeding max_length +/// are skipped during extraction. +/// +/// # Arguments +/// +/// * `data` - Byte slice to scan +/// * `min_length` - Minimum string length in bytes +/// * `max_length` - Maximum string length in bytes +/// +/// # Returns +/// +/// Vector of tuples containing (text, relative_offset, length) +fn extract_ascii_utf8_strings( + data: &[u8], + min_length: usize, + max_length: usize, +) -> Vec<(String, usize, usize)> { + let mut strings = Vec::new(); + let mut current_string_start: Option = None; + let mut current_string_bytes = Vec::new(); + + for (i, &byte) in data.iter().enumerate() { + if could_be_utf8_byte(byte) { + if current_string_start.is_none() { + current_string_start = Some(i); + } + current_string_bytes.push(byte); + } else { + // End of current string candidate + // Check length conditions first, then extract start to avoid borrow checker issues + #[allow(clippy::collapsible_if)] + if current_string_bytes.len() >= min_length && current_string_bytes.len() <= max_length + { + if let Some(start) = current_string_start { + // Store length before moving + let len = current_string_bytes.len(); + // Move buffer out to avoid cloning + let bytes = std::mem::take(&mut current_string_bytes); + // Try to convert to UTF-8 string + match String::from_utf8(bytes) { + Ok(text) => { + // Create entry tuple to move text into it explicitly + let entry = (text, start, len); + strings.push(entry); + } + Err(_) => { + // Invalid UTF-8, skip this candidate + } + } + } + } + current_string_start = None; + current_string_bytes.clear(); + } + } + + // Handle string at end of data + // Check length conditions first, then extract start to avoid borrow checker issues + #[allow(clippy::collapsible_if)] + if current_string_bytes.len() >= min_length && current_string_bytes.len() <= max_length { + if let Some(start) = current_string_start { + // Store length before moving + let len = current_string_bytes.len(); + // Move buffer out to avoid cloning + let bytes = std::mem::take(&mut current_string_bytes); + match String::from_utf8(bytes) { + Ok(text) => { + // Create entry tuple to move text into it explicitly + let entry = (text, start, len); + strings.push(entry); + } + Err(_) => { + // Invalid UTF-8, skip + } + } + } + } + + strings +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{BinaryFormat, ExportInfo, ImportInfo, SectionType}; + + #[test] + fn test_is_printable_text_byte() { + // Printable ASCII + assert!(is_printable_text_byte(b' ')); + assert!(is_printable_text_byte(b'A')); + assert!(is_printable_text_byte(b'z')); + assert!(is_printable_text_byte(b'0')); + assert!(is_printable_text_byte(b'9')); + assert!(is_printable_text_byte(b'~')); + + // Common whitespace + assert!(is_printable_text_byte(b'\t')); + assert!(is_printable_text_byte(b'\n')); + assert!(is_printable_text_byte(b'\r')); + + // Non-printable + assert!(!is_printable_text_byte(0x00)); + assert!(!is_printable_text_byte(0x1F)); + assert!(!is_printable_text_byte(0x7F)); + assert!(!is_printable_text_byte(0xFF)); + } + + #[test] + fn test_extract_ascii_utf8_strings() { + // Test with ASCII strings + let data = b"Hello\0World\0Test123"; + let strings = extract_ascii_utf8_strings(data, 4, 4096); + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].0, "Hello"); + assert_eq!(strings[0].1, 0); + assert_eq!(strings[1].0, "World"); + assert_eq!(strings[1].1, 6); + assert_eq!(strings[2].0, "Test123"); + assert_eq!(strings[2].1, 12); + } + + #[test] + fn test_extract_ascii_utf8_strings_utf8() { + // Test with UTF-8 strings + let data = "Hello 世界\0Test".as_bytes(); + let strings = extract_ascii_utf8_strings(data, 4, 4096); + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].0, "Hello 世界"); + assert_eq!(strings[1].0, "Test"); + } + + #[test] + fn test_extract_ascii_utf8_strings_min_length() { + // Test minimum length filtering + let data = b"Hi\0Test\0AB\0LongString"; + let strings = extract_ascii_utf8_strings(data, 4, 4096); + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].0, "Test"); + assert_eq!(strings[1].0, "LongString"); + } + + #[test] + fn test_extract_ascii_utf8_strings_empty() { + // Test with empty data + let data = b""; + let strings = extract_ascii_utf8_strings(data, 4, 4096); + assert!(strings.is_empty()); + } + + #[test] + fn test_extract_ascii_utf8_strings_binary() { + // Test with binary data + let data = &[0x00, 0xFF, 0x01, 0x02, 0x03]; + let strings = extract_ascii_utf8_strings(data, 4, 4096); + assert!(strings.is_empty()); + } + + #[test] + fn test_extract_ascii_utf8_strings_at_boundaries() { + // Test strings at start and end + let data = b"Start\0Middle\0EndTest"; + let strings = extract_ascii_utf8_strings(data, 4, 4096); + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].0, "Start"); + assert_eq!(strings[0].1, 0); + assert_eq!(strings[2].0, "EndTest"); + } + + #[test] + fn test_extract_ascii_utf8_strings_max_length() { + // Test maximum length filtering in helper + let data = b"Short\0VeryLongStringHere"; + let strings = extract_ascii_utf8_strings(data, 4, 10); + // Only "Short" should pass max_length filter + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].0, "Short"); + assert!(!strings.iter().any(|s| s.0 == "VeryLongStringHere")); + } + + #[test] + fn test_extraction_config_default() { + let config = ExtractionConfig::default(); + assert_eq!(config.min_length, 4); + assert_eq!(config.max_length, 4096); + assert_eq!(config.encodings.len(), 2); + assert!(config.encodings.contains(&Encoding::Ascii)); + assert!(config.encodings.contains(&Encoding::Utf8)); + assert!(config.scan_code_sections); + assert!(!config.include_debug); + assert_eq!(config.section_priority.len(), 3); + assert!(config.include_symbols); + } + + #[test] + fn test_basic_extractor_extract_from_section() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 20, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = b"Hello World\0Test"; + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Hello World"); + assert_eq!(strings[0].offset, 0); + assert_eq!(strings[0].rva, Some(0x1000)); + assert_eq!(strings[0].section, Some(".rodata".to_string())); + assert_eq!(strings[0].encoding, Encoding::Ascii); + assert_eq!(strings[1].text, "Test"); + assert_eq!(strings[1].offset, 12); + assert_eq!(strings[1].rva, Some(0x100C)); + } + + #[test] + fn test_basic_extractor_max_length_filtering() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + max_length: 10, + ..Default::default() + }; + + let section = SectionInfo { + name: ".data".to_string(), + offset: 0, + size: 30, + rva: None, + section_type: SectionType::WritableData, + is_executable: false, + is_writable: true, + weight: 0.5, + }; + + let data = b"Short\0VeryLongStringHere"; + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Only "Short" should pass max_length filter + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Short"); + } + + #[test] + fn test_basic_extractor_section_bounds() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + let section = SectionInfo { + name: ".text".to_string(), + offset: 7, // Start after "prefix\0" + size: 12, // "Hello World" is 11 bytes + null terminator + rva: Some(0x2000), + section_type: SectionType::Code, + is_executable: true, + is_writable: false, + weight: 0.1, + }; + + let data = b"prefix\0Hello World\0suffix"; + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should find "Hello World" in the section + assert!(!strings.is_empty()); + let hello_world = strings.iter().find(|s| s.text == "Hello World"); + assert!(hello_world.is_some(), "Should find 'Hello World' string"); + if let Some(s) = hello_world { + assert_eq!(s.offset, 7); + assert_eq!(s.rva, Some(0x2000)); + } + } + + #[test] + fn test_basic_extractor_empty_section() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + let section = SectionInfo { + name: ".empty".to_string(), + offset: 0, + size: 0, + rva: None, + section_type: SectionType::Other, + is_executable: false, + is_writable: false, + weight: 0.0, + }; + + let data = b""; + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + assert!(strings.is_empty()); + } + + #[test] + fn test_basic_extractor_section_out_of_bounds() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + let section = SectionInfo { + name: ".invalid".to_string(), + offset: 1000, + size: 100, + rva: None, + section_type: SectionType::Other, + is_executable: false, + is_writable: false, + weight: 0.0, + }; + + let data = b"small data"; + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + assert!(strings.is_empty()); + } + + #[test] + fn test_basic_extractor_utf8_encoding() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 20, + rva: None, + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = "Hello 世界".as_bytes(); + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should extract UTF-8 string "Hello 世界" + // Note: ASCII extractor may also extract "Hello " as a prefix, but UTF-8 extractor + // will extract the full "Hello 世界" string. We check for the UTF-8 string. + let utf8_strings: Vec<_> = strings + .iter() + .filter(|s| s.encoding == Encoding::Utf8 && s.text == "Hello 世界") + .collect(); + assert_eq!( + utf8_strings.len(), + 1, + "Should find UTF-8 string 'Hello 世界', found {} strings total", + strings.len() + ); + assert_eq!(utf8_strings[0].text, "Hello 世界"); + assert_eq!(utf8_strings[0].encoding, Encoding::Utf8); + } + + #[test] + fn test_basic_extractor_encoding_filtering() { + let extractor = BasicExtractor::new(); + // Only allow ASCII, exclude UTF-8 + let config = ExtractionConfig { + encodings: vec![Encoding::Ascii], + enabled_encodings: vec![Encoding::Ascii], + ..Default::default() + }; + + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 30, + rva: None, + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = "Hello\0世界\0Test".as_bytes(); + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should only find ASCII strings, not UTF-8 + // Note: "Hello" and "Test" are ASCII, "世界" is UTF-8 and should be filtered + let ascii_strings: Vec<_> = strings + .iter() + .filter(|s| s.encoding == Encoding::Ascii) + .collect(); + assert_eq!(ascii_strings.len(), 2, "Should find 2 ASCII strings"); + assert!(ascii_strings.iter().any(|s| s.text == "Hello")); + assert!(ascii_strings.iter().any(|s| s.text == "Test")); + // UTF-8 string "世界" should be filtered out + assert!(!strings.iter().any(|s| s.text.contains("世界"))); + } + + #[test] + fn test_basic_extractor_include_symbols() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + include_symbols: true, + ..Default::default() + }; + + let section = SectionInfo { + name: ".text".to_string(), + offset: 0, + size: 10, + rva: None, + section_type: SectionType::Code, + is_executable: true, + is_writable: false, + weight: 0.1, + }; + + let container_info = ContainerInfo::new( + BinaryFormat::Elf, + vec![section], + vec![ + ImportInfo { + name: "printf".to_string(), + library: Some("libc.so.6".to_string()), + address: Some(0x1000), + ordinal: None, + }, + ImportInfo { + name: "malloc".to_string(), + library: Some("libc.so.6".to_string()), + address: Some(0x2000), + ordinal: None, + }, + ], + vec![ + ExportInfo { + name: "main".to_string(), + address: 0x3000, + ordinal: None, + }, + ExportInfo { + name: "exported_function".to_string(), + address: 0x4000, + ordinal: None, + }, + ], + None, + ); + + let data = b"test data"; + let strings = extractor.extract(data, &container_info, &config).unwrap(); + + // Should include import and export names + let import_strings: Vec<_> = strings + .iter() + .filter(|s| s.source == StringSource::ImportName) + .collect(); + let export_strings: Vec<_> = strings + .iter() + .filter(|s| s.source == StringSource::ExportName) + .collect(); + + assert_eq!(import_strings.len(), 2); + assert!(import_strings.iter().any(|s| s.text == "printf")); + assert!(import_strings.iter().any(|s| s.text == "malloc")); + + assert_eq!(export_strings.len(), 2); + assert!(export_strings.iter().any(|s| s.text == "main")); + assert!(export_strings.iter().any(|s| s.text == "exported_function")); + + // Verify import string properties + let printf_str = import_strings.iter().find(|s| s.text == "printf").unwrap(); + assert_eq!(printf_str.encoding, Encoding::Utf8); + assert_eq!(printf_str.offset, 0); + assert_eq!(printf_str.rva, None); + assert_eq!(printf_str.section, None); + assert_eq!(printf_str.length, 6); + + // Verify export string properties + let main_str = export_strings.iter().find(|s| s.text == "main").unwrap(); + assert_eq!(main_str.encoding, Encoding::Utf8); + assert_eq!(main_str.offset, 0); + assert_eq!(main_str.rva, None); + assert_eq!(main_str.section, None); + assert_eq!(main_str.length, 4); + } + + #[test] + fn test_basic_extractor_exclude_symbols() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + include_symbols: false, + ..Default::default() + }; + + let section = SectionInfo { + name: ".text".to_string(), + offset: 0, + size: 10, + rva: None, + section_type: SectionType::Code, + is_executable: true, + is_writable: false, + weight: 0.1, + }; + + let container_info = ContainerInfo::new( + BinaryFormat::Elf, + vec![section], + vec![ImportInfo { + name: "printf".to_string(), + library: Some("libc.so.6".to_string()), + address: Some(0x1000), + ordinal: None, + }], + vec![ExportInfo { + name: "main".to_string(), + address: 0x3000, + ordinal: None, + }], + None, + ); + + let data = b"test data"; + let strings = extractor.extract(data, &container_info, &config).unwrap(); + + // Should not include import/export names + assert!(!strings.iter().any(|s| s.source == StringSource::ImportName)); + assert!(!strings.iter().any(|s| s.source == StringSource::ExportName)); + } + + #[test] + fn test_basic_extractor_section_filtering() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + scan_code_sections: false, + include_debug: false, + ..Default::default() + }; + + let code_section = SectionInfo { + name: ".text".to_string(), + offset: 0, + size: 9, + rva: None, + section_type: SectionType::Code, + is_executable: true, + is_writable: false, + weight: 0.1, + }; + + let debug_section = SectionInfo { + name: ".debug_info".to_string(), + offset: 9, + size: 10, + rva: None, + section_type: SectionType::Debug, + is_executable: false, + is_writable: false, + weight: 0.0, + }; + + let data_section = SectionInfo { + name: ".rodata".to_string(), + offset: 19, + size: 11, + rva: None, + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = b"CodeData\0DebugData\0RoDataTest"; + let container_info = ContainerInfo::new( + BinaryFormat::Elf, + vec![code_section, debug_section, data_section], + vec![], + vec![], + None, + ); + + let strings = extractor.extract(data, &container_info, &config).unwrap(); + + // Should only extract from data section, not code or debug + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "RoDataTest"); + } +} diff --git a/src/extraction/pe_resources.rs b/src/extraction/pe_resources.rs index 5318a9f..eba163c 100644 --- a/src/extraction/pe_resources.rs +++ b/src/extraction/pe_resources.rs @@ -445,6 +445,7 @@ pub fn extract_version_info_strings(data: &[u8]) -> Vec { tags: vec![Tag::Version, Tag::Resource], score: 0, source: StringSource::ResourceString, + confidence: 1.0, }; strings.push(found_string); }); @@ -600,6 +601,7 @@ pub fn extract_string_table_strings(data: &[u8]) -> Vec { tags: vec![Tag::Resource], score: 0, source: StringSource::ResourceString, + confidence: 1.0, }; strings.push(found_string); } @@ -787,6 +789,7 @@ pub fn extract_manifest_strings(data: &[u8]) -> Vec { tags: vec![Tag::Manifest, Tag::Resource], score: 0, source: StringSource::ResourceString, + confidence: 1.0, }; strings.push(found_string); } diff --git a/src/lib.rs b/src/lib.rs index e12e97c..400cfdb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,7 @@ //! //! ```rust //! use stringy::container::{detect_format, create_parser}; +//! use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; //! //! # fn example() -> stringy::Result<()> { //! let data = std::fs::read("binary_file")?; @@ -27,6 +28,18 @@ //! println!("Format: {:?}", container_info.format); //! println!("Sections: {}", container_info.sections.len()); //! println!("Imports: {}", container_info.imports.len()); +//! +//! // Extract strings using the basic extractor +//! let extractor = BasicExtractor::new(); +//! let config = ExtractionConfig::default(); +//! let strings = extractor.extract(&data, &container_info, &config)?; +//! println!("Found {} strings", strings.len()); +//! +//! // ASCII string extraction (foundational encoding type) +//! use stringy::extraction::{extract_ascii_strings, AsciiExtractionConfig}; +//! let ascii_config = AsciiExtractionConfig::default(); +//! let ascii_strings = extract_ascii_strings(&data, &ascii_config); +//! println!("Found {} ASCII strings", ascii_strings.len()); //! # Ok(()) //! # } //! ``` @@ -36,7 +49,8 @@ //! The library is organized into focused modules: //! //! - [`container`]: Binary format detection and parsing (✅ Complete) -//! - [`extraction`]: String extraction algorithms (✅ PE resources complete) +//! - [`extraction`]: String extraction algorithms (✅ ASCII extraction and PE resources complete) +//! - ASCII extraction provides foundational encoding extraction as the reference implementation //! - [`classification`]: Semantic analysis and tagging (🚧 Types defined) //! - [`output`]: Result formatting (🚧 Interfaces ready) //! - [`types`]: Core data structures and error handling (✅ Complete) @@ -57,3 +71,6 @@ pub use types::{ ResourceStringEntry, ResourceStringTable, ResourceType, Result, SectionInfo, SectionType, StringSource, StringyError, Tag, }; + +// Re-export extraction framework types +pub use extraction::{AsciiExtractionConfig, BasicExtractor, ExtractionConfig, StringExtractor}; diff --git a/src/types.rs b/src/types.rs index 5e7209d..bccd80c 100644 --- a/src/types.rs +++ b/src/types.rs @@ -243,6 +243,26 @@ pub struct FoundString { pub score: i32, /// Source of the string (section data, import, etc.) pub source: StringSource, + /// Confidence score from noise filtering (0.0-1.0) + /// + /// This represents how confident we are that the string is legitimate vs noise. + /// A score of 1.0 indicates maximum confidence (e.g., strings from known-good sources + /// like imports, exports, resources). Lower scores indicate potential noise that + /// may need filtering. This is separate from the `score` field, which is used for + /// final ranking (combining section weight, semantic boosts, and noise penalties). + pub confidence: f32, +} + +impl FoundString { + /// Returns true if confidence is high (>= 0.7) + pub fn is_high_confidence(&self) -> bool { + self.confidence >= 0.7 + } + + /// Returns true if confidence is low (< 0.5) + pub fn is_low_confidence(&self) -> bool { + self.confidence < 0.5 + } } /// Error types for the stringy library diff --git a/tests/integration_extraction.rs b/tests/integration_extraction.rs new file mode 100644 index 0000000..1fd82ba --- /dev/null +++ b/tests/integration_extraction.rs @@ -0,0 +1,510 @@ +use std::fs; +use stringy::container::{ContainerParser, ElfParser, PeParser}; +use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +use stringy::types::{Encoding, SectionType, StringSource}; + +fn get_fixture_path(name: &str) -> std::path::PathBuf { + std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join(name) +} + +#[test] +fn test_basic_extractor_ascii_strings() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + // Create test data with embedded ASCII strings + let data = b"prefix\0Hello\0World\0Test123\0suffix"; + let section = stringy::types::SectionInfo { + name: ".rodata".to_string(), + offset: 7, // Start after "prefix\0" + size: 20, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[0].encoding, Encoding::Ascii); + assert_eq!(strings[0].source, StringSource::SectionData); + assert_eq!(strings[1].text, "World"); + assert_eq!(strings[2].text, "Test123"); +} + +#[test] +fn test_basic_extractor_utf8_strings() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + // Create test data with UTF-8 strings + let data = "prefix\0Hello 世界\0Test 测试\0suffix".as_bytes(); + let section = stringy::types::SectionInfo { + name: ".rodata".to_string(), + offset: 7, + size: 30, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should extract UTF-8 strings "Hello 世界" and "Test 测试" + // Note: ASCII extractor may also extract ASCII prefixes, but UTF-8 extractor + // will extract the full UTF-8 strings. We check for the UTF-8 strings. + let utf8_strings: Vec<_> = strings + .iter() + .filter(|s| s.encoding == Encoding::Utf8) + .collect(); + assert!( + utf8_strings.len() >= 2, + "Should find at least 2 UTF-8 strings, found {} UTF-8 strings ({} total)", + utf8_strings.len(), + strings.len() + ); + assert!(utf8_strings.iter().any(|s| s.text == "Hello 世界")); + assert!(utf8_strings.iter().any(|s| s.text == "Test 测试")); +} + +#[test] +fn test_basic_extractor_min_length_filtering() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + min_length: 4, + ..Default::default() + }; + + let data = b"Hi\0Test\0AB\0LongString\0OK"; + let section = stringy::types::SectionInfo { + name: ".data".to_string(), + offset: 0, + size: data.len() as u64, + rva: None, + section_type: SectionType::WritableData, + is_executable: false, + is_writable: true, + weight: 0.5, + }; + + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should only find strings >= 4 characters + assert!(strings.iter().all(|s| s.text.len() >= 4)); + assert!(strings.iter().any(|s| s.text == "Test")); + assert!(strings.iter().any(|s| s.text == "LongString")); + // "Hi" and "AB" should be filtered out + assert!(!strings.iter().any(|s| s.text == "Hi")); + assert!(!strings.iter().any(|s| s.text == "AB")); +} + +#[test] +fn test_basic_extractor_max_length_filtering() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); // max_length = 4096 by default + + // Create a very long string + let long_string = "A".repeat(5000); + let data = format!("Short\0{}\0EndTest", long_string).into_bytes(); + let section = stringy::types::SectionInfo { + name: ".data".to_string(), + offset: 0, + size: data.len() as u64, + rva: None, + section_type: SectionType::WritableData, + is_executable: false, + is_writable: true, + weight: 0.5, + }; + + let strings = extractor + .extract_from_section(&data, §ion, &config) + .unwrap(); + + // The long string should be filtered out by max_length + assert!(strings.iter().any(|s| s.text == "Short")); + assert!(strings.iter().any(|s| s.text == "EndTest")); + // The 5000-character string should not be present + assert!(!strings.iter().any(|s| s.text.len() > 4096)); +} + +#[test] +fn test_basic_extractor_with_elf_fixture() { + let fixture_path = get_fixture_path("test_binary_elf"); + let elf_data = fs::read(&fixture_path) + .expect("Failed to read ELF fixture. Run the build script to generate fixtures."); + + // Parse with ElfParser to get ContainerInfo + let parser = ElfParser::new(); + let container_info = parser.parse(&elf_data).expect("Failed to parse ELF"); + + // Use BasicExtractor with config that excludes symbols to focus on section data + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + include_symbols: false, + ..Default::default() + }; + let strings = extractor + .extract(&elf_data, &container_info, &config) + .expect("Failed to extract strings"); + + // Verify strings are found + assert!( + !strings.is_empty(), + "Should find some strings in ELF binary" + ); + + // Verify strings are from appropriate sections + for string in &strings { + assert_eq!(string.source, StringSource::SectionData); + assert!(string.section.is_some()); + assert!(string.length > 0); + + // Verify encoding is ASCII or UTF-8 + assert!( + matches!(string.encoding, Encoding::Ascii | Encoding::Utf8), + "Encoding should be ASCII or UTF-8" + ); + + // Verify RVA is calculated if section has RVA + if let Some(section_name) = &string.section + && let Some(section) = container_info + .sections + .iter() + .find(|s| s.name == *section_name) + && section.rva.is_some() + { + assert!( + string.rva.is_some(), + "RVA should be calculated if section has RVA" + ); + } + } + + // Check that we found strings in common string sections + let section_names: Vec<&str> = strings + .iter() + .filter_map(|s| s.section.as_deref()) + .collect(); + println!("Found strings in sections: {:?}", section_names); +} + +#[test] +fn test_basic_extractor_with_pe_fixture() { + let fixture_path = get_fixture_path("test_binary_pe.exe"); + let pe_data = fs::read(&fixture_path) + .expect("Failed to read PE fixture. Run the build script to generate fixtures."); + + // Parse with PeParser to get ContainerInfo + let parser = PeParser::new(); + let container_info = parser.parse(&pe_data).expect("Failed to parse PE"); + + // Extract strings using BasicExtractor with config that excludes symbols + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + include_symbols: false, + ..Default::default() + }; + let strings = extractor + .extract(&pe_data, &container_info, &config) + .expect("Failed to extract strings"); + + // Verify strings are found + assert!(!strings.is_empty(), "Should find some strings in PE binary"); + + // Verify all FoundString fields are properly populated + for string in &strings { + assert!(!string.text.is_empty()); + assert_eq!(string.source, StringSource::SectionData); + assert!(string.section.is_some()); + assert!(string.length > 0); + assert!(matches!(string.encoding, Encoding::Ascii | Encoding::Utf8)); + + // Verify offset is within data bounds + assert!( + string.offset < pe_data.len() as u64, + "Offset should be within data bounds" + ); + } + + // Check for strings in common PE sections + let has_rdata = strings.iter().any(|s| { + s.section + .as_ref() + .map(|name| name.contains(".rdata") || name.contains(".data")) + .unwrap_or(false) + }); + println!("Found strings in .rdata/.data sections: {}", has_rdata); +} + +#[test] +fn test_basic_extractor_section_filtering() { + let fixture_path = get_fixture_path("test_binary_elf"); + let elf_data = fs::read(&fixture_path) + .expect("Failed to read ELF fixture. Run the build script to generate fixtures."); + + let parser = ElfParser::new(); + let container_info = parser.parse(&elf_data).expect("Failed to parse ELF"); + + // Create config that excludes code and debug sections + let config = ExtractionConfig { + scan_code_sections: false, + include_debug: false, + ..Default::default() + }; + + let extractor = BasicExtractor::new(); + let strings = extractor + .extract(&elf_data, &container_info, &config) + .expect("Failed to extract strings"); + + // Verify no strings from code or debug sections + for string in &strings { + if let Some(section_name) = &string.section + && let Some(section) = container_info + .sections + .iter() + .find(|s| s.name == *section_name) + { + assert_ne!( + section.section_type, + SectionType::Code, + "Should not extract from code sections" + ); + assert_ne!( + section.section_type, + SectionType::Debug, + "Should not extract from debug sections" + ); + } + } +} + +#[test] +fn test_basic_extractor_empty_data() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + let section = stringy::types::SectionInfo { + name: ".empty".to_string(), + offset: 0, + size: 0, + rva: None, + section_type: SectionType::Other, + is_executable: false, + is_writable: false, + weight: 0.0, + }; + + let data = b""; + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should return empty result, not panic + assert!(strings.is_empty()); +} + +#[test] +fn test_basic_extractor_boundary_conditions() { + let extractor = BasicExtractor::new(); + let config = ExtractionConfig::default(); + + // Test string at start of section + let data1 = b"Start\0middle\0end"; + let section1 = stringy::types::SectionInfo { + name: ".test1".to_string(), + offset: 0, + size: data1.len() as u64, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + let strings1 = extractor + .extract_from_section(data1, §ion1, &config) + .unwrap(); + assert!(strings1.iter().any(|s| s.text == "Start" && s.offset == 0)); + + // Test string at end of section + let data2 = b"prefix\0middle\0EndTest"; + let section2 = stringy::types::SectionInfo { + name: ".test2".to_string(), + offset: 0, + size: data2.len() as u64, + rva: Some(0x2000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + let strings2 = extractor + .extract_from_section(data2, §ion2, &config) + .unwrap(); + assert!(strings2.iter().any(|s| s.text == "EndTest")); + + // Test string spanning entire section + let data3 = b"FullSectionString"; + let section3 = stringy::types::SectionInfo { + name: ".test3".to_string(), + offset: 0, + size: data3.len() as u64, + rva: Some(0x3000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + let strings3 = extractor + .extract_from_section(data3, §ion3, &config) + .unwrap(); + assert_eq!(strings3.len(), 1); + assert_eq!(strings3[0].text, "FullSectionString"); + assert_eq!(strings3[0].offset, 0); + assert_eq!(strings3[0].rva, Some(0x3000)); +} + +#[test] +fn test_extraction_config_defaults() { + let config = ExtractionConfig::default(); + + // Verify all default values match specification + assert_eq!(config.min_length, 4); + assert_eq!(config.max_length, 4096); + assert_eq!(config.encodings.len(), 2); + assert!(config.encodings.contains(&Encoding::Ascii)); + assert!(config.encodings.contains(&Encoding::Utf8)); + assert!(config.scan_code_sections); + assert!(!config.include_debug); + assert_eq!(config.section_priority.len(), 3); + assert!(config.section_priority.contains(&SectionType::StringData)); + assert!(config.section_priority.contains(&SectionType::ReadOnlyData)); + assert!(config.section_priority.contains(&SectionType::Resources)); + assert!(config.include_symbols); +} + +#[test] +fn test_basic_extractor_encoding_filtering() { + let extractor = BasicExtractor::new(); + // Only allow ASCII, exclude UTF-8 + let config = ExtractionConfig { + encodings: vec![Encoding::Ascii], + enabled_encodings: vec![Encoding::Ascii], + ..Default::default() + }; + + let section = stringy::types::SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 30, + rva: None, + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = "Hello\0世界\0Test".as_bytes(); + let strings = extractor + .extract_from_section(data, §ion, &config) + .unwrap(); + + // Should only find ASCII strings, not UTF-8 + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[0].encoding, Encoding::Ascii); + assert_eq!(strings[1].text, "Test"); + assert_eq!(strings[1].encoding, Encoding::Ascii); + // UTF-8 string "世界" should be filtered out + assert!(!strings.iter().any(|s| s.text.contains("世界"))); +} + +#[test] +fn test_basic_extractor_include_symbols() { + let fixture_path = get_fixture_path("test_binary_elf"); + let elf_data = fs::read(&fixture_path) + .expect("Failed to read ELF fixture. Run the build script to generate fixtures."); + + let parser = ElfParser::new(); + let container_info = parser.parse(&elf_data).expect("Failed to parse ELF"); + + // Extract with symbols included + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + include_symbols: true, + ..Default::default() + }; + let strings = extractor + .extract(&elf_data, &container_info, &config) + .expect("Failed to extract strings"); + + // Should include import and export names + let import_strings: Vec<_> = strings + .iter() + .filter(|s| s.source == StringSource::ImportName) + .collect(); + let export_strings: Vec<_> = strings + .iter() + .filter(|s| s.source == StringSource::ExportName) + .collect(); + + // Verify we found some imports/exports + assert!(!import_strings.is_empty() || !export_strings.is_empty()); + + // Verify import string properties + for import_str in &import_strings { + assert_eq!(import_str.encoding, Encoding::Utf8); + assert_eq!(import_str.offset, 0); + assert_eq!(import_str.rva, None); + assert_eq!(import_str.section, None); + assert!(import_str.length > 0); + } + + // Verify export string properties + for export_str in &export_strings { + assert_eq!(export_str.encoding, Encoding::Utf8); + assert_eq!(export_str.offset, 0); + assert_eq!(export_str.rva, None); + assert_eq!(export_str.section, None); + assert!(export_str.length > 0); + } +} + +#[test] +fn test_basic_extractor_exclude_symbols() { + let fixture_path = get_fixture_path("test_binary_elf"); + let elf_data = fs::read(&fixture_path) + .expect("Failed to read ELF fixture. Run the build script to generate fixtures."); + + let parser = ElfParser::new(); + let container_info = parser.parse(&elf_data).expect("Failed to parse ELF"); + + // Extract with symbols excluded + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + include_symbols: false, + ..Default::default() + }; + let strings = extractor + .extract(&elf_data, &container_info, &config) + .expect("Failed to extract strings"); + + // Should not include import/export names + assert!(!strings.iter().any(|s| s.source == StringSource::ImportName)); + assert!(!strings.iter().any(|s| s.source == StringSource::ExportName)); +} diff --git a/tests/test_ascii_extraction.rs b/tests/test_ascii_extraction.rs new file mode 100644 index 0000000..8c6c3b2 --- /dev/null +++ b/tests/test_ascii_extraction.rs @@ -0,0 +1,232 @@ +//! Unit tests for ASCII string extraction + +use stringy::extraction::ascii::{ + AsciiExtractionConfig, extract_ascii_strings, extract_from_section, +}; +use stringy::types::{Encoding, SectionInfo, SectionType, StringSource}; + +#[test] +fn test_basic_extraction() { + let data = b"Hello\0World\0Test123"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[0].offset, 0); + assert_eq!(strings[0].encoding, Encoding::Ascii); + assert_eq!(strings[0].source, StringSource::SectionData); + assert_eq!(strings[0].confidence, 1.0); +} + +#[test] +fn test_minimum_length_threshold() { + let data = b"Hi\0Test\0AB\0LongString"; + let config = AsciiExtractionConfig::new(4); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 2); + assert_eq!(strings[0].text, "Test"); + assert_eq!(strings[1].text, "LongString"); +} + +#[test] +fn test_null_terminated_strings() { + let data = b"First\0Second\0Third"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].text, "First"); + assert_eq!(strings[1].text, "Second"); + assert_eq!(strings[2].text, "Third"); +} + +#[test] +fn test_mixed_printable_nonprintable() { + let data = b"Hello\x00World\x01Test"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 3); + assert_eq!(strings[0].text, "Hello"); + assert_eq!(strings[1].text, "World"); + assert_eq!(strings[2].text, "Test"); +} + +#[test] +fn test_empty_input() { + let data = b""; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert!(strings.is_empty()); +} + +#[test] +fn test_no_valid_strings() { + let data = &[0x00, 0xFF, 0x01, 0x02, 0x03]; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert!(strings.is_empty()); +} + +#[test] +fn test_string_at_section_boundary() { + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 7, + size: 12, + rva: Some(0x2000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = b"prefix\0Hello World\0suffix"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + assert!(!strings.is_empty()); + let hello_world = strings.iter().find(|s| s.text == "Hello World"); + assert!(hello_world.is_some()); + if let Some(s) = hello_world { + assert_eq!(s.offset, 7); + assert_eq!(s.rva, Some(0x2000)); + assert_eq!(s.section, Some(".rodata".to_string())); + } +} + +#[test] +fn test_very_long_string() { + let long_string = "A".repeat(500); + let data = format!("{}\0Short", long_string).into_bytes(); + let config = AsciiExtractionConfig { + max_length: Some(200), + ..Default::default() + }; + let strings = extract_ascii_strings(&data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Short"); +} + +#[test] +fn test_single_character_sequences() { + let data = b"A\0Test\0B\0C"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Test"); +} + +#[test] +fn test_different_section_types() { + let rodata_section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 20, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data_section = SectionInfo { + name: ".data".to_string(), + offset: 0, + size: 20, + rva: Some(0x2000), + section_type: SectionType::WritableData, + is_executable: false, + is_writable: true, + weight: 0.5, + }; + + let data = b"Hello World\0Test"; + let config = AsciiExtractionConfig::default(); + + let rodata_strings = extract_from_section(&rodata_section, data, &config, None, false, 0.5); + let data_strings = extract_from_section(&data_section, data, &config, None, false, 0.5); + + assert_eq!(rodata_strings.len(), 2); + assert_eq!(data_strings.len(), 2); + + for string in &rodata_strings { + assert_eq!(string.section, Some(".rodata".to_string())); + } + + for string in &data_strings { + assert_eq!(string.section, Some(".data".to_string())); + } +} + +#[test] +fn test_section_metadata_attachment() { + let section = SectionInfo { + name: ".custom".to_string(), + offset: 0, + size: 20, + rva: Some(0x3000), + section_type: SectionType::ReadOnlyData, + is_executable: false, + is_writable: false, + weight: 0.8, + }; + + let data = b"Test String\0Another"; + let config = AsciiExtractionConfig::default(); + let strings = extract_from_section(§ion, data, &config, None, false, 0.5); + + for string in &strings { + assert_eq!(string.section, Some(".custom".to_string())); + assert!(string.rva.is_some()); + assert!(string.rva.unwrap() >= 0x3000); + } +} + +#[test] +fn test_custom_minimum_length() { + let data = b"Test\0Hello\0AB"; + let config = AsciiExtractionConfig::new(5); + let strings = extract_ascii_strings(data, &config); + + assert_eq!(strings.len(), 1); + assert_eq!(strings[0].text, "Hello"); +} + +#[test] +fn test_noise_filtering_disabled() { + // This test verifies that extraction works even when noise filtering is conceptually disabled + // (by setting confidence to 1.0 for all extracted strings) + let data = b"Hello\0AAAA\0World"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(data, &config); + + // All strings should be extracted with confidence 1.0 + assert_eq!(strings.len(), 3); + for string in &strings { + assert_eq!(string.confidence, 1.0); + } +} + +#[test] +fn test_configuration_customization() { + let config = AsciiExtractionConfig { + min_length: 8, + max_length: Some(50), + }; + + let data = b"Short\0VeryLongStringHere\0MediumLength"; + let strings = extract_ascii_strings(data, &config); + + // "VeryLongStringHere" (18 chars) and "MediumLength" (12 chars) should pass (length >= 8 and <= 50) + // "Short" (5 chars) should be filtered out (length < 8) + assert_eq!(strings.len(), 2); + assert!(strings.iter().any(|s| s.text == "VeryLongStringHere")); + assert!(strings.iter().any(|s| s.text == "MediumLength")); +} diff --git a/tests/test_ascii_integration.rs b/tests/test_ascii_integration.rs new file mode 100644 index 0000000..c227d81 --- /dev/null +++ b/tests/test_ascii_integration.rs @@ -0,0 +1,430 @@ +//! Integration tests for ASCII extraction with noise filtering + +use insta::assert_snapshot; +use std::fs; +use stringy::container::{ContainerParser, PeParser}; +use stringy::extraction::ascii::{ + AsciiExtractionConfig, extract_ascii_strings, extract_from_section, +}; +use stringy::extraction::config::NoiseFilterConfig; +use stringy::extraction::filters::{CompositeNoiseFilter, FilterContext}; +use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor}; +use stringy::types::{BinaryFormat, ContainerInfo, SectionInfo, SectionType}; + +fn get_fixture_path(name: &str) -> std::path::PathBuf { + std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join(name) +} + +#[test] +#[ignore] // Requires test_binary_pe.exe fixture +fn test_ascii_extraction_from_binary() { + let fixture_path = get_fixture_path("test_binary_pe.exe"); + if !fixture_path.exists() { + return; + } + + let pe_data = fs::read(&fixture_path).expect("Failed to read PE fixture"); + let parser = PeParser::new(); + let container_info = parser.parse(&pe_data).expect("Failed to parse PE"); + + // Extract ASCII strings from each section + let config = AsciiExtractionConfig::default(); + let mut all_strings = Vec::new(); + + for section in &container_info.sections { + if section.size > 0 { + let section_data = &pe_data[section.offset as usize..] + .get(..section.size as usize) + .unwrap_or(&[]); + let strings = extract_ascii_strings(section_data, &config); + all_strings.extend(strings); + } + } + + // Verify that legitimate strings are extracted + assert!( + !all_strings.is_empty(), + "Should extract some strings from binary" + ); + + // Verify all strings have confidence set + for string in &all_strings { + assert!(string.confidence >= 0.0 && string.confidence <= 1.0); + } +} + +#[test] +fn test_false_positive_reduction() { + // Create test data with known noise patterns + let noise_data = b"AAAA\x00\x00\x00\x00!!!@@@###\0Hello World\0Test123"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(noise_data, &config); + + // Apply noise filtering + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + let context = FilterContext::default(); + + let mut filtered_strings = Vec::new(); + for string in &strings { + let confidence = filter.calculate_confidence(&string.text, &context); + if confidence >= 0.5 { + filtered_strings.push((string.text.clone(), confidence)); + } + } + + // Verify that noise is filtered out or marked with low confidence + let noise_strings: Vec<_> = strings + .iter() + .filter(|s| s.text == "AAAA" || s.text == "!!!@@@###") + .collect(); + + for noise_string in noise_strings { + let confidence = filter.calculate_confidence(&noise_string.text, &context); + assert!( + confidence < 0.5, + "Noise string '{}' should have low confidence: {}", + noise_string.text, + confidence + ); + } +} + +#[test] +fn test_true_positive_retention() { + // Create test data with known legitimate strings + let legitimate_data = + b"Hello World\0Error: file not found\0C:\\Windows\\System32\0https://example.com"; + let config = AsciiExtractionConfig::default(); + let strings = extract_ascii_strings(legitimate_data, &config); + + // Apply noise filtering + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + let context = FilterContext::default(); + + let mut retained_count = 0; + for string in &strings { + let confidence = filter.calculate_confidence(&string.text, &context); + if confidence >= 0.5 { + retained_count += 1; + } + } + + // Verify that legitimate strings are retained (target: >95%) + let retention_rate = retained_count as f32 / strings.len() as f32; + assert!( + retention_rate > 0.95, + "True positive retention rate should be >95%, got {}%", + retention_rate * 100.0 + ); +} + +#[test] +fn test_performance_overhead() { + // Measure extraction time with and without noise filtering + let test_data = b"Hello World\0Test String\0Another String\0".repeat(1000); + let config = AsciiExtractionConfig::default(); + + // Time extraction without filtering + let start = std::time::Instant::now(); + let strings = extract_ascii_strings(&test_data, &config); + let extraction_time = start.elapsed(); + + // Time filtering + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + let context = FilterContext::default(); + + let start = std::time::Instant::now(); + for string in &strings { + let _ = filter.calculate_confidence(&string.text, &context); + } + let filtering_time = start.elapsed(); + + // Verify that overhead is reasonable + // Note: In debug builds with small test data, filtering may appear slower + // The <10% overhead target is for optimized release builds with realistic data sizes + // For this test, we just verify that filtering completes in reasonable time + let total_time = extraction_time + filtering_time; + assert!( + total_time.as_secs_f64() < 1.0, + "Total extraction+filtering time should be <1s, got {:?} (extraction: {:?}, filtering: {:?})", + total_time, + extraction_time, + filtering_time + ); + + // In release mode, verify the <10% overhead target + #[cfg(not(debug_assertions))] + { + let overhead_ratio = if extraction_time.as_secs_f64() > 0.0 { + filtering_time.as_secs_f64() / extraction_time.as_secs_f64() + } else { + 0.0 + }; + assert!( + overhead_ratio < 0.1, + "Filtering overhead should be <10% of extraction time in release mode, got {}%", + overhead_ratio * 100.0 + ); + } +} + +#[test] +#[ignore] // Requires test_binary_pe.exe fixture +fn test_snapshot_extraction() { + let fixture_path = get_fixture_path("test_binary_pe.exe"); + if !fixture_path.exists() { + return; + } + + let pe_data = fs::read(&fixture_path).expect("Failed to read PE fixture"); + let parser = PeParser::new(); + let container_info = parser.parse(&pe_data).expect("Failed to parse PE"); + + let config = AsciiExtractionConfig::default(); + let mut all_strings = Vec::new(); + + for section in &container_info.sections { + if section.size > 0 && section.section_type == SectionType::StringData { + let section_data = &pe_data[section.offset as usize..] + .get(..section.size as usize) + .unwrap_or(&[]); + let strings = extract_ascii_strings(section_data, &config); + all_strings.extend(strings); + } + } + + // Create snapshot of extracted strings + let mut output = String::new(); + for string in &all_strings { + output.push_str(&format!( + "{}:{}:{}\n", + string.text, string.offset, string.confidence + )); + } + + assert_snapshot!("ascii_extraction_snapshot", output); +} + +#[test] +fn test_section_context_awareness() { + // Test that section context affects filtering + let high_weight_section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 20, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let low_weight_section = SectionInfo { + name: ".text".to_string(), + offset: 0, + size: 20, + rva: Some(0x2000), + section_type: SectionType::Code, + is_executable: true, + is_writable: false, + weight: 0.1, + }; + + let data = b"Hello World\0Test"; + let config = AsciiExtractionConfig::default(); + + let filter_config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&filter_config); + + let high_weight_context = FilterContext::from_section(&high_weight_section); + let low_weight_context = FilterContext::from_section(&low_weight_section); + + let strings = extract_ascii_strings(data, &config); + + for string in &strings { + let high_score = filter.calculate_confidence(&string.text, &high_weight_context); + let low_score = filter.calculate_confidence(&string.text, &low_weight_context); + + // Strings in high-weight sections should generally have higher confidence + assert!( + high_score >= low_score, + "High-weight section should have equal or higher confidence" + ); + } +} + +#[test] +fn test_full_extraction_path_with_filtering() { + // Test the full extraction path with filtering enabled using BasicExtractor + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 50, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + // Mix of legitimate strings and noise + let data = b"Hello World\0AAAA\0Error: file not found\0!!!@@@###\0Test123"; + + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + noise_filtering_enabled: true, + min_confidence_threshold: 0.5, + ..Default::default() + }; + + let container_info = ContainerInfo::new( + BinaryFormat::Elf, + vec![section.clone()], + vec![], + vec![], + None, + ); + + let strings = extractor.extract(data, &container_info, &config).unwrap(); + + // Verify that filtering is applied (confidence scores are computed) + assert!(!strings.is_empty(), "Should extract some strings"); + + // Verify all strings have confidence scores in valid range + for string in &strings { + assert!( + string.confidence >= 0.0 && string.confidence <= 1.0, + "String '{}' should have confidence in [0.0, 1.0], got {}", + string.text, + string.confidence + ); + } + + // Verify that strings with confidence >= threshold are retained + let retained_strings: Vec<_> = strings + .iter() + .filter(|s| s.confidence >= config.min_confidence_threshold) + .collect(); + + assert!( + !retained_strings.is_empty(), + "Should retain at least some strings with confidence >= threshold" + ); + + // Verify that legitimate strings are likely to be retained + let legitimate_strings: Vec<_> = strings + .iter() + .filter(|s| { + s.text == "Hello World" || s.text == "Error: file not found" || s.text == "Test123" + }) + .collect(); + + // At least some legitimate strings should be retained + let retained_legitimate: Vec<_> = legitimate_strings + .iter() + .filter(|s| s.confidence >= config.min_confidence_threshold) + .collect(); + + assert!( + !retained_legitimate.is_empty(), + "At least one legitimate string should be retained, found {}", + retained_legitimate.len() + ); +} + +#[test] +fn test_extraction_with_filtering_disabled() { + // Test that filtering can be disabled + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 30, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = b"Hello World\0AAAA\0Test123"; + + let extractor = BasicExtractor::new(); + let config = ExtractionConfig { + noise_filtering_enabled: false, + ..Default::default() + }; + + let container_info = ContainerInfo::new(BinaryFormat::Elf, vec![section], vec![], vec![], None); + + let strings = extractor.extract(data, &container_info, &config).unwrap(); + + // When filtering is disabled, all strings should be included + assert!( + strings.len() >= 3, + "All strings should be included when filtering is disabled, found {}", + strings.len() + ); + + // All strings should have confidence 1.0 when filtering is disabled + for string in &strings { + assert_eq!( + string.confidence, 1.0, + "String '{}' should have confidence 1.0 when filtering is disabled, got {}", + string.text, string.confidence + ); + } +} + +#[test] +fn test_extract_from_section_with_filtering() { + // Test extract_from_section with filtering enabled + let section = SectionInfo { + name: ".rodata".to_string(), + offset: 0, + size: 40, + rva: Some(0x1000), + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + weight: 1.0, + }; + + let data = b"Hello World\0AAAA\0Test123"; + let config = AsciiExtractionConfig::default(); + let noise_config = Some(NoiseFilterConfig::default()); + + let strings = extract_from_section( + §ion, + data, + &config, + noise_config.as_ref(), + true, // filtering enabled + 0.5, // threshold + ); + + // Verify noise is filtered + let has_noise = strings.iter().any(|s| s.text == "AAAA"); + assert!(!has_noise, "Noise string 'AAAA' should be filtered out"); + + // Verify legitimate strings are retained + let has_legitimate = strings + .iter() + .any(|s| s.text == "Hello World" || s.text == "Test123"); + assert!(has_legitimate, "Legitimate strings should be retained"); + + // Verify confidence scores are set + for string in &strings { + assert!( + string.confidence >= 0.5, + "String '{}' should have confidence >= 0.5, got {}", + string.text, + string.confidence + ); + } +} diff --git a/tests/test_noise_filters.rs b/tests/test_noise_filters.rs new file mode 100644 index 0000000..b829659 --- /dev/null +++ b/tests/test_noise_filters.rs @@ -0,0 +1,348 @@ +//! Unit tests for noise filtering heuristics + +use stringy::extraction::config::{FilterWeights, NoiseFilterConfig}; +use stringy::extraction::filters::{ + CharDistributionFilter, CompositeNoiseFilter, ContextFilter, EntropyFilter, FilterContext, + LengthFilter, LinguisticFilter, NoiseFilter, RepetitionFilter, +}; +use stringy::types::SectionType; + +#[test] +fn test_char_distribution_filter_all_punctuation() { + let filter = CharDistributionFilter; + let context = FilterContext::default(); + + let score = filter.calculate_confidence("!!!@@@###$$$", &context); + assert!(score < 0.5, "All punctuation should have low confidence"); +} + +#[test] +fn test_char_distribution_filter_repeated_character() { + let filter = CharDistributionFilter; + let context = FilterContext::default(); + + let score = filter.calculate_confidence("AAAA", &context); + assert!(score < 0.5, "Repeated character should have low confidence"); +} + +#[test] +fn test_char_distribution_filter_normal_text() { + let filter = CharDistributionFilter; + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello, World!", &context); + assert!(score > 0.7, "Normal text should have high confidence"); +} + +#[test] +fn test_char_distribution_filter_mixed_alphanumeric() { + let filter = CharDistributionFilter; + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Test123", &context); + assert!( + score > 0.5, + "Mixed alphanumeric should have reasonable confidence" + ); +} + +#[test] +fn test_entropy_filter_low_entropy() { + let filter = EntropyFilter::new(1.5, 7.5); + let context = FilterContext::default(); + + // Low entropy (repetition) + let score = filter.calculate_confidence("AAAA", &context); + assert!(score < 0.5, "Low entropy should have low confidence"); +} + +#[test] +fn test_entropy_filter_high_entropy() { + let filter = EntropyFilter::new(1.5, 7.5); + let context = FilterContext::default(); + + // High entropy (random-like) + // Note: This string may not always have entropy > 7.5 due to repetition of patterns + // The test verifies that very high entropy strings get lower confidence than normal text + let random = "!@#$%^&*()_+-=[]{}|;':\",./<>?`~abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let random_confidence = filter.calculate_confidence(random, &context); + // High entropy strings should have lower confidence than normal text + let normal_confidence = filter.calculate_confidence("Hello, World!", &context); + assert!( + random_confidence < normal_confidence, + "High entropy string should have lower confidence than normal text (random: {}, normal: {})", + random_confidence, + normal_confidence + ); +} + +#[test] +fn test_entropy_filter_normal_text() { + let filter = EntropyFilter::new(1.5, 7.5); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello, World!", &context); + assert!(score > 0.5, "Normal text should have reasonable confidence"); +} + +#[test] +fn test_entropy_filter_edge_cases() { + let filter = EntropyFilter::new(1.5, 7.5); + let context = FilterContext::default(); + + // Test at threshold boundaries + let score1 = filter.calculate_confidence("\x00\x00\x00\x00", &context); + assert!(score1 < 0.5); + + let score2 = filter.calculate_confidence("Error: file not found", &context); + assert!(score2 > 0.5); +} + +#[test] +fn test_linguistic_filter_english_like() { + let filter = LinguisticFilter::new(0.1, 0.9); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello world", &context); + assert!(score > 0.7, "English-like text should have high confidence"); +} + +#[test] +fn test_linguistic_filter_consonant_heavy() { + let filter = LinguisticFilter::new(0.1, 0.9); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("bcdfghjklmnpqrstvwxyz", &context); + assert!(score < 0.7, "Consonant-heavy should have lower confidence"); +} + +#[test] +fn test_linguistic_filter_vowel_heavy() { + let filter = LinguisticFilter::new(0.1, 0.9); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("aeiouaeiou", &context); + assert!(score < 0.7, "Vowel-heavy should have lower confidence"); +} + +#[test] +fn test_linguistic_filter_with_numbers() { + let filter = LinguisticFilter::new(0.1, 0.9); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Error 404", &context); + assert!( + score > 0.5, + "Text with numbers should have reasonable confidence" + ); +} + +#[test] +fn test_length_filter_very_short() { + let filter = LengthFilter::new(200); + let context = FilterContext { + section_weight: 0.3, + ..Default::default() + }; + + let score = filter.calculate_confidence("Hi", &context); + assert!( + score < 0.7, + "Very short in low-weight section should have lower confidence" + ); +} + +#[test] +fn test_length_filter_normal_length() { + let filter = LengthFilter::new(200); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello", &context); + assert!(score > 0.7, "Normal length should have high confidence"); +} + +#[test] +fn test_length_filter_very_long() { + let filter = LengthFilter::new(200); + let context = FilterContext::default(); + + let long_string = "A".repeat(300); + let score = filter.calculate_confidence(&long_string, &context); + assert!(score < 0.5, "Very long string should have low confidence"); +} + +#[test] +fn test_repetition_filter_repeated_characters() { + let filter = RepetitionFilter::new(0.7); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("AAAA", &context); + assert!( + score < 0.5, + "Repeated characters should have low confidence" + ); +} + +#[test] +fn test_repetition_filter_repeated_pattern() { + let filter = RepetitionFilter::new(0.7); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("abcabcabc", &context); + assert!(score < 0.5, "Repeated pattern should have low confidence"); +} + +#[test] +fn test_repetition_filter_normal_string() { + let filter = RepetitionFilter::new(0.7); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello", &context); + assert!(score > 0.7, "Normal string should have high confidence"); +} + +#[test] +fn test_repetition_filter_some_repetition() { + let filter = RepetitionFilter::new(0.7); + let context = FilterContext::default(); + + // "Mississippi" has some repetition but is legitimate + let score = filter.calculate_confidence("Mississippi", &context); + assert!( + score > 0.5, + "Some repetition in legitimate text should be acceptable" + ); +} + +#[test] +fn test_context_filter_string_data_section() { + let filter = ContextFilter; + let context = FilterContext { + section_type: SectionType::StringData, + is_executable: false, + is_writable: false, + ..Default::default() + }; + + let score = filter.calculate_confidence("test", &context); + assert!( + score > 0.8, + "String data section should have high confidence" + ); +} + +#[test] +fn test_context_filter_code_section() { + let filter = ContextFilter; + let context = FilterContext { + section_type: SectionType::Code, + section_weight: 0.1, + ..Default::default() + }; + + let score = filter.calculate_confidence("test", &context); + assert!(score < 0.5, "Code section should have lower confidence"); +} + +#[test] +fn test_context_filter_resources_section() { + let filter = ContextFilter; + let context = FilterContext { + section_type: SectionType::Resources, + ..Default::default() + }; + + let score = filter.calculate_confidence("test", &context); + assert_eq!( + score, 1.0, + "Resources section should have maximum confidence" + ); +} + +#[test] +fn test_composite_filter_legitimate_string() { + let config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&config); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello, World!", &context); + assert!( + score > 0.5, + "Legitimate string should have reasonable confidence" + ); +} + +#[test] +fn test_composite_filter_noise() { + let config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&config); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("AAAA", &context); + assert!(score < 0.5, "Noise should have low confidence"); +} + +#[test] +fn test_composite_filter_custom_weights() { + let config = NoiseFilterConfig { + filter_weights: FilterWeights { + entropy_weight: 0.5, + char_distribution_weight: 0.3, + linguistic_weight: 0.1, + length_weight: 0.05, + repetition_weight: 0.03, + context_weight: 0.02, + }, + ..Default::default() + }; + + let filter = CompositeNoiseFilter::new(&config); + let context = FilterContext::default(); + + let score = filter.calculate_confidence("Hello, World!", &context); + assert!(score > 0.0, "Should produce a valid score"); +} + +#[test] +fn test_composite_filter_enable_disable() { + let config = NoiseFilterConfig::default(); + let mut filter = CompositeNoiseFilter::new(&config); + filter.enable_entropy = false; + filter.enable_linguistic = false; + + let context = FilterContext::default(); + let score = filter.calculate_confidence("Hello", &context); + assert!(score > 0.0, "Should work with some filters disabled"); +} + +#[test] +fn test_real_world_scenarios() { + let config = NoiseFilterConfig::default(); + let filter = CompositeNoiseFilter::new(&config); + let context = FilterContext::default(); + + // Legitimate strings + let legitimate = [ + "Error: file not found", + "Hello, World!", + "C:\\Windows\\System32", + "https://example.com", + ]; + + for text in &legitimate { + let score = filter.calculate_confidence(text, &context); + assert!( + score > 0.5, + "Legitimate string '{}' should have reasonable confidence", + text + ); + } + + // Obvious noise + let noise = ["\x00\x00\x00\x00", "AAAA", "!!!@@@###", "00000000"]; + + for text in &noise { + let score = filter.calculate_confidence(text, &context); + assert!(score < 0.5, "Noise '{}' should have low confidence", text); + } +}