diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index cd02da9..4daf7b8 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -1,27 +1,27 @@ name: "Audit Dependencies" on: - push: - paths: - # Run if workflow changes - - ".github/workflows/audit.yml" - # Run on changed dependencies - - "**/Cargo.toml" - - "**/Cargo.lock" - # Run if the configuration file changes - - "**/audit.toml" - # Rerun periodically to pick up new advisories - schedule: - - cron: "0 0 * * *" - # Run manually - workflow_dispatch: + push: + paths: + # Run if workflow changes + - ".github/workflows/audit.yml" + # Run on changed dependencies + - "**/Cargo.toml" + - "**/Cargo.lock" + # Run if the configuration file changes + - "**/audit.toml" + # Rerun periodically to pick up new advisories + schedule: + - cron: "0 0 * * *" + # Run manually + workflow_dispatch: jobs: - audit: - runs-on: ubuntu-latest - permissions: - contents: read - issues: write - steps: - - uses: actions/checkout@v5 - - uses: actions-rust-lang/audit@v1 - name: Audit Rust Dependencies + audit: + runs-on: ubuntu-latest + permissions: + contents: read + issues: write + steps: + - uses: actions/checkout@v5 + - uses: actions-rust-lang/audit@v1 + name: Audit Rust Dependencies diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e4a94a3..a24c615 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,164 +1,163 @@ name: CI on: - push: - branches: [main] - pull_request: - branches: [main] - workflow_dispatch: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: defaults: - run: - shell: bash + run: + shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true env: - CARGO_TERM_COLOR: always - CI: true - GITHUB_ACTIONS: true + CARGO_TERM_COLOR: always + CI: true + GITHUB_ACTIONS: true jobs: - # Detect if Rust 
code has changed - changes: - runs-on: ubuntu-latest - outputs: - rust: ${{ steps.filter.outputs.rust }} - docs: ${{ steps.filter.outputs.docs }} - steps: - - uses: actions/checkout@v5 - - uses: dorny/paths-filter@v3 - id: filter - with: - filters: | - rust: - - '**/*.rs' - - '**/Cargo.toml' - - '**/Cargo.lock' - - '**/build.rs' - - 'justfile' - - 'rust-toolchain.toml' - - 'deny.toml' - docs: - - 'docs/**' - - '*.md' - - '.kiro/**' - - 'spec/**' - - # Code quality checks - always run - quality: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - uses: dtolnay/rust-toolchain@1.90 - with: - components: rustfmt, clippy - - - name: Install just - uses: extractions/setup-just@v3 - - - name: Cache Rust dependencies - uses: Swatinem/rust-cache@v2 - - - name: Rustfmt Check - uses: actions-rust-lang/rustfmt@v1 - - - name: Run clippy (all features) - run: cargo clippy --all-targets --all-features -- -D warnings - - test: - runs-on: ubuntu-latest - needs: changes - if: needs.changes.outputs.rust == 'true' - steps: - - uses: actions/checkout@v5 - - - name: Setup Rust - uses: dtolnay/rust-toolchain@1.90 - with: - components: rustfmt, clippy - - - name: Install cargo-nextest - uses: taiki-e/install-action@v2 - with: - tool: cargo-nextest - - - name: Run tests (all features) - run: cargo nextest run --all-features - - - name: Build release - run: cargo build --release --all-features - - # Test cross-platform - only run when Rust code changes - test-cross-platform: - strategy: - matrix: - include: - # Primary Support - Linux - - os: ubuntu-latest - platform: "Linux" - - os: arm - platform: "Linux" - # Primary Support - macOS (using available runners) - - os: macos-latest - platform: "macOS" - # Primary Support - Windows - - os: windows-latest - platform: "Windows" - - runs-on: ${{ matrix.os }} - needs: changes - if: needs.changes.outputs.rust == 'true' - steps: - - uses: actions/checkout@v5 - - - name: Setup Rust - uses: dtolnay/rust-toolchain@1.90 - - - name: 
Install cargo-nextest - uses: taiki-e/install-action@v2 - with: - tool: cargo-nextest - - # Run tests and build the release binary - - run: cargo nextest run --all-features - - run: cargo build --release --all-features - - # Generate coverage for TLS-enabled builds - only run when Rust code changes - coverage: - runs-on: ubuntu-latest - needs: [changes, test, test-cross-platform] - if: needs.changes.outputs.rust == 'true' - steps: - - uses: actions/checkout@v5 - - - name: Setup Rust - uses: dtolnay/rust-toolchain@1.90 - with: - components: llvm-tools - - - - name: Install cargo-llvm-cov - uses: taiki-e/install-action@v2 - with: - tool: cargo-llvm-cov - - - name: Generate coverage - run: cargo llvm-cov --all-features --no-report - - - name: Combine coverage reports - run: cargo llvm-cov report --lcov --output-path lcov.info - - - name: Upload to Codecov - uses: codecov/codecov-action@v5 - with: - files: lcov.info - fail_ci_if_error: false - token: ${{ secrets.CODECOV_TOKEN }} - slug: EvilBit-Labs/StringyMcStringFace - - uses: qltysh/qlty-action/coverage@v2 - with: - token: ${{ secrets.QLTY_COVERAGE_TOKEN }} - files: target/lcov.info + # Detect if Rust code has changed + changes: + runs-on: ubuntu-latest + outputs: + rust: ${{ steps.filter.outputs.rust }} + docs: ${{ steps.filter.outputs.docs }} + steps: + - uses: actions/checkout@v5 + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + rust: + - '**/*.rs' + - '**/Cargo.toml' + - '**/Cargo.lock' + - '**/build.rs' + - 'justfile' + - 'rust-toolchain.toml' + - 'deny.toml' + docs: + - 'docs/**' + - '*.md' + - '.kiro/**' + - 'spec/**' + + # Code quality checks - always run + quality: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: dtolnay/rust-toolchain@1.90 + with: + components: rustfmt, clippy + + - name: Install just + uses: extractions/setup-just@v3 + + - name: Cache Rust dependencies + uses: Swatinem/rust-cache@v2 + + - name: Rustfmt Check + uses: actions-rust-lang/rustfmt@v1 + 
+ - name: Run clippy (all features) + run: cargo clippy --all-targets --all-features -- -D warnings + + test: + runs-on: ubuntu-latest + needs: changes + if: needs.changes.outputs.rust == 'true' + steps: + - uses: actions/checkout@v5 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@1.90 + with: + components: rustfmt, clippy + + - name: Install cargo-nextest + uses: taiki-e/install-action@v2 + with: + tool: cargo-nextest + + - name: Run tests (all features) + run: cargo nextest run --all-features + + - name: Build release + run: cargo build --release --all-features + + # Test cross-platform - only run when Rust code changes + test-cross-platform: + strategy: + matrix: + include: + # Primary Support - Linux + - os: ubuntu-latest + platform: "Linux" + - os: arm + platform: "Linux" + # Primary Support - macOS (using available runners) + - os: macos-latest + platform: "macOS" + # Primary Support - Windows + - os: windows-latest + platform: "Windows" + + runs-on: ${{ matrix.os }} + needs: changes + if: needs.changes.outputs.rust == 'true' + steps: + - uses: actions/checkout@v5 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@1.90 + + - name: Install cargo-nextest + uses: taiki-e/install-action@v2 + with: + tool: cargo-nextest + + # Run tests and build the release binary + - run: cargo nextest run --all-features + - run: cargo build --release --all-features + + # Generate coverage for TLS-enabled builds - only run when Rust code changes + coverage: + runs-on: ubuntu-latest + needs: [changes, test, test-cross-platform] + if: needs.changes.outputs.rust == 'true' + steps: + - uses: actions/checkout@v5 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@1.90 + with: + components: llvm-tools + + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@v2 + with: + tool: cargo-llvm-cov + + - name: Generate coverage + run: cargo llvm-cov --all-features --no-report + + - name: Combine coverage reports + run: cargo llvm-cov report --lcov --output-path lcov.info + 
+ - name: Upload to Codecov + uses: codecov/codecov-action@v5 + with: + files: lcov.info + fail_ci_if_error: false + token: ${{ secrets.CODECOV_TOKEN }} + slug: EvilBit-Labs/StringyMcStringFace + - uses: qltysh/qlty-action/coverage@v2 + with: + token: ${{ secrets.QLTY_COVERAGE_TOKEN }} + files: target/lcov.info diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 5693fb1..325f5b5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,33 +1,33 @@ name: CodeQL on: - push: - branches: [main] - pull_request: - branches: [main] - schedule: - - cron: "43 22 * * 1" - workflow_dispatch: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + - cron: "43 22 * * 1" + workflow_dispatch: permissions: - contents: read - actions: read - security-events: write + contents: read + actions: read + security-events: write jobs: - analyze: - name: CodeQL Analyze - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v5 + analyze: + name: CodeQL Analyze + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v5 - - name: Setup Rust - uses: dtolnay/rust-toolchain@1.90 + - name: Setup Rust + uses: dtolnay/rust-toolchain@1.90 - - uses: github/codeql-action/init@v3 - with: - languages: rust + - uses: github/codeql-action/init@v3 + with: + languages: rust - - uses: github/codeql-action/autobuild@v3 + - uses: github/codeql-action/autobuild@v3 - - uses: github/codeql-action/analyze@v3 + - uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml index 6b96fc1..cd12f15 100644 --- a/.github/workflows/copilot-setup-steps.yml +++ b/.github/workflows/copilot-setup-steps.yml @@ -3,70 +3,70 @@ name: "Copilot Setup Steps" # Automatically run the setup steps when they are changed to allow for easy validation, and # allow manual testing through the repository's "Actions" tab on: - workflow_dispatch: - push: - paths: - - 
.github/workflows/copilot-setup-steps.yml - pull_request: - paths: - - .github/workflows/copilot-setup-steps.yml + workflow_dispatch: + push: + paths: + - .github/workflows/copilot-setup-steps.yml + pull_request: + paths: + - .github/workflows/copilot-setup-steps.yml jobs: - # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot. - copilot-setup-steps: - runs-on: ubuntu-latest + # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot. + copilot-setup-steps: + runs-on: ubuntu-latest - # Set the permissions to the lowest permissions possible needed for your steps. - # Copilot will be given its own token for its operations. - permissions: - # If you want to clone the repository as part of your setup steps, for example to - # install dependencies, you'll need the `contents: read` permission. If you don't - # clone the repository in your setup steps, Copilot will do this for you - # automatically after the steps complete. - contents: read + # Set the permissions to the lowest permissions possible needed for your steps. + # Copilot will be given its own token for its operations. + permissions: + # If you want to clone the repository as part of your setup steps, for example to + # install dependencies, you'll need the `contents: read` permission. If you don't + # clone the repository in your setup steps, Copilot will do this for you + # automatically after the steps complete. + contents: read - # You can define any steps you want, and they will run before the agent starts. - # If you do not check out your code, Copilot will do this for you. - steps: - - name: Checkout code - uses: actions/checkout@v5 + # You can define any steps you want, and they will run before the agent starts. + # If you do not check out your code, Copilot will do this for you. 
+ steps: + - name: Checkout code + uses: actions/checkout@v5 - - uses: dtolnay/rust-toolchain@1.90 + - uses: dtolnay/rust-toolchain@1.90 - - name: Install just task runner - uses: taiki-e/install-action@v2 - with: - tool: just + - name: Install just task runner + uses: taiki-e/install-action@v2 + with: + tool: just - - name: Set up Python for pre-commit - uses: actions/setup-python@v6 - with: - python-version: "3.13" + - name: Set up Python for pre-commit + uses: actions/setup-python@v6 + with: + python-version: "3.13" - - name: Install cargo tools - uses: taiki-e/install-action@v2 - with: - tool: cargo-nextest,cargo-llvm-cov,cargo-audit,cargo-deny,cargo-dist,mdbook + - name: Install cargo tools + uses: taiki-e/install-action@v2 + with: + tool: cargo-nextest,cargo-llvm-cov,cargo-audit,cargo-deny,cargo-dist,mdbook - - name: Install mdbook plugins - uses: taiki-e/install-action@v2 - with: - tool: mdbook-admonish,mdbook-mermaid,mdbook-linkcheck,mdbook-toc,mdbook-open-on-gh,mdbook-tabs,mdbook-i18n-helpers + - name: Install mdbook plugins + uses: taiki-e/install-action@v2 + with: + tool: mdbook-admonish,mdbook-mermaid,mdbook-linkcheck,mdbook-toc,mdbook-open-on-gh,mdbook-tabs,mdbook-i18n-helpers - - name: Run just install - run: | - just install-tools + - name: Run just install + run: | + just install-tools - - name: Setup summary - run: | - echo "✅ StringyMcStringFace development environment setup complete!" - echo "" - echo "Available tools:" - echo " - Rust toolchain: $(rustc --version)" - echo " - Cargo: $(cargo --version)" - echo " - just: $(just --version)" - echo " - cargo-nextest: $(cargo nextest --version)" - echo " - cargo-llvm-cov: $(cargo llvm-cov --version)" - echo " - cargo-audit: $(cargo audit --version)" - echo " - cargo-deny: $(cargo deny --version)" - echo " - cargo-dist: $(cargo dist --version)" + - name: Setup summary + run: | + echo "✅ StringyMcStringFace development environment setup complete!" 
+ echo "" + echo "Available tools:" + echo " - Rust toolchain: $(rustc --version)" + echo " - Cargo: $(cargo --version)" + echo " - just: $(just --version)" + echo " - cargo-nextest: $(cargo nextest --version)" + echo " - cargo-llvm-cov: $(cargo llvm-cov --version)" + echo " - cargo-audit: $(cargo audit --version)" + echo " - cargo-deny: $(cargo deny --version)" + echo " - cargo-dist: $(cargo dist --version)" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ac8d01d..a258290 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,81 +1,81 @@ name: Deploy Documentation on: - push: - branches: [main] - pull_request: - branches: [main] - workflow_dispatch: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: permissions: - contents: read - pages: write - id-token: write + contents: read + pages: write + id-token: write defaults: - run: - shell: bash + run: + shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: false + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v5 + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v5 - - name: Setup Rust - uses: dtolnay/rust-toolchain@1.90 - with: - components: rustfmt, clippy + - name: Setup Rust + uses: dtolnay/rust-toolchain@1.90 + with: + components: rustfmt, clippy - - name: Setup mdBook - uses: jontze/action-mdbook@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - mdbook-version: latest - use-mermaid: true - use-toc: true - use-admonish: true + - name: Setup mdBook + uses: jontze/action-mdbook@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + mdbook-version: latest + use-mermaid: true + use-toc: true + use-admonish: true - - name: Install cargo-binstall - uses: cargo-bins/cargo-binstall@main + - name: Install cargo-binstall + uses: 
cargo-bins/cargo-binstall@main - - name: Install mdbook plugins - run: cargo binstall mdbook-tabs mdbook-i18n-helpers mdbook-alerts mdbook-yml-header mdbook-image-size --no-confirm + - name: Install mdbook plugins + run: cargo binstall mdbook-tabs mdbook-i18n-helpers mdbook-alerts mdbook-yml-header mdbook-image-size --no-confirm - - name: Build rustdoc - run: | - cargo doc --no-deps --document-private-items --target-dir target - mkdir -p docs/book/api - cp -r target/doc/* docs/book/api/ + - name: Build rustdoc + run: | + cargo doc --no-deps --document-private-items --target-dir target + mkdir -p docs/book/api + cp -r target/doc/* docs/book/api/ - - name: Build mdBook - run: | - cd docs - mdbook build + - name: Build mdBook + run: | + cd docs + mdbook build - - name: Setup Pages - if: github.ref == 'refs/heads/main' - uses: actions/configure-pages@v5 - - - name: Upload artifact - if: github.ref == 'refs/heads/main' - uses: actions/upload-pages-artifact@v4 - with: - path: docs/book + - name: Setup Pages + if: github.ref == 'refs/heads/main' + uses: actions/configure-pages@v5 - deploy: + - name: Upload artifact if: github.ref == 'refs/heads/main' - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - needs: build - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 + uses: actions/upload-pages-artifact@v4 + with: + path: docs/book + + deploy: + if: github.ref == 'refs/heads/main' + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8021e31..b05b9c1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,9 +15,7 @@ name: Release permissions: - "attestations": "write" "contents": "write" - "id-token": "write" 
# This task will run whenever you push a git tag that looks like a version # like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc. @@ -66,7 +64,7 @@ jobs: # we specify bash to get pipefail; it guards against the `curl` command # failing. otherwise `sh` won't catch that `curl` returned non-0 shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.0/cargo-dist-installer.sh | sh" + run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh" - name: Cache dist uses: actions/upload-artifact@v4 with: @@ -114,6 +112,10 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json + permissions: + "attestations": "write" + "contents": "read" + "id-token": "write" steps: - name: enable windows longpaths run: | @@ -244,8 +246,8 @@ jobs: - plan - build-local-artifacts - build-global-artifacts - # Only run if we're "publishing", and only if local and global didn't fail (skipped is fine) - if: ${{ always() && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} + # Only run if we're "publishing", and only if plan, local and global didn't fail (skipped is fine) + if: ${{ always() && needs.plan.result == 'success' && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} runs-on: "ubuntu-22.04" diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index d6b9b64..2861320 100644 --- 
a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -1,42 +1,42 @@ name: Security on: - workflow_run: - workflows: [CI] - types: [completed] - schedule: - - cron: "0 6 * * *" - workflow_dispatch: + workflow_run: + workflows: [CI] + types: [completed] + schedule: + - cron: "0 6 * * *" + workflow_dispatch: permissions: - contents: read - security-events: write + contents: read + security-events: write defaults: - run: - shell: bash + run: + shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: - audit: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 + audit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 - - name: Setup Rust - uses: dtolnay/rust-toolchain@1.90 + - name: Setup Rust + uses: dtolnay/rust-toolchain@1.90 - - uses: taiki-e/install-action@v2 - with: - tool: cargo-outdated,cargo-dist + - uses: taiki-e/install-action@v2 + with: + tool: cargo-outdated,cargo-dist - - uses: EmbarkStudios/cargo-deny-action@v1 + - uses: EmbarkStudios/cargo-deny-action@v1 - - name: Run cargo outdated - run: cargo outdated --depth=1 --exit-code=1 + - name: Run cargo outdated + run: cargo outdated --depth=1 --exit-code=1 - - name: Run cargo dist check - run: cargo dist check + - name: Run cargo dist check + run: cargo dist check diff --git a/.kiro/specs/stringy-binary-analyzer/tasks.md b/.kiro/specs/stringy-binary-analyzer/tasks.md index e334b75..8b16737 100644 --- a/.kiro/specs/stringy-binary-analyzer/tasks.md +++ b/.kiro/specs/stringy-binary-analyzer/tasks.md @@ -1,68 +1,22 @@ # Implementation Plan -- [x] 1. Create basic project structure +- [x] 1. 
Create foundational project structure and data types - - Create Cargo.toml with essential dependencies (goblin, clap, serde, serde_json) + - Create complete project structure with Cargo.toml, essential dependencies (goblin, clap, serde, serde_json), and module hierarchy (src/container/, src/extraction/, src/classification/, src/output/) + - Define core data types in src/types.rs including FoundString struct, Encoding enum (Ascii, Utf8, Utf16Le, Utf16Be), Tag enum for semantic classification + - Define container and section types including SectionType and StringSource enums, ContainerInfo and SectionInfo structs + - Implement comprehensive error handling framework with StringyError enum and Result type alias + - _Requirements: 1.1, 1.4, 6.1, 9.1_ - - Create src/lib.rs and src/main.rs with basic module declarations +- [x] 2. Implement basic format detection and container parsers - - Create directory structure: src/container/, src/extraction/, src/classification/, src/output/ + - Create ContainerParser trait and implement format detection for ELF, PE, and Mach-O using goblin + - Build complete container parser stubs for all three formats (src/container/elf.rs, pe.rs, macho.rs) + - Implement basic section enumeration for each format with unit tests + - Add format detection capabilities to distinguish between binary types + - _Requirements: 1.1, 1.2, 1.3, 1.4_ - - _Requirements: 1.1, 9.1_ - - - [x] 1.1 Define core data types - - - Create src/types.rs with FoundString struct and basic serialization - - Define Encoding enum (Ascii, Utf8, Utf16Le, Utf16Be) - - Define Tag enum for semantic classification - - _Requirements: 1.1, 6.1_ - - - [x] 1.2 Define container and section types - - - Add SectionType enum (StringData, ReadOnlyData, WritableData, Code, Debug, Resources, Other) - - Add StringSource enum (SectionData, ImportName, ExportName, ResourceString, LoadCommand, DebugInfo) - - Add ContainerInfo and SectionInfo structs - - _Requirements: 1.1, 1.4_ - - - [x] 1.3 Create 
error handling framework - - - Define StringyError enum with common error types - - Create Result type alias for the project - - Add basic error conversion implementations - - _Requirements: 1.4_ - -- [x] 2. Implement basic format detection - - - Create ContainerParser trait in src/container/mod.rs - - - Implement basic format detection using goblin to identify ELF, PE, Mach-O - - - Add simple unit test for format detection - - - _Requirements: 1.1, 1.4_ - - - [x] 2.1 Create ELF container parser stub - - - Create src/container/elf.rs with basic ELF parser structure - - Implement ELF format detection and basic section enumeration - - Add unit test for ELF section identification - - _Requirements: 1.1_ - - - [x] 2.2 Create PE container parser stub - - - Create src/container/pe.rs with basic PE parser structure - - Implement PE format detection and basic section enumeration - - Add unit test for PE section identification - - _Requirements: 1.2_ - - - [x] 2.3 Create Mach-O container parser stub - - - Create src/container/macho.rs with basic Mach-O parser structure - - Implement Mach-O format detection and basic section enumeration - - Add unit test for Mach-O section identification - - _Requirements: 1.3_ - -- [ ] 3. Implement ELF section classification +- [x] 3. 
Implement ELF section classification - Enhance ELF parser to classify sections by type (string data vs code vs other) @@ -72,7 +26,7 @@ - _Requirements: 1.1, 1.4_ - - [ ] 3.1 Add ELF import/export extraction + - [x] 3.1 Add ELF import/export extraction - Extract import and export symbol names from ELF dynamic section - Classify symbols as imports vs exports for proper tagging diff --git a/.kiro/steering/development.md b/.kiro/steering/development.md new file mode 100644 index 0000000..86e4d84 --- /dev/null +++ b/.kiro/steering/development.md @@ -0,0 +1,119 @@ +--- +inclusion: always +--- + +# Development Standards & Preferences + +## Rust Code Quality Standards + +### Memory Safety & Performance + +- **Pure Rust**: No `unsafe` code except in vetted dependencies +- **Zero Warnings**: All code must pass `cargo clippy -- -D warnings` +- **Performance First**: Prefer zero-copy operations, efficient algorithms, and memory mapping for large data +- **RAII Patterns**: Leverage Rust's ownership system for resource management + +### Code Organization + +- **File Size**: Keep modules under 500 lines; split larger files into focused modules +- **Module Hierarchy**: Use clear module boundaries with `mod.rs` files for organization +- **Public APIs**: All public functions and types need comprehensive rustdoc with examples +- **Internal Documentation**: Document complex algorithms and business logic inline + +### Error Handling Philosophy + +- **Result Types**: Use `Result` patterns consistently throughout codebase +- **No Panics**: Avoid panics in library code; reserve for truly unrecoverable situations +- **Contextual Errors**: Provide descriptive error messages with sufficient context for debugging +- **Error Libraries**: Prefer `thiserror` for custom error types, `anyhow` for application errors + +## Development Workflow & Tooling + +### Preferred Build System + +- **Just**: Use justfile recipes for all development tasks instead of raw cargo commands +- **Cross-platform**: 
Ensure all recipes work on Linux, macOS, and Windows +- **Composable**: Break complex tasks into smaller, reusable recipes + +### Standard Development Commands + +```bash +# Development cycle +just check # Fast syntax/type checking and linting +just build # Build project +just test # Run all tests with nextest +just lint-rust # Linting with strict warnings +just fmt # Format code (Rust + markdown) + +# Quality assurance +just bench # Run benchmarks (use criterion) +just docs-build # Generate documentation (mdBook + rustdoc) +just coverage # Coverage measurement with llvm-cov (target: >85%) +just audit # Security audit with cargo-audit +just format-docs # Format markdown files with mdformat +``` + +### Testing Philosophy + +- **Test Coverage**: >85% coverage required for all changes +- **Test Types**: Unit tests (in-module), integration tests (in `tests/`), property tests with `proptest` +- **Performance Tests**: Benchmark critical paths with `criterion` +- **Documentation Tests**: Ensure all code examples in docs compile and run, mdformat checks pass on markdown files including embedded code blocks +- **Deterministic Testing**: Use `insta` for snapshot testing of CLI outputs + +### Code Formatting & Linting + +- **Rustfmt**: Use project-wide `rustfmt.toml` for consistent formatting +- **Clippy**: Enable all lints, treat warnings as errors in CI +- **Markdown Formatting**: Use `mdformat` with extensions for consistent markdown formatting +- **Pre-commit Hooks**: Run formatting and basic lints before commits +- **IDE Integration**: Configure rust-analyzer for real-time feedback + +## Documentation Standards + +### API Documentation + +- **Rustdoc**: Comprehensive documentation for all public APIs +- **Examples**: Include working code examples in doc comments +- **Error Cases**: Document when functions return errors and why +- **Safety**: Document any unsafe code or invariants clearly + +### Project Documentation + +- **mdBook**: Use mdBook for user-facing 
documentation and guides +- **User Guide Accuracy**: The `docs/src` user guide must accurately reflect exactly how the tool works right now, not aspirational features +- **Architecture Docs**: Maintain high-level architecture documentation +- **Decision Records**: Document significant technical decisions and trade-offs + +## Dependency Management + +### Dependency Selection Criteria + +- **Maintenance**: Prefer actively maintained crates with recent updates +- **Security**: Regular security audits, minimal dependency trees +- **Performance**: Choose performance-oriented crates for critical paths +- **Compatibility**: Ensure cross-platform support when needed + +### Preferred Crates + +- **Error Handling**: `thiserror` for libraries, `anyhow` for applications +- **CLI**: `clap` with derive macros for argument parsing +- **Serialization**: `serde` ecosystem for JSON/YAML/TOML +- **Testing**: `criterion` for benchmarks, `proptest` for property testing +- **Async**: `tokio` ecosystem when async is needed + +## Performance & Optimization + +### Performance Principles + +- **Measure First**: Profile before optimizing, use `cargo bench` and `perf` +- **Memory Efficiency**: Prefer stack allocation, use `Box`/`Arc` judiciously +- **Zero-Copy**: Minimize allocations in hot paths +- **Lazy Evaluation**: Defer expensive computations until needed + +### Profiling & Benchmarking + +- **Criterion**: Standard benchmarking with statistical analysis +- **Flamegraphs**: Use `cargo flamegraph` for performance profiling +- **Memory Profiling**: Use `valgrind` or `heaptrack` for memory analysis +- **Continuous Benchmarking**: Track performance regressions in CI diff --git a/.kiro/steering/tech.md b/.kiro/steering/tech.md index 562aad3..760fd7c 100644 --- a/.kiro/steering/tech.md +++ b/.kiro/steering/tech.md @@ -30,39 +30,16 @@ - `clap` - Command-line argument parsing - `serde` + `serde_json` - JSON serialization for output formats -## Testing & Build Tools +## Project-Specific Testing 
Tools -- **Rust** - Primary language for performance and memory safety -- **Cargo** - Build system for Rust projects -- **cargo-nextest** - Test runner for faster, more reliable test execution -- **llvm-cov** - for coverage measurement and reporting (target: >85%) -- **insta** - for deterministic CLI output validation -- **criterion** - Performance benchmarks for critical path components +- **insta** - for deterministic CLI output validation (binary analysis results) +- **criterion** - Performance benchmarks for string extraction and classification ### Cross-platform Support - **CI Matrix**: Linux, macOS, Windows with multiple Rust versions (stable, beta, MSRV) - **Architecture**: x86_64 and ARM64 support validation -## Build Commands - -```bash -# Development build -cargo build - -# Release build (optimized) -cargo build --release - -# Run tests -cargo test - -# Run with example -cargo run -- binary_file.exe --json - -# Install locally -cargo install --path . -``` - ## Development Phases - **MVP**: Basic goblin + section extraction + ASCII/UTF-16 + tagging + JSONL output @@ -70,8 +47,9 @@ cargo install --path . 
- **v0.3**: Relocation hints + basic disassembly references - **v0.4**: DWARF support + Mach-O load commands + Go build info -## Performance Considerations +## Project-Specific Performance Considerations -- Use memory mapping for large binaries -- Lazy evaluation for optional features (DWARF, disasm) -- Efficient regex compilation and caching +- **Memory Mapping**: Use `memmap2` for large binary files (>1MB) +- **Lazy Evaluation**: Defer expensive features (DWARF parsing, disassembly) until requested +- **Regex Caching**: Compile semantic classification patterns once at startup +- **Section Filtering**: Skip irrelevant binary sections (debug, relocation) during extraction diff --git a/.markdownlint.json b/.markdownlint.json index 3cac11e..469cf6f 100644 --- a/.markdownlint.json +++ b/.markdownlint.json @@ -1,36 +1,34 @@ { - "$schema": "https://raw.githubusercontent.com/DavidAnson/markdownlint/main/schema/markdownlint-config-schema.json", - "default": true, - "heading-increment": true, - "MD003": { - "style": "atx" - }, - "line-length": false, - "MD004": { - "style": "consistent" - }, - "MD007": { - "indent": 2 - }, - "MD013": false, - "MD024": { - "siblings_only": true - }, - "MD029": false, - "MD033": { - "allowed_elements": [ - "span" - ] - }, - "MD035": { - "style": "---" - }, - "MD046": { - "style": "fenced" - }, - "MD048": { - "style": "backtick" - }, - "first-line-h1": false, - "fenced-code-language": true + "$schema": "https://raw.githubusercontent.com/DavidAnson/markdownlint/main/schema/markdownlint-config-schema.json", + "default": true, + "heading-increment": true, + "MD003": { + "style": "atx" + }, + "line-length": false, + "MD004": { + "style": "consistent" + }, + "MD007": { + "indent": 2 + }, + "MD013": false, + "MD024": { + "siblings_only": true + }, + "MD029": false, + "MD033": { + "allowed_elements": ["span"] + }, + "MD035": { + "style": "---" + }, + "MD046": { + "style": "fenced" + }, + "MD048": { + "style": "backtick" + }, + "first-line-h1": 
false, + "fenced-code-language": true } diff --git a/Cargo.toml b/Cargo.toml index 6ce1e54..36f62e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ thiserror = "2.0.17" [dev-dependencies] criterion = "0.7.0" insta = "1.0" +tempfile = "3.8" # The profile that 'dist' will build with [profile.dist] diff --git a/README.md b/README.md index 8918fd8..5e70179 100644 --- a/README.md +++ b/README.md @@ -96,24 +96,24 @@ cargo run -- --help stringy target_binary ``` -### Planned CLI Interface +### Current CLI Interface -The following features are being implemented: +Basic functionality is implemented with the full interface in development: ```bash -# Basic analysis (coming soon) +# Current: Basic analysis stringy target_binary -# Focused extraction (planned) +# In Development: Advanced features stringy --only url,filepath target_binary stringy --min-len 8 --enc ascii,utf16 target_binary stringy --top 50 --json target_binary -# PE-specific features (planned) +# Planned: Format-specific features stringy --pe-version --pe-manifest target.exe stringy --utf16-only target.exe -# Pipeline integration (planned) +# Planned: Pipeline integration stringy --json target_binary | jq '.[] | select(.tags[] | contains("url"))' stringy --yara candidates.txt target_binary ``` @@ -167,23 +167,37 @@ Score Offset Section Tags String This project is in active development. 
Current implementation status: -- ✅ **Core Infrastructure**: Project structure, data types, error handling +- ✅ **Core Infrastructure**: Complete project structure, comprehensive data types, robust error handling - ✅ **Format Detection**: ELF, PE, Mach-O binary format detection via `goblin` -- ✅ **Container Parsers**: Section classification, import/export extraction -- 🚧 **String Extraction**: ASCII/UTF-8 and UTF-16 extraction engines -- 🚧 **Semantic Classification**: URL, domain, path, GUID pattern matching -- 🚧 **Ranking System**: Section-aware scoring and relevance calculation -- 🚧 **Output Formats**: JSONL, human-readable, and YARA-friendly output -- 🚧 **CLI Interface**: Command-line argument parsing and main pipeline +- ✅ **Container Parsers**: Full section classification with weight-based prioritization +- ✅ **Import/Export Extraction**: Symbol extraction from all supported formats +- ✅ **Section Analysis**: Smart classification of string-rich sections +- 🚧 **String Extraction**: ASCII/UTF-8 and UTF-16 extraction engines (framework ready) +- 🚧 **Semantic Classification**: URL, domain, path, GUID pattern matching (types defined) +- 🚧 **Ranking System**: Section-aware scoring algorithm (framework in place) +- 🚧 **Output Formats**: JSONL, human-readable, and YARA-friendly output (types ready) +- 🚧 **CLI Interface**: Basic argument parsing implemented, main pipeline in progress ### Current Capabilities -The foundation is solid with working binary format parsers that can: +The foundation is robust with fully implemented binary format parsers that can: -- Detect ELF, PE, and Mach-O formats -- Classify sections by string likelihood (`.rodata`, `.rdata`, `__cstring`, etc.) 
-- Extract import/export symbol names -- Handle cross-platform section characteristics +- **Format Detection**: Automatically detect ELF, PE, and Mach-O formats using `goblin` +- **Section Classification**: Intelligently classify sections by string likelihood with weighted scoring: + - ELF: `.rodata` (10.0), `.comment` (9.0), `.data.rel.ro` (7.0) + - PE: `.rdata` (10.0), `.rsrc` (9.0), read-only `.data` (7.0) + - Mach-O: `__TEXT,__cstring` (10.0), `__TEXT,__const` (9.0), `__DATA_CONST` (7.0) +- **Symbol Processing**: Extract and classify import/export names from symbol tables +- **Cross-Platform Support**: Handle platform-specific section characteristics and naming +- **Comprehensive Metadata**: Track section offsets, sizes, RVAs, and permissions + +### Architecture Highlights + +- **Trait-Based Design**: `ContainerParser` trait enables easy format extension +- **Type Safety**: Comprehensive error handling with `StringyError` enum +- **Performance Ready**: Section weighting system prioritizes high-value areas +- **Extensible Classification**: `Tag` enum supports semantic string categorization +- **Multiple Sources**: Handles strings from section data, imports, exports, and resources See the [implementation plan](.kiro/specs/stringy-binary-analyzer/tasks.md) for detailed progress tracking. 
diff --git a/cspell.config.yaml b/cspell.config.yaml index face390..01d762f 100644 --- a/cspell.config.yaml +++ b/cspell.config.yaml @@ -81,7 +81,7 @@ words: - evilbitlabs - UncleSp1d3r - unclesp1d3r - + # Rust ecosystem - rustc - rustup @@ -104,7 +104,7 @@ words: - megalinter - cspell - justfile - + # Build and CI tools - dist - axodotdev @@ -118,7 +118,7 @@ words: - musl - aarch - x86_64 - + # Binary analysis terms - rodata - rdata @@ -131,7 +131,7 @@ words: - YARA - GUID - GUIDs - + # Technical acronyms - CLI - JSON @@ -148,4 +148,4 @@ words: - API - URL - URLs - - IPs \ No newline at end of file + - IPs diff --git a/dist-workspace.toml b/dist-workspace.toml index d8bde69..aafdbfe 100644 --- a/dist-workspace.toml +++ b/dist-workspace.toml @@ -4,7 +4,7 @@ members = ["cargo:."] # Config for 'dist' [dist] # The preferred dist version to use in CI (Cargo.toml SemVer syntax) -cargo-dist-version = "0.30.0" +cargo-dist-version = "0.30.2" # CI backends to support ci = "github" # The installers to generate for each app diff --git a/docs/src/architecture.md b/docs/src/architecture.md index d32c928..14bfae3 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -10,98 +10,236 @@ Binary File → Format Detection → Container Parsing → String Extraction → ## Core Components -### 1. Container Module (`src/container/`) +### 1. Container Module (`src/container/`) ✅ **Implemented** -Handles binary format detection and parsing using the `goblin` crate. +Handles binary format detection and parsing using the `goblin` crate with comprehensive section analysis. 
-- **Format Detection**: Automatically identifies ELF, PE, and Mach-O formats -- **Section Classification**: Categorizes sections by string likelihood -- **Metadata Extraction**: Collects imports, exports, and structural information +- **Format Detection**: Automatically identifies ELF, PE, and Mach-O formats via `goblin::Object::parse()` +- **Section Classification**: Categorizes sections by string likelihood with weighted scoring +- **Metadata Extraction**: Collects imports, exports, and detailed structural information +- **Cross-Platform Support**: Handles platform-specific section characteristics and naming conventions #### Supported Formats -| Format | Parser | Key Sections | -| ------ | ------------- | ------------------------------------- | -| ELF | `ElfParser` | `.rodata`, `.data.rel.ro`, `.comment` | -| PE | `PeParser` | `.rdata`, `.rsrc`, version info | -| Mach-O | `MachoParser` | `__TEXT,__cstring`, `__DATA_CONST` | +| Format | Parser | Key Sections (Weight) | Import/Export Support | +| ------ | ------------- | -------------------------------------------------------- | ----------------------- | +| ELF | `ElfParser` | `.rodata` (10.0), `.comment` (9.0), `.data.rel.ro` (7.0) | ✅ Dynamic & Static | +| PE | `PeParser` | `.rdata` (10.0), `.rsrc` (9.0), read-only `.data` (7.0) | ✅ Import/Export Tables | +| Mach-O | `MachoParser` | `__TEXT,__cstring` (10.0), `__TEXT,__const` (9.0) | ✅ Symbol Tables | -### 2. Extraction Module (`src/extraction/`) +#### Section Weight System -Implements encoding-aware string extraction algorithms. 
+The parsers implement intelligent section prioritization: -- **ASCII/UTF-8**: Scans for printable character sequences -- **UTF-16**: Detects little-endian and big-endian wide strings -- **Deduplication**: Canonicalizes strings while preserving metadata +```rust +// Example: ELF section weights +".rodata" | ".rodata.str1.*" => 10.0 // Highest priority +".comment" | ".note.*" => 9.0 // Build info, very likely strings +".data.rel.ro" => 7.0 // Read-only data +".data" => 5.0 // Writable data +".text" => 1.0 // Code sections (low priority) +``` + +### 2. Extraction Module (`src/extraction/`) 🚧 **Framework Ready** + +Implements encoding-aware string extraction algorithms with configurable parameters. + +- **ASCII/UTF-8**: Scans for printable character sequences with noise filtering +- **UTF-16**: Detects little-endian and big-endian wide strings with confidence scoring +- **Deduplication**: Canonicalizes strings while preserving complete metadata +- **Section-Aware**: Uses container parser weights to prioritize extraction areas -### 3. Classification Module (`src/classification/`) +### 3. Classification Module (`src/classification/`) 🚧 **Types Defined** -Applies semantic analysis to extracted strings. +Applies semantic analysis to extracted strings with comprehensive tagging system. -- **Pattern Matching**: Uses regex to identify URLs, IPs, paths, etc. +- **Pattern Matching**: Uses regex to identify URLs, IPs, paths, GUIDs, etc. - **Symbol Processing**: Demangles Rust symbols and processes imports/exports -- **Context Analysis**: Considers section context for classification +- **Context Analysis**: Considers section context and source type for classification +- **Extensible Tags**: Supports 15+ semantic categories from network indicators to code artifacts -### 4. Ranking Module (`src/classification/ranking.rs`) +#### Supported Classification Tags -Implements the scoring algorithm to prioritize relevant strings. 
+| Category | Tags | Examples | +| ----------- | --------------------------------- | ----------------------------------------------- | +| Network | `url`, `domain`, `ipv4`, `ipv6` | `https://api.com`, `example.com`, `192.168.1.1` | +| Filesystem | `filepath`, `regpath` | `/usr/bin/app`, `HKEY_LOCAL_MACHINE\...` | +| Identifiers | `guid`, `email`, `user-agent` | `{12345678-...}`, `user@domain.com` | +| Code | `fmt`, `b64`, `import`, `export` | `Error: %s`, `SGVsbG8=`, `CreateFileW` | +| Resources | `version`, `manifest`, `resource` | `v1.2.3`, XML config, UI strings | + +### 4. Ranking Module (`src/classification/ranking.rs`) 🚧 **Algorithm Designed** + +Implements the scoring algorithm to prioritize relevant strings using multiple factors. ```text Score = SectionWeight + EncodingConfidence + SemanticBoost - NoisePenalty ``` -### 5. Output Module (`src/output/`) +**Scoring Components:** + +- **Section Weight**: 1.0-10.0 based on section classification +- **Encoding Confidence**: Higher for clean UTF-8/ASCII vs. noisy UTF-16 +- **Semantic Boost**: +20-50 points for URLs, GUIDs, imports/exports +- **Noise Penalty**: -10 to -30 for high entropy, excessive length, repeated patterns -Formats results for different use cases. +### 5. Output Module (`src/output/`) 🚧 **Interfaces Defined** -- **Human-readable**: Sorted tables for interactive analysis -- **JSONL**: Structured data for automation -- **YARA**: Escaped strings for rule creation +Formats results for different use cases with consistent data structures. + +- **Human-readable**: Sorted tables with score, offset, section, tags, and truncated strings +- **JSONL**: Complete structured data including all metadata fields +- **YARA**: Properly escaped strings with hex alternatives and confidence grouping ## Data Flow -### 1. Binary Analysis Phase +### 1. 
Binary Analysis Phase ✅ **Implemented** ```rust -// Format detection -let format = detect_format(&data); -let parser = create_parser(format)?; +// Format detection using goblin +let format = detect_format(&data); // Returns BinaryFormat enum +let parser = create_parser(format)?; // Creates appropriate parser -// Container parsing +// Container parsing with full metadata extraction let container_info = parser.parse(&data)?; +// Returns: sections with weights, imports, exports, format info ``` -### 2. String Extraction Phase +**Current Implementation:** + +- Automatic format detection via `goblin::Object::parse()` +- Trait-based parser creation with `Box<dyn ContainerParser>` +- Comprehensive section analysis with classification and weighting +- Complete import/export symbol extraction + +### 2. String Extraction Phase 🚧 **Framework Ready** ```rust -// Extract strings from prioritized sections -for section in container_info.sections { - let strings = extract_strings(&data, &section)?; +// Extract strings from prioritized sections (by weight) +let mut all_strings = Vec::new(); +for section in container_info.sections.iter().filter(|s| s.weight > 5.0) { + let strings = extract_strings(&data, &section, &config)?; all_strings.extend(strings); } -// Deduplicate while preserving metadata +// Include import/export names as high-value strings +all_strings.extend(extract_symbol_strings(&container_info)); + +// Deduplicate while preserving all metadata let unique_strings = deduplicate(all_strings); ``` -### 3. Classification Phase +### 3.
Classification Phase 🚧 **Types Ready** ```rust -// Apply semantic classification +// Apply semantic classification with context awareness for string in &mut unique_strings { - string.tags = classify_string(&string.text, &string.context); - string.score = calculate_score(&string); + let context = StringContext { + section_type: string.section_type, + source: string.source, + encoding: string.encoding, + }; + + string.tags = classify_string(&string.text, &context); + string.score = calculate_score(&string, &context); } ``` -### 4. Output Phase +### 4. Output Phase 🚧 **Interfaces Defined** + +```rust +// Sort by relevance score (descending) +unique_strings.sort_by_key(|s| std::cmp::Reverse(s.score)); + +// Apply user filters and limits +let filtered = apply_filters(&unique_strings, &config); + +// Format according to requested output type +let output = match config.format { + OutputFormat::Human => format_human_readable(&filtered), + OutputFormat::Json => format_jsonl(&filtered), + OutputFormat::Yara => format_yara_rules(&filtered), +}; +``` + +## Current Implementation Details + +### Container Parser Architecture + +The container parsing system is fully implemented with a trait-based design: ```rust -// Sort by relevance and format output -unique_strings.sort_by_key(|s| -s.score); -let output = format_output(&unique_strings, &config); +pub trait ContainerParser { + fn detect(data: &[u8]) -> bool + where + Self: Sized; + fn parse(&self, data: &[u8]) -> Result<ContainerInfo>; +} ``` +**Format Detection Pipeline:** + +1. `detect_format()` uses `goblin::Object::parse()` to identify format +2. `create_parser()` returns appropriate `Box<dyn ContainerParser>` +3.
Parser extracts sections, imports, exports with full metadata + +### Section Classification System + +Each parser implements intelligent section classification: + +```rust +// ELF Example +fn classify_section(section: &SectionHeader, name: &str) -> SectionType { + if section.sh_flags & SHF_EXECINSTR != 0 { + return SectionType::Code; + } + + match name { + ".rodata" | ".rodata.str1.*" => SectionType::StringData, + ".comment" | ".note.*" => SectionType::StringData, + ".data.rel.ro" => SectionType::ReadOnlyData, + // ... more classifications + } +} +``` + +**Weight Calculation:** + +- String data sections: 8.0-10.0 (highest priority) +- Read-only data: 7.0 +- Resources: 8.0-9.0 +- Writable data: 5.0 +- Code: 1.0 (lowest priority) + +### Symbol Extraction + +All parsers extract import/export information: + +- **ELF**: Dynamic symbol table (`dynsyms`) and static symbols (`syms`) +- **PE**: Import/export tables with library names and ordinals +- **Mach-O**: Symbol tables with undefined/defined symbol filtering + +### Data Structures + +Core types are fully defined and serializable: + +```rust +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FoundString { + pub text: String, + pub encoding: Encoding, + pub offset: u64, + pub rva: Option, + pub section: Option, + pub length: u32, + pub tags: Vec, + pub score: i32, + pub source: StringSource, +} +``` + +**Tag System**: 15+ semantic categories ready for classification **Error Handling**: Comprehensive `StringyError` enum with context **Cross-Platform**: Handles platform-specific binary characteristics + ## Key Design Decisions ### Memory Efficiency diff --git a/docs/src/cli.md b/docs/src/cli.md index 806307f..e0a4f97 100644 --- a/docs/src/cli.md +++ b/docs/src/cli.md @@ -1,6 +1,6 @@ # Command Line Interface -**Note**: The CLI interface is currently under development. This documentation describes the planned interface. +**Current Status**: Basic CLI is implemented with argument parsing. 
Advanced features are in development. This documentation describes both current and planned functionality. ## Basic Syntax @@ -8,6 +8,20 @@ stringy [OPTIONS] ``` +**Currently Implemented:** + +```bash +stringy # Basic binary analysis +stringy --help # Show help information +stringy --version # Show version +``` + +**In Development:** + +```bash +stringy [OPTIONS] # Full option support +``` + ## Global Options ### Input/Output diff --git a/docs/src/introduction.md b/docs/src/introduction.md index f63be2e..3e8996f 100644 --- a/docs/src/introduction.md +++ b/docs/src/introduction.md @@ -68,6 +68,24 @@ Analyze binaries for hardcoded credentials, API endpoints, configuration data, a ## Project Status -Stringy is currently in active development. The core infrastructure is complete, including binary format detection and section classification. String extraction, semantic classification, and output formatting are being implemented. +Stringy is in active development with a solid foundation already in place. The core infrastructure is complete and robust: + +**✅ Implemented:** + +- Complete binary format detection (ELF, PE, Mach-O) +- Comprehensive section classification with intelligent weighting +- Import/export symbol extraction from all formats +- Type-safe error handling and data structures +- Extensible architecture with trait-based parsers + +**🚧 In Progress:** + +- String extraction engines (ASCII/UTF-8, UTF-16) +- Semantic classification system (URLs, paths, GUIDs, etc.) +- Ranking and scoring algorithms +- Output formatters (JSON, human-readable, YARA) +- Full CLI interface implementation + +The foundation provides reliable binary analysis capabilities that can already identify and classify sections by their likelihood of containing meaningful strings, extract symbol information, and handle cross-platform binary formats. See the [Architecture Overview](./architecture.md) for technical details and the [Contributing](./contributing.md) guide to get involved. 
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index b28de14..e4c4f47 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -4,7 +4,7 @@ This guide will get you up and running with Stringy in minutes. ## Basic Usage -**Note**: The CLI interface is currently under development. This guide shows the planned interface. +**Current Status**: Basic CLI is implemented with advanced features in development. This guide shows both current and planned functionality. ### Analyze a Binary @@ -12,13 +12,32 @@ This guide will get you up and running with Stringy in minutes. stringy /path/to/binary ``` -This performs a basic analysis with default settings: +**Current Implementation**: Performs binary format detection and section analysis: -- Extracts ASCII and UTF-16 strings -- Applies semantic classification -- Shows top results in human-readable format +- Detects ELF, PE, or Mach-O format automatically +- Classifies sections by string likelihood with weighted scoring +- Extracts import/export symbol names +- Shows basic analysis results -### Example Output +**Planned Features**: Full string extraction and classification: + +- Extract ASCII and UTF-16 strings from prioritized sections +- Apply semantic classification (URLs, paths, GUIDs, etc.) +- Show ranked results in human-readable format + +### Current Output + +```text +Stringy - Binary string extraction tool +Format: ELF +Sections found: 12 +High-priority sections: .rodata (weight: 10.0), .comment (weight: 9.0) +Imports: 45 symbols +Exports: 12 symbols +Implementation coming soon... 
+``` + +### Planned Output ```text Score Offset Section Encoding Tags String diff --git a/justfile b/justfile index d73f1a5..96c4802 100644 --- a/justfile +++ b/justfile @@ -134,7 +134,6 @@ format: fmt format-json-yaml format-docs fmt-justfile format-json-yaml: npx prettier --write "**/*.{json,yaml,yml}" - [windows] format-docs: @if (Get-Command mdformat -ErrorAction SilentlyContinue) { Get-ChildItem -Recurse -Filter "*.md" | Where-Object { $_.FullName -notmatch "\\target\\" -and $_.FullName -notmatch "\\node_modules\\" } | ForEach-Object { mdformat $_.FullName } } else { Write-Host "mdformat not found. Run 'just mdformat-install' first." } diff --git a/src/container/elf.rs b/src/container/elf.rs index 14ebd16..6529e41 100644 --- a/src/container/elf.rs +++ b/src/container/elf.rs @@ -5,6 +5,7 @@ use crate::types::{ }; use goblin::Object; use goblin::elf::{Elf, SectionHeader}; +use std::collections::HashSet; /// Parser for ELF (Executable and Linkable Format) binaries pub struct ElfParser; @@ -20,6 +21,35 @@ impl ElfParser { Self } + /// Calculate section weight based on likelihood of containing meaningful strings + fn calculate_section_weight(section_type: SectionType, name: &str) -> f32 { + match section_type { + // String data sections get highest weight + SectionType::StringData => { + match name { + // Dedicated string sections get maximum weight + ".rodata" | ".rodata.str1.1" | ".rodata.str1.4" | ".rodata.str1.8" => 10.0, + // Comment sections are also very likely to contain strings + ".comment" | ".note" | ".note.gnu.build-id" => 9.0, + // Other string data sections + _ => 8.0, + } + } + // Read-only data sections are likely to contain strings + SectionType::ReadOnlyData => 7.0, + // Writable data sections may contain strings but less likely + SectionType::WritableData => 5.0, + // Code sections unlikely to contain meaningful strings + SectionType::Code => 1.0, + // Debug sections may contain some strings but usually not user-facing + SectionType::Debug => 
2.0, + // Resources (not applicable to ELF but included for completeness) + SectionType::Resources => 8.0, + // Other sections get minimal weight + SectionType::Other => 1.0, + } + } + /// Classify ELF section based on its name and flags fn classify_section(section: &SectionHeader, name: &str) -> SectionType { // Check section flags first @@ -50,22 +80,65 @@ impl ElfParser { } } - /// Extract basic import information from ELF dynamic section + /// Extract import information from ELF dynamic section + /// Imports are symbols that are undefined (SHN_UNDEF) and need to be resolved at runtime fn extract_imports(&self, elf: &Elf) -> Vec<ImportInfo> { let mut imports = Vec::new(); // Extract from dynamic symbol table for sym in &elf.dynsyms { - if sym.st_bind() == goblin::elf::sym::STB_GLOBAL - && sym.st_type() == goblin::elf::sym::STT_FUNC - && sym.st_shndx == (goblin::elf::section_header::SHN_UNDEF as usize) + // Import symbols are: + // - Undefined (st_shndx == SHN_UNDEF) + // - Global or weak binding + // - Functions or objects + if sym.st_shndx == (goblin::elf::section_header::SHN_UNDEF as usize) + && (sym.st_bind() == goblin::elf::sym::STB_GLOBAL + || sym.st_bind() == goblin::elf::sym::STB_WEAK) + && (sym.st_type() == goblin::elf::sym::STT_FUNC + || sym.st_type() == goblin::elf::sym::STT_OBJECT + || sym.st_type() == goblin::elf::sym::STT_NOTYPE) { if let Some(name) = elf.dynstrtab.get_at(sym.st_name) { - imports.push(ImportInfo { - name: name.to_string(), - library: None, // ELF doesn't directly specify library names in symbols - address: Some(sym.st_value), - }); + // Skip empty names + if !name.is_empty() { + imports.push(ImportInfo { + name: name.to_string(), + library: self.extract_library_from_needed(elf, name), + address: if sym.st_value != 0 { + Some(sym.st_value) + } else { + None + }, + }); + } + } + } + } + + // Also check regular symbol table for static imports + for sym in &elf.syms { + if sym.st_shndx == (goblin::elf::section_header::SHN_UNDEF as usize) + &&
(sym.st_bind() == goblin::elf::sym::STB_GLOBAL + || sym.st_bind() == goblin::elf::sym::STB_WEAK) + && (sym.st_type() == goblin::elf::sym::STT_FUNC + || sym.st_type() == goblin::elf::sym::STT_OBJECT + || sym.st_type() == goblin::elf::sym::STT_NOTYPE) + { + if let Some(name) = elf.strtab.get_at(sym.st_name) { + if !name.is_empty() { + // Avoid duplicates from dynamic symbol table + if !imports.iter().any(|imp| imp.name == name) { + imports.push(ImportInfo { + name: name.to_string(), + library: None, // Static symbols don't have library info + address: if sym.st_value != 0 { + Some(sym.st_value) + } else { + None + }, + }); + } + } } } } @@ -73,22 +146,63 @@ impl ElfParser { imports } + /// Attempt to extract library information from DT_NEEDED entries + /// This is a best-effort approach since ELF doesn't directly link symbols to libraries + fn extract_library_from_needed(&self, elf: &Elf, _symbol_name: &str) -> Option { + // For now, we can't reliably determine which specific library a symbol comes from + // in ELF without additional information like version symbols or relocation data. + // This would require more complex analysis of the dynamic linking process. + + // We could potentially return the first DT_NEEDED entry as a fallback, + // but that would be misleading. Better to return None for accuracy. 
+ + // Future enhancement: analyze PLT/GOT relocations to match symbols to libraries + let _ = elf; // Suppress unused parameter warning + None + } + /// Extract basic export information from ELF symbol table fn extract_exports(&self, elf: &Elf) -> Vec<ExportInfo> { let mut exports = Vec::new(); + let mut seen_names = HashSet::new(); // Extract from dynamic symbol table for sym in &elf.dynsyms { - if sym.st_bind() == goblin::elf::sym::STB_GLOBAL + if (sym.st_bind() == goblin::elf::sym::STB_GLOBAL + || sym.st_bind() == goblin::elf::sym::STB_WEAK) && sym.st_shndx != (goblin::elf::section_header::SHN_UNDEF as usize) && sym.st_value != 0 { if let Some(name) = elf.dynstrtab.get_at(sym.st_name) { - exports.push(ExportInfo { - name: name.to_string(), - address: sym.st_value, - ordinal: None, // ELF doesn't use ordinals - }); + if !name.is_empty() && seen_names.insert(name.to_string()) { + exports.push(ExportInfo { + name: name.to_string(), + address: sym.st_value, + ordinal: None, // ELF doesn't use ordinals + }); + } + } + } + } + + // Also check regular symbol table for static exports + for sym in &elf.syms { + if (sym.st_bind() == goblin::elf::sym::STB_GLOBAL + || sym.st_bind() == goblin::elf::sym::STB_WEAK) + && sym.st_shndx != (goblin::elf::section_header::SHN_UNDEF as usize) + && sym.st_value != 0 + && (sym.st_type() == goblin::elf::sym::STT_FUNC + || sym.st_type() == goblin::elf::sym::STT_OBJECT + || sym.st_type() == goblin::elf::sym::STT_NOTYPE) + { + if let Some(name) = elf.strtab.get_at(sym.st_name) { + if !name.is_empty() && seen_names.insert(name.to_string()) { + exports.push(ExportInfo { + name: name.to_string(), + address: sym.st_value, + ordinal: None, // ELF doesn't use ordinals + }); + } } } } @@ -125,6 +239,7 @@ impl ContainerParser for ElfParser { } let section_type = Self::classify_section(section, &name); + let weight = Self::calculate_section_weight(section_type, &name); sections.push(SectionInfo { name, @@ -137,6 +252,7 @@ impl ContainerParser for ElfParser {
!= 0, is_writable: section.sh_flags & (goblin::elf::section_header::SHF_WRITE as u64) != 0, + weight, }); } @@ -155,6 +271,7 @@ impl ContainerParser for ElfParser { #[cfg(test)] mod tests { use super::*; + use goblin::elf::section_header::{SHF_EXECINSTR, SectionHeader}; #[test] fn test_elf_detection() { @@ -168,8 +285,6 @@ mod tests { #[test] fn test_section_classification() { - use goblin::elf::section_header::{SHF_EXECINSTR, SectionHeader}; - // Create a mock section header for testing let section = SectionHeader { sh_flags: SHF_EXECINSTR as u64, @@ -189,28 +304,52 @@ mod tests { ElfParser::classify_section(&data_section, ".rodata"), SectionType::StringData ); + assert_eq!( + ElfParser::classify_section(&data_section, ".rodata.str1.1"), + SectionType::StringData + ); assert_eq!( ElfParser::classify_section(&data_section, ".comment"), SectionType::StringData ); + assert_eq!( + ElfParser::classify_section(&data_section, ".note"), + SectionType::StringData + ); // Test read-only data sections assert_eq!( ElfParser::classify_section(&data_section, ".data.rel.ro"), SectionType::ReadOnlyData ); + assert_eq!( + ElfParser::classify_section(&data_section, ".data.rel.ro.local"), + SectionType::ReadOnlyData + ); // Test writable data sections assert_eq!( ElfParser::classify_section(&data_section, ".data"), SectionType::WritableData ); + assert_eq!( + ElfParser::classify_section(&data_section, ".bss"), + SectionType::WritableData + ); // Test debug sections assert_eq!( ElfParser::classify_section(&data_section, ".debug_info"), SectionType::Debug ); + assert_eq!( + ElfParser::classify_section(&data_section, ".strtab"), + SectionType::Debug + ); + assert_eq!( + ElfParser::classify_section(&data_section, ".symtab"), + SectionType::Debug + ); // Test other sections assert_eq!( @@ -225,4 +364,111 @@ mod tests { // Just verify we can create the parser // Test passes - basic functionality verified } + + #[test] + fn test_section_weight_calculation() { + // Test weight calculation 
for different section types and names + + // String data sections should get highest weights + assert_eq!( + ElfParser::calculate_section_weight(SectionType::StringData, ".rodata"), + 10.0 + ); + assert_eq!( + ElfParser::calculate_section_weight(SectionType::StringData, ".rodata.str1.1"), + 10.0 + ); + assert_eq!( + ElfParser::calculate_section_weight(SectionType::StringData, ".comment"), + 9.0 + ); + assert_eq!( + ElfParser::calculate_section_weight(SectionType::StringData, ".note"), + 9.0 + ); + + // Read-only data sections + assert_eq!( + ElfParser::calculate_section_weight(SectionType::ReadOnlyData, ".data.rel.ro"), + 7.0 + ); + + // Writable data sections + assert_eq!( + ElfParser::calculate_section_weight(SectionType::WritableData, ".data"), + 5.0 + ); + + // Code sections should get low weight + assert_eq!( + ElfParser::calculate_section_weight(SectionType::Code, ".text"), + 1.0 + ); + + // Debug sections + assert_eq!( + ElfParser::calculate_section_weight(SectionType::Debug, ".debug_info"), + 2.0 + ); + + // Other sections + assert_eq!( + ElfParser::calculate_section_weight(SectionType::Other, ".unknown"), + 1.0 + ); + } + + #[test] + fn test_symbol_filtering_constants() { + // Test the symbol filtering logic by checking the constants we use + use goblin::elf::section_header::SHN_UNDEF; + use goblin::elf::sym::{STB_GLOBAL, STB_WEAK, STT_FUNC, STT_OBJECT}; + + // Verify that our filtering constants are correct + assert_eq!(SHN_UNDEF, 0); // Undefined section index + assert_eq!(STB_GLOBAL, 1); // Global binding + assert_eq!(STB_WEAK, 2); // Weak binding + assert_eq!(STT_FUNC, 2); // Function type + assert_eq!(STT_OBJECT, 1); // Object type + + // These constants are used in our import/export filtering logic + // This test ensures they remain consistent with the goblin crate + } + + #[test] + fn test_import_export_extraction_methods_exist() { + // Test that the import/export extraction methods exist and can be called + // Full functionality testing requires 
integration tests with real ELF binaries + let parser = ElfParser::new(); + + // We can't easily create a valid ELF structure for unit testing, + // but we can verify the methods exist and have the right signatures + // by checking that they compile and can be referenced + let _extract_imports = ElfParser::extract_imports; + let _extract_exports = ElfParser::extract_exports; + let _extract_library = ElfParser::extract_library_from_needed; + + // Verify parser can be created (this is a compile-time check) + let _ = parser; + } + + #[test] + fn test_library_extraction_behavior() { + // Test the documented behavior of library extraction + let parser = ElfParser::new(); + + // Create a minimal ELF structure for testing + // We can't use Elf::default() as it doesn't exist, so we'll test the behavior + // by verifying that the method signature is correct and the documented behavior + + // The extract_library_from_needed method should return None as documented + // since ELF doesn't directly link symbols to libraries without additional analysis + + // This is a compile-time test to ensure the method exists with correct signature + let _method_ref: fn(&ElfParser, &Elf, &str) -> Option = + ElfParser::extract_library_from_needed; + + // Verify the parser exists + let _ = parser; + } } diff --git a/src/container/macho.rs b/src/container/macho.rs index b7fe03b..347c3fe 100644 --- a/src/container/macho.rs +++ b/src/container/macho.rs @@ -39,6 +39,40 @@ impl MachoParser { Self } + /// Calculate section weight based on likelihood of containing meaningful strings + fn calculate_section_weight( + section_type: SectionType, + segment_name: &str, + section_name: &str, + ) -> f32 { + match section_type { + // String data sections get highest weight + SectionType::StringData => { + match (segment_name, section_name) { + // __cstring is the primary string section in Mach-O + ("__TEXT", "__cstring") => 10.0, + // __const may contain string constants + ("__TEXT", "__const") => 9.0, + // 
Core Foundation strings + ("__DATA_CONST", "__cfstring") => 8.5, + _ => 8.0, + } + } + // Read-only data sections are likely to contain strings + SectionType::ReadOnlyData => 7.0, + // Writable data sections may contain strings but less likely + SectionType::WritableData => 5.0, + // Code sections unlikely to contain meaningful strings + SectionType::Code => 1.0, + // Debug sections may contain some strings but usually not user-facing + SectionType::Debug => 2.0, + // Resources (not applicable to Mach-O but included for completeness) + SectionType::Resources => 8.0, + // Other sections get minimal weight + SectionType::Other => 1.0, + } + } + /// Classifies Mach-O section based on its segment and section name. /// /// Returns the appropriate `SectionType` for string extraction prioritization. @@ -187,6 +221,7 @@ impl MachoParser { let section_name = section.name().unwrap_or("unknown"); let section_type = Self::classify_section(segment_name, section_name); + let weight = Self::calculate_section_weight(section_type, segment_name, section_name); let full_name = Self::format_section_name(segment_name, section_name); Some(SectionInfo { @@ -197,6 +232,7 @@ impl MachoParser { section_type, is_executable: Self::is_executable_section(segment_name, section_name), is_writable: Self::is_writable_section(segment_name), + weight, }) } @@ -430,4 +466,61 @@ mod tests { assert!(!MachoParser::is_writable_section("__TEXT")); assert!(!MachoParser::is_writable_section("__DATA_CONST")); } + + #[test] + fn test_section_weight_calculation() { + // Test weight calculation for different section types and names + + // String data sections should get highest weights + assert_eq!( + MachoParser::calculate_section_weight(SectionType::StringData, "__TEXT", "__cstring"), + 10.0 + ); + assert_eq!( + MachoParser::calculate_section_weight(SectionType::StringData, "__TEXT", "__const"), + 9.0 + ); + assert_eq!( + MachoParser::calculate_section_weight( + SectionType::StringData, + "__DATA_CONST", + 
"__cfstring" + ), + 8.5 + ); + + // Read-only data sections + assert_eq!( + MachoParser::calculate_section_weight( + SectionType::ReadOnlyData, + "__DATA_CONST", + "__const" + ), + 7.0 + ); + + // Writable data sections + assert_eq!( + MachoParser::calculate_section_weight(SectionType::WritableData, "__DATA", "__data"), + 5.0 + ); + + // Code sections should get low weight + assert_eq!( + MachoParser::calculate_section_weight(SectionType::Code, "__TEXT", "__text"), + 1.0 + ); + + // Debug sections + assert_eq!( + MachoParser::calculate_section_weight(SectionType::Debug, "__DWARF", "__debug_info"), + 2.0 + ); + + // Other sections + assert_eq!( + MachoParser::calculate_section_weight(SectionType::Other, "__UNKNOWN", "__unknown"), + 1.0 + ); + } } diff --git a/src/container/mod.rs b/src/container/mod.rs index c861a94..d120c02 100644 --- a/src/container/mod.rs +++ b/src/container/mod.rs @@ -7,6 +7,11 @@ pub mod elf; pub mod macho; pub mod pe; +// Re-export parsers for easier access +pub use elf::ElfParser; +pub use macho::MachoParser; +pub use pe::PeParser; + /// Trait for parsing different container formats pub trait ContainerParser { /// Detect if this parser can handle the given data diff --git a/src/container/pe.rs b/src/container/pe.rs index 3e90a2e..590db25 100644 --- a/src/container/pe.rs +++ b/src/container/pe.rs @@ -20,6 +20,32 @@ impl PeParser { Self } + /// Calculate section weight based on likelihood of containing meaningful strings + fn calculate_section_weight(section_type: SectionType, name: &str) -> f32 { + match section_type { + // String data sections get highest weight + SectionType::StringData => { + match name { + // .rdata is the primary string section in PE + ".rdata" | ".rodata" => 10.0, + _ => 8.0, + } + } + // Resources often contain strings + SectionType::Resources => 9.0, + // Read-only data sections are likely to contain strings + SectionType::ReadOnlyData => 7.0, + // Writable data sections may contain strings but less likely + 
SectionType::WritableData => 5.0, + // Code sections unlikely to contain meaningful strings + SectionType::Code => 1.0, + // Debug sections may contain some strings but usually not user-facing + SectionType::Debug => 2.0, + // Other sections get minimal weight + SectionType::Other => 1.0, + } + } + /// Classify PE section based on its name and characteristics fn classify_section(section: &SectionTable) -> SectionType { let name_bytes = String::from_utf8_lossy(&section.name); @@ -119,6 +145,7 @@ impl ContainerParser for PeParser { } let section_type = Self::classify_section(section); + let weight = Self::calculate_section_weight(section_type, &name); sections.push(SectionInfo { name, @@ -132,6 +159,7 @@ impl ContainerParser for PeParser { is_writable: section.characteristics & goblin::pe::section_table::IMAGE_SCN_MEM_WRITE != 0, + weight, }); } @@ -245,4 +273,55 @@ mod tests { // Just verify we can create the parser // Test passes - basic functionality verified } + + #[test] + fn test_section_weight_calculation() { + // Test weight calculation for different section types and names + + // String data sections should get highest weights + assert_eq!( + PeParser::calculate_section_weight(SectionType::StringData, ".rdata"), + 10.0 + ); + assert_eq!( + PeParser::calculate_section_weight(SectionType::StringData, ".rodata"), + 10.0 + ); + + // Resources get high weight + assert_eq!( + PeParser::calculate_section_weight(SectionType::Resources, ".rsrc"), + 9.0 + ); + + // Read-only data sections + assert_eq!( + PeParser::calculate_section_weight(SectionType::ReadOnlyData, ".data"), + 7.0 + ); + + // Writable data sections + assert_eq!( + PeParser::calculate_section_weight(SectionType::WritableData, ".data"), + 5.0 + ); + + // Code sections should get low weight + assert_eq!( + PeParser::calculate_section_weight(SectionType::Code, ".text"), + 1.0 + ); + + // Debug sections + assert_eq!( + PeParser::calculate_section_weight(SectionType::Debug, ".debug"), + 2.0 + ); + + // Other 
sections + assert_eq!( + PeParser::calculate_section_weight(SectionType::Other, ".unknown"), + 1.0 + ); + } } diff --git a/src/lib.rs b/src/lib.rs index 4e88a33..1418c9e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,44 @@ //! //! Stringy leverages format-specific knowledge to distinguish meaningful strings //! from random garbage data in binary files. +//! +//! ## Current Implementation Status +//! +//! The core infrastructure is complete and robust: +//! +//! - **Binary Format Detection**: Automatic ELF, PE, Mach-O detection via `goblin` +//! - **Container Parsing**: Full section analysis with intelligent classification +//! - **Import/Export Extraction**: Symbol processing from all supported formats +//! - **Section Weighting**: Priority-based scoring for string extraction +//! - **Type Safety**: Comprehensive error handling and data structures +//! +//! ## Basic Usage +//! +//! ```rust +//! use stringy::container::{detect_format, create_parser}; +//! +//! # fn example() -> stringy::Result<()> { +//! let data = std::fs::read("binary_file")?; +//! let format = detect_format(&data); +//! let parser = create_parser(format)?; +//! let container_info = parser.parse(&data)?; +//! +//! println!("Format: {:?}", container_info.format); +//! println!("Sections: {}", container_info.sections.len()); +//! println!("Imports: {}", container_info.imports.len()); +//! # Ok(()) +//! # } +//! ``` +//! +//! ## Architecture +//! +//! The library is organized into focused modules: +//! +//! - [`container`]: Binary format detection and parsing (✅ Complete) +//! - [`extraction`]: String extraction algorithms (🚧 Framework ready) +//! - [`classification`]: Semantic analysis and tagging (🚧 Types defined) +//! - [`output`]: Result formatting (🚧 Interfaces ready) +//! 
- [`types`]: Core data structures and error handling (✅ Complete) pub mod classification; pub mod container; diff --git a/src/types.rs b/src/types.rs index 524a90f..b05074a 100644 --- a/src/types.rs +++ b/src/types.rs @@ -113,6 +113,8 @@ pub struct SectionInfo { pub is_executable: bool, /// Whether the section is writable pub is_writable: bool, + /// Weight indicating likelihood of containing meaningful strings (higher = more likely) + pub weight: f32, } /// Information about an import diff --git a/tests/integration_elf.rs b/tests/integration_elf.rs new file mode 100644 index 0000000..28dc765 --- /dev/null +++ b/tests/integration_elf.rs @@ -0,0 +1,337 @@ +use std::fs; +use std::fs::File; +use std::io::Write; +use std::process::Command; +use stringy::container::{ContainerParser, ElfParser}; +use tempfile::TempDir; + +#[test] +#[cfg(target_family = "unix")] +fn test_elf_import_export_extraction_dynamic() { + // Create a simple C program that we can compile to test with + let c_code = r#" +#include <stdio.h> +#include <stdlib.h> + +// Export a function +int exported_function(int x) { + return x * 2; +} + +// Use some imports +int main() { + printf("Hello, world!\n"); // Import from libc + void* ptr = malloc(100); // Import from libc + free(ptr); // Import from libc + return 0; +} +"#; + + // Write the C code to a temporary file + let temp_dir = std::env::temp_dir(); + let c_file = temp_dir.join("test_elf.c"); + let elf_file = temp_dir.join("test_elf"); + + fs::write(&c_file, c_code).expect("Failed to write C file"); + + // Try to compile it with gcc, attempting to force ELF output + // First try with a cross-compiler for Linux if available + // NOTE: This is for dynamic linking test, so we DON'T use -static + let mut output = Command::new("x86_64-linux-gnu-gcc") + .args(["-o", elf_file.to_str().unwrap(), c_file.to_str().unwrap()]) + .output(); + + // If cross-compiler not available, try regular gcc (dynamically linked) + if output.is_err() { + output = Command::new("gcc") + .args(["-o", 
elf_file.to_str().unwrap(), c_file.to_str().unwrap()]) + .output(); + } + + match output { + Ok(result) if result.status.success() => { + // Successfully compiled, now test our ELF parser + let elf_data = fs::read(&elf_file).expect("Failed to read ELF file"); + + // Check what format we actually got + match goblin::Object::parse(&elf_data) { + Ok(goblin::Object::Elf(_)) => { + // Great! We have an ELF binary, test our parser + assert!(ElfParser::detect(&elf_data), "ELF detection should succeed"); + } + Ok(goblin::Object::Mach(_)) => { + println!("Got Mach-O binary (expected on macOS), skipping ELF-specific test"); + // Clean up and return early + let _ = fs::remove_file(&c_file); + let _ = fs::remove_file(&elf_file); + return; + } + Ok(other) => { + println!( + "Got unexpected binary format: {:?}, skipping test", + std::mem::discriminant(&other) + ); + let _ = fs::remove_file(&c_file); + let _ = fs::remove_file(&elf_file); + return; + } + Err(e) => { + println!("Failed to parse binary: {}, skipping test", e); + let _ = fs::remove_file(&c_file); + let _ = fs::remove_file(&elf_file); + return; + } + } + + // Test parsing + let parser = ElfParser::new(); + let container_info = parser.parse(&elf_data).expect("Failed to parse ELF"); + + // Verify we found some imports + assert!( + !container_info.imports.is_empty(), + "Should find imports like printf, malloc, free" + ); + + // Check that we found expected imports + let import_names: Vec<&str> = container_info + .imports + .iter() + .map(|imp| imp.name.as_str()) + .collect(); + + // We should find at least some of these common libc functions + let expected_imports = ["printf", "malloc", "free", "__libc_start_main"]; + let found_expected = expected_imports + .iter() + .any(|&expected| import_names.contains(&expected)); + + assert!( + found_expected, + "Should find at least one expected import. 
Found: {:?}", + import_names + ); + + // Verify we found some exports (at least main and our exported function) + // Note: exports might be stripped in some builds, so we'll be lenient + println!( + "Found {} imports and {} exports", + container_info.imports.len(), + container_info.exports.len() + ); + + // Clean up + let _ = fs::remove_file(&c_file); + let _ = fs::remove_file(&elf_file); + } + Ok(_) => { + println!("gcc compilation failed, skipping ELF integration test"); + // This is not a test failure - just means gcc isn't available + } + Err(_) => { + println!("gcc not found, skipping ELF integration test"); + // This is not a test failure - just means gcc isn't available + } + } +} + +#[test] +#[cfg(target_family = "unix")] +fn test_elf_import_export_extraction_static() { + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let c_file = temp_dir.path().join("test_static.c"); + let elf_file = temp_dir.path().join("test_static"); + + let c_code = r#" + #include <stdio.h> + #include <stdlib.h> + + void exported_function() { + printf("Hello from exported function\n"); + } + + int main() { + void *ptr = malloc(100); + printf("Allocated memory\n"); + free(ptr); + exported_function(); + return 0; + } + "#; + + File::create(&c_file) + .expect("Failed to create C file") + .write_all(c_code.as_bytes()) + .expect("Failed to write C code"); + + // Compile statically-linked binary with -static flag + let mut output = Command::new("x86_64-linux-gnu-gcc") + .args([ + "-static", + "-o", + elf_file.to_str().unwrap(), + c_file.to_str().unwrap(), + ]) + .output(); + + if output.is_err() || !output.as_ref().map(|o| o.status.success()).unwrap_or(false) { + output = Command::new("gcc") + .args([ + "-static", + "-o", + elf_file.to_str().unwrap(), + c_file.to_str().unwrap(), + ]) + .output(); + } + + match output { + Ok(output) if output.status.success() => { + let elf_data = fs::read(&elf_file).expect("Failed to read ELF file"); + + let format_obj = 
goblin::Object::parse(&elf_data).expect("Failed to parse with goblin"); + + match format_obj { + goblin::Object::Elf(_elf) => { + let parser = ElfParser::new(); + let container_info = parser.parse(&elf_data).expect("Failed to parse ELF"); + + // Statically-linked binaries typically have no or very few dynamic imports + // since all dependencies are embedded + println!( + "Static binary imports found: {} (expected: 0 or very few)", + container_info.imports.len() + ); + + // Check exports - note that static binaries may have symbols stripped + // or may not expose them depending on compilation flags + let export_names: Vec<String> = container_info + .exports + .iter() + .map(|e| e.name.clone()) + .collect(); + + println!( + "Static binary exports found: {} exports: {:?}", + container_info.exports.len(), + export_names + ); + + // If exports are present, verify expected ones exist + // Note: Exports may be stripped in static binaries, so this is not always guaranteed + if !container_info.exports.is_empty() { + let has_main = export_names.iter().any(|name| name == "main"); + let has_exported_function = + export_names.iter().any(|name| name == "exported_function"); + + if has_main || has_exported_function { + println!( + "Found expected exports: main={}, exported_function={}", + has_main, has_exported_function + ); + } + } else { + println!( + "No exports found in static binary. This can happen when symbols are stripped or not exported." + ); + } + } + goblin::Object::Mach(_) => { + println!("Compiled to Mach-O, skipping ELF-specific test"); + } + _ => panic!("Unexpected binary format"), + } + } + Ok(output) => { + let stderr = String::from_utf8_lossy(&output.stderr); + println!( + "Static compilation failed, skipping test. This is expected if static libraries are not available.\nError: {}", + stderr + ); + } + Err(e) => { + println!( + "GCC not available, skipping test. This is expected in some CI environments. 
Error: {}", + e + ); + } + } +} + +#[test] +#[cfg(target_family = "unix")] +fn test_elf_section_classification_integration() { + // Test with the current binary (this test executable) + let current_exe = std::env::current_exe().expect("Failed to get current executable path"); + + if let Ok(elf_data) = fs::read(&current_exe) { + if ElfParser::detect(&elf_data) { + let parser = ElfParser::new(); + if let Ok(container_info) = parser.parse(&elf_data) { + // Verify we found sections and classified them + assert!( + !container_info.sections.is_empty(), + "Should find sections in ELF binary" + ); + + // Look for common ELF sections and verify weights are assigned + let section_names: Vec<&str> = container_info + .sections + .iter() + .map(|sec| sec.name.as_str()) + .collect(); + + println!("Found sections: {:?}", section_names); + + // Verify that all sections have weights assigned + for section in &container_info.sections { + assert!( + section.weight > 0.0, + "Section {} should have a positive weight, got {}", + section.name, + section.weight + ); + } + + // Check that string data sections get higher weights than code sections + let string_sections: Vec<_> = container_info + .sections + .iter() + .filter(|sec| { + matches!(sec.section_type, stringy::types::SectionType::StringData) + }) + .collect(); + let code_sections: Vec<_> = container_info + .sections + .iter() + .filter(|sec| matches!(sec.section_type, stringy::types::SectionType::Code)) + .collect(); + + if !string_sections.is_empty() && !code_sections.is_empty() { + let max_string_weight = string_sections + .iter() + .map(|s| s.weight) + .fold(0.0f32, f32::max); + let max_code_weight = code_sections + .iter() + .map(|s| s.weight) + .fold(0.0f32, f32::max); + assert!( + max_string_weight > max_code_weight, + "String sections should have higher weight than code sections" + ); + } + + // We should find at least some standard sections + let has_text = section_names.iter().any(|&name| name.contains(".text")); + let 
has_rodata = section_names.iter().any(|&name| name.contains(".rodata")); + + // At least one of these should be present in a typical ELF + assert!( + has_text || has_rodata, + "Should find .text or .rodata sections" + ); + } + } + } +}