diff --git a/.claude/instincts/ast-derive-pattern.md b/.claude/instincts/ast-derive-pattern.md new file mode 100644 index 00000000..2f80f6a6 --- /dev/null +++ b/.claude/instincts/ast-derive-pattern.md @@ -0,0 +1,45 @@ +--- +id: libmagic-rs-ast-derives +trigger: "when creating new AST types or data structures" +confidence: 0.9 +domain: rust-types +source: local-repo-analysis +--- + +# AST Type Derive Pattern + +## Action + +When creating new data structures (especially AST nodes): + +1. Derive: `Debug, Clone, Serialize, Deserialize, PartialEq, Eq` +2. Add rustdoc with `# Examples` section +3. Use `#[non_exhaustive]` on public enums +4. Document each field/variant with `///` comments +5. Import `serde::{Serialize, Deserialize}` (not `serde_derive`) + +```rust +use serde::{Deserialize, Serialize}; + +/// Description of the type +/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::parser::ast::MyType; +/// let val = MyType::Variant(42); +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[non_exhaustive] +pub enum MyType { + /// Description of variant + Variant(i64), +} +``` + +## Evidence + +- All types in `src/parser/ast.rs` follow this exact pattern +- 6 enum types and 1 struct type all derive the same 6 traits +- Every public type has rustdoc examples +- Every variant has `///` doc comments diff --git a/.claude/instincts/build-script-testing.md b/.claude/instincts/build-script-testing.md new file mode 100644 index 00000000..86102896 --- /dev/null +++ b/.claude/instincts/build-script-testing.md @@ -0,0 +1,26 @@ +--- +id: libmagic-rs-build-testing +trigger: "when modifying build.rs or build-time logic" +confidence: 0.85 +domain: rust-build +source: local-repo-analysis +--- + +# Build Script Testing Pattern + +## Action + +Build scripts (`build.rs`) cannot import the crate being built. To test build logic: + +1. Extract build logic into `src/build_helpers.rs` with `#[cfg(any(test, doc))]` +2. Keep `build.rs` minimal - it should only call functions from the helper module +3. Write unit tests in `build_helpers.rs` to verify all code paths +4. Test error cases (invalid magic files, malformed input) + +This ensures build-time failures produce clear error messages and are properly tested. + +## Evidence + +- `src/build_helpers.rs` exists with testable parsing and code generation +- `build.rs` delegates to functions from `build_helpers` +- Pattern documented in AGENTS.md as a project convention diff --git a/.claude/instincts/co-change-awareness.md b/.claude/instincts/co-change-awareness.md new file mode 100644 index 00000000..d26ed27e --- /dev/null +++ b/.claude/instincts/co-change-awareness.md @@ -0,0 +1,29 @@ +--- +id: libmagic-rs-co-change +trigger: "when modifying core source files" +confidence: 0.85 +domain: architecture +source: local-repo-analysis +--- + +# Co-Change Awareness + +## Action + +When modifying these files, check if related files also need updates: + +| If you change... | Also check... | +|-------------------|--------------| +| `src/lib.rs` | `src/main.rs`, `src/parser/ast.rs`, tests | +| `src/evaluator/mod.rs` | `src/lib.rs`, `src/main.rs`, tests | +| `src/parser/ast.rs` | `src/lib.rs`, `src/parser/grammar.rs`, `src/evaluator/types.rs` | +| `src/main.rs` | `tests/cli_integration_tests.rs` | +| `Cargo.toml` | `src/lib.rs`, `src/main.rs` | +| `src/parser/grammar.rs` | `src/parser/mod.rs`, `tests/parser_integration_tests.rs` | + +## Evidence + +- Analyzed 34 commits for file co-change frequency +- `src/lib.rs` + `src/main.rs` changed together 8 times (100% of main.rs changes) +- `src/evaluator/mod.rs` + `src/lib.rs` changed together 8 times +- API types are re-exported through `src/lib.rs`, making it a hub file diff --git a/.claude/instincts/conventional-commits.md b/.claude/instincts/conventional-commits.md new file mode 100644 index 00000000..0b50d333 --- /dev/null +++ b/.claude/instincts/conventional-commits.md @@ -0,0 +1,39 @@ +--- +id: libmagic-rs-commits +trigger: "when writing a commit message" +confidence: 0.8 +domain: git +source: local-repo-analysis +--- + +# Conventional Commits Format + +## Action + +Prefix commit messages with type: + +- `feat:` - New features (most common) +- `fix:` - Bug fixes +- `chore:` - Maintenance (deps, config) +- `docs:` - Documentation changes +- `test:` - Test additions/changes +- `refactor:` - Code restructuring +- `perf:` - Performance improvements +- `ci:` - CI/CD changes + +Optional scope: `chore(deps):`, `chore(ci):` + +PR references: Include `(#N)` suffix when applicable. + +Examples from this repo: +- `feat: implement comprehensive test infrastructure` +- `feat: evaluation enhancements with confidence, MIME, tags, metadata (#29)` +- `chore(deps): update dependencies in mise.toml for improved tooling` +- `feat: built-in rules build time compilation fallback (#28)` + +## Evidence + +- Analyzed 34 commits +- 77% follow conventional commit format +- `feat:` is the most common type (30% of commits) +- PR numbers consistently included as `(#N)` suffix diff --git a/.claude/instincts/error-handling-pattern.md b/.claude/instincts/error-handling-pattern.md new file mode 100644 index 00000000..9b111a7d --- /dev/null +++ b/.claude/instincts/error-handling-pattern.md @@ -0,0 +1,42 @@ +--- +id: libmagic-rs-error-handling +trigger: "when adding new error types or handling errors" +confidence: 0.9 +domain: rust-errors +source: local-repo-analysis +--- + +# Error Handling with thiserror + Constructor Methods + +## Action + +When creating or extending error types: + +1. Use `thiserror::Error` derive macro +2. Include contextual fields (line numbers, offsets, names) +3. Add named constructor methods for each variant +4. Mark constructors with `#[must_use]` +5. Use `impl Into` for string parameters +6. Write unit tests that verify Display output + +```rust +#[derive(Debug, thiserror::Error)] +pub enum MyError { + #[error("Something failed at {location}: {reason}")] + SomethingFailed { location: usize, reason: String }, +} + +impl MyError { + #[must_use] + pub fn something_failed(location: usize, reason: impl Into) -> Self { + Self::SomethingFailed { location, reason: reason.into() } + } +} +``` + +## Evidence + +- `src/error.rs` defines 3 error enums with 20+ variants, all following this pattern +- Every variant has a corresponding constructor method +- All constructors use `impl Into` and `#[must_use]` +- Comprehensive tests verify Display formatting for each variant diff --git a/.claude/instincts/just-task-runner.md b/.claude/instincts/just-task-runner.md new file mode 100644 index 00000000..b91c4143 --- /dev/null +++ b/.claude/instincts/just-task-runner.md @@ -0,0 +1,35 @@ +--- +id: libmagic-rs-just-tasks +trigger: "when running project commands or CI checks" +confidence: 0.9 +domain: tooling +source: local-repo-analysis +--- + +# Use just for Task Running + +## Action + +Use `just` (not raw cargo) for project tasks. Key commands: + +| Command | Purpose | +|---------|---------| +| `just ci-check` | Full CI parity check (run before committing) | +| `just test` | Run tests with nextest | +| `just lint-rust` | Clippy with `-D warnings` | +| `just fmt` | Format Rust code | +| `just coverage` | Generate LCOV coverage report | +| `just coverage-report` | HTML coverage with browser open | +| `just bench` | Run all benchmarks | +| `just audit` | Security audit | +| `just docs` | Build and serve documentation | +| `just test-compatibility` | Test against original libmagic test suite | + +All commands use `mise exec --` prefix for tool version management. + +## Evidence + +- `justfile` has 50+ recipes organized into sections +- All CI workflows use `just` commands +- `mise.toml` manages tool versions (Rust, pre-commit, mdbook, etc.) +- Cross-platform support with `[unix]`/`[windows]` annotations diff --git a/.claude/instincts/strict-clippy-compliance.md b/.claude/instincts/strict-clippy-compliance.md new file mode 100644 index 00000000..c84c8710 --- /dev/null +++ b/.claude/instincts/strict-clippy-compliance.md @@ -0,0 +1,30 @@ +--- +id: libmagic-rs-strict-clippy +trigger: "when writing or modifying Rust code in this project" +confidence: 0.95 +domain: rust-linting +source: local-repo-analysis +--- + +# Strict Clippy Compliance + +## Action + +This project enforces extremely strict clippy settings. When writing code: + +- Never use `.unwrap()` - use `?`, `.ok()`, or match instead (clippy `unwrap_used = "deny"`) +- Never use `panic!()` in library code (clippy `panic = "deny"`) +- Never use `unsafe` code (workspace `unsafe_code = "forbid"`) +- Never use direct indexing on slices/buffers - use `.get()` (clippy `indexing_slicing = "warn"`) +- Never use `&str[n..]` - use `strip_prefix()`/`strip_suffix()` (clippy `string_slice = "warn"`) +- Mark all constructors/getters with `#[must_use]` +- Avoid `as` casts where possible (clippy `as_conversions = "warn"`) +- No `dbg!()` macros (clippy `dbg_macro = "warn"`) +- No `todo!()` macros (clippy `todo = "warn"`) + +## Evidence + +- Cargo.toml contains 80+ clippy lint configurations +- `unsafe_code = "forbid"` and `unwrap_used = "deny"` are workspace-level +- All 34 analyzed commits maintain zero-warning compliance +- CI enforces `cargo clippy -- -D warnings` diff --git a/.claude/instincts/testing-conventions.md b/.claude/instincts/testing-conventions.md new file mode 100644 index 00000000..8ea04678 --- /dev/null +++ b/.claude/instincts/testing-conventions.md @@ -0,0 +1,39 @@ +--- +id: libmagic-rs-testing +trigger: "when writing tests or adding new functionality" +confidence: 0.9 +domain: testing +source: local-repo-analysis +--- + +# Testing Conventions + +## Action + +Follow these testing patterns: + +1. **Unit tests**: Place in `#[cfg(test)] mod tests` within each source file +2. **Integration tests**: Add to `tests/` directory with `_tests.rs` suffix +3. **CLI tests**: Use `insta` snapshots in `tests/cli_integration_tests.rs` +4. **Property tests**: Add to `tests/property_tests.rs` using `proptest` +5. **Benchmarks**: Add to `benches/` using `criterion` with `harness = false` + +Run tests with: +```bash +cargo nextest run --workspace --no-capture # Standard +just ci-check # Full CI parity +just coverage # With coverage +``` + +Test naming: `test__` (e.g., `test_parse_error_display`) + +Coverage target: >85% with `cargo llvm-cov` + +## Evidence + +- 8 test files in `tests/` directory +- 3 benchmark files in `benches/` +- Every source file has inline `#[cfg(test)]` module +- `insta` used for snapshot testing CLI output +- `proptest` used for property-based testing +- `criterion` used for benchmarks (not built-in bench) diff --git a/.claude/skills/SKILL.md b/.claude/skills/SKILL.md new file mode 100644 index 00000000..0a44ff34 --- /dev/null +++ b/.claude/skills/SKILL.md @@ -0,0 +1,190 @@ +--- +name: libmagic-rs-patterns +description: Coding patterns extracted from libmagic-rs repository +version: 1.0.0 +source: local-git-analysis +analyzed_commits: 34 +--- + +# libmagic-rs Development Patterns + +## Commit Conventions + +This project uses **conventional commits** (77% of commits follow the pattern): + +- `feat:` - New features (most common, ~30%) +- `chore:` / `chore(deps):` / `chore(ci):` - Maintenance tasks +- `fix:` - Bug fixes +- `refactor:` - Code restructuring +- `docs:` - Documentation updates +- `test:` - Test additions/changes + +PR references use `(#N)` suffix format. + +## Architecture + +### Module Structure + +``` +src/ + lib.rs # Public API (MagicDatabase), re-exports + main.rs # CLI binary (rmagic), clap-based + error.rs # Error types (LibmagicError, ParseError, EvaluationError) + mime.rs # MIME type detection + tags.rs # Tag-based classification + build_helpers.rs # Testable build script logic + builtin_rules.rs # Built-in magic rules (compiled at build time) + builtin_rules.magic # Magic rule definitions + parser/ + mod.rs # Parser public interface + ast.rs # AST node definitions (OffsetSpec, TypeKind, Operator, etc.) + grammar.rs # nom-based parser combinators + evaluator/ + mod.rs # Main evaluation engine + offset.rs # Offset resolution (absolute, indirect, relative) + operators.rs # Comparison and bitwise operations + types.rs # Type interpretation with endianness + strength.rs # Rule strength calculation + io/ + mod.rs # Memory-mapped file I/O + output/ + mod.rs # Output formatting interface + json.rs # JSON output format + text.rs # Text output format +``` + +### Pipeline Pattern + +``` +Magic File -> Parser -> AST -> Evaluator -> Match Results -> Output Formatter + ^ +Target File -> Memory Mapper -> File Buffer +``` + +### Core Types + +- `MagicDatabase` - Main entry point for loading rules and evaluating files +- `MagicRule` - A single magic rule with offset, type, operator, value, description +- `OffsetSpec` - Absolute, Indirect, Relative, or FromEnd offsets +- `TypeKind` - Byte, Short, Long, String, Regex with endianness +- `Operator` - Equal, NotEqual, Greater, Less, BitwiseAnd, Xor +- `LibmagicError` / `ParseError` / `EvaluationError` - Error hierarchy + +## Co-Change Patterns (Files That Change Together) + +| File Group | Frequency | Reason | +|------------|-----------|--------| +| `src/lib.rs` + `src/main.rs` | 8x | API changes require CLI updates | +| `src/evaluator/mod.rs` + `src/lib.rs` | 8x | Evaluator changes exposed through lib API | +| `Cargo.toml` + `src/lib.rs` | 6x | Dependency changes affect library code | +| `src/lib.rs` + `src/parser/ast.rs` | 5x | AST changes re-exported through lib | +| `src/main.rs` + `tests/cli_integration_tests.rs` | 4x | CLI changes require test updates | + +## Clippy Configuration + +Extremely strict clippy setup in `Cargo.toml`: + +- `unsafe_code = "forbid"` - No unsafe code allowed +- `warnings = "deny"` - Zero warnings policy +- `unwrap_used = "deny"` - Never use `.unwrap()` in production +- `panic = "deny"` - Never panic in production +- `pedantic`, `nursery`, `cargo` lint groups enabled +- Security-focused lints: `indexing_slicing`, `arithmetic_side_effects`, `string_slice` + +## Error Handling Pattern + +Use `thiserror` with constructor methods: + +```rust +#[derive(Debug, thiserror::Error)] +pub enum ParseError { + #[error("Invalid syntax at line {line}: {message}")] + InvalidSyntax { line: usize, message: String }, +} + +impl ParseError { + #[must_use] + pub fn invalid_syntax(line: usize, message: impl Into) -> Self { + Self::InvalidSyntax { line, message: message.into() } + } +} +``` + +- Every error variant has a named constructor method +- Constructors use `impl Into` for ergonomic creation +- All constructors marked `#[must_use]` +- Error messages include contextual info (line numbers, offsets) + +## Testing Patterns + +### Test Organization + +- **Unit tests**: `#[cfg(test)] mod tests` in each source file +- **Integration tests**: `tests/*.rs` directory +- **Compatibility tests**: `tests/compatibility_tests.rs` against real libmagic test suite +- **Property tests**: `tests/property_tests.rs` using `proptest` +- **Benchmarks**: `benches/*.rs` using `criterion` +- **Snapshots**: `insta` for CLI output snapshots + +### Test File Naming + +- CLI tests: `tests/cli_integration_tests.rs` +- JSON output: `tests/json_integration_test.rs` +- Parser: `tests/parser_integration_tests.rs` +- Properties: `tests/property_tests.rs` + +### Commands + +```bash +cargo nextest run --workspace --no-capture # Standard test run +just ci-check # Full CI parity check +just coverage # Generate coverage report +just test-compatibility # Test against original libmagic +``` + +## Build Script Pattern + +Build logic extracted into testable library module: + +```rust +// src/build_helpers.rs - testable parsing and code generation +#[cfg(any(test, doc))] +pub mod build_helpers { ... } + +// build.rs - minimal, delegates to build_helpers +fn main() { ... } +``` + +## Tooling + +- **mise**: Tool version manager (replaces asdf/rtx) +- **just**: Task runner (justfile) +- **cargo-nextest**: Fast test runner +- **cargo-llvm-cov**: Code coverage +- **insta**: Snapshot testing +- **criterion**: Benchmarking +- **pre-commit**: Git hooks +- **actionlint**: GitHub Actions validation +- **mdbook**: Documentation site +- **prettier**: JSON/YAML formatting +- **mdformat**: Markdown formatting +- **cargo-dist**: Binary distribution + +## Rust Edition & Style + +- **Edition**: 2024 +- **MSRV**: 1.85 +- **rustfmt**: `style_edition = "2024"` +- All derive macros: `Debug, Clone, Serialize, Deserialize, PartialEq, Eq` +- Public API types derive `Serialize` + `Deserialize` +- Doc comments on all public items with examples +- `#[non_exhaustive]` on public enums + +## Safety Rules + +1. Use `.get()` for buffer access, never direct indexing +2. Use `strip_prefix()`/`strip_suffix()` instead of `&str[n..]` +3. No `unwrap()` or `panic!()` in library code +4. Bounds checking on all byte reads +5. Configurable timeouts for evaluation +6. No non-ASCII literals in code/comments diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..a40a31f2 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,124 @@ +name: Benchmarks + +on: + # Run on schedule (weekly on Sunday at 3 AM UTC) + schedule: + - cron: "0 3 * * 0" + # Run on PRs that modify performance-critical code + pull_request: + branches: [main] + paths: + - "src/evaluator/**" + - "src/parser/**" + - "src/io/**" + - "benches/**" + - "Cargo.toml" + # Allow manual triggering + workflow_dispatch: + +env: + CARGO_TERM_COLOR: always + +jobs: + benchmark: + name: Run Benchmarks + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - uses: jdx/mise-action@v3 + with: + install: true + cache: true + github_token: ${{ secrets.GITHUB_TOKEN }} + + - name: Cache cargo registry + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-bench-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-bench- + ${{ runner.os }}-cargo- + + - name: Run benchmarks + run: cargo bench --bench parser_bench --bench evaluation_bench --bench io_bench + + - name: Upload benchmark results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: target/criterion/ + retention-days: 30 + + # Compare benchmarks on PRs + benchmark-compare: + name: Compare Benchmarks + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - uses: jdx/mise-action@v3 + with: + install: true + cache: true + github_token: ${{ secrets.GITHUB_TOKEN }} + + - name: Cache cargo registry + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-bench-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-bench- + ${{ runner.os }}-cargo- + + - name: Install critcmp + run: cargo install critcmp + + - name: Run benchmarks on PR branch + run: | + cargo bench --bench parser_bench --bench evaluation_bench --bench io_bench -- --save-baseline pr + + - name: Checkout main branch + run: git checkout origin/main + + - name: Run benchmarks on main branch + continue-on-error: true + id: main-bench + run: | + cargo bench --bench parser_bench --bench evaluation_bench --bench io_bench -- --save-baseline main + + - name: Compare benchmarks + if: steps.main-bench.outcome == 'success' + run: | + { + echo "## Benchmark Comparison" + echo "" + echo "Comparing PR branch against main:" + echo "" + echo '```' + critcmp main pr || true + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + - name: Skip comparison (no main benchmarks) + if: steps.main-bench.outcome != 'success' + run: | + { + echo "## Benchmark Comparison" + echo "" + echo "Skipped: main branch does not have matching benchmark targets." + echo "Comparison will be available once benchmarks are merged to main." + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/compatibility.yml b/.github/workflows/compatibility.yml index f37f6d76..25b94b83 100644 --- a/.github/workflows/compatibility.yml +++ b/.github/workflows/compatibility.yml @@ -8,19 +8,24 @@ on: schedule: # Run daily at 2 AM UTC to catch any regressions - cron: "0 2 * * *" + workflow_dispatch: + +env: + CARGO_TERM_COLOR: always jobs: compatibility-tests: - name: Compatibility Tests - if: false + name: Compatibility Tests (${{ matrix.os }}) runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v6 + - uses: jdx/mise-action@v3 with: install: true @@ -34,4 +39,14 @@ jobs: run: cargo build --release - name: Run compatibility tests - run: cargo test test_compatibility_with_original_libmagic -- --ignored + run: cargo test test_compatibility_with_original_libmagic -- --ignored --nocapture + + - name: Upload test results on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: compatibility-test-results-${{ matrix.os }} + path: | + target/nextest/ + target/tmp/ + retention-days: 7 diff --git a/.gitignore b/.gitignore index 8e68824b..8525a105 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,4 @@ megalinter-reports/ # megalinter crud undefined/ +.claude.local.md diff --git a/Cargo.toml b/Cargo.toml index b0ea55ec..e49e4a85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,9 +28,6 @@ exclude = [ "AGENTS.md", ] -[package.metadata.cargo-machete] -ignored = ["byteorder"] # Ignoring it since we're going to use it later. - # Workspace lint configuration [workspace.lints.rust] # Security: Forbid unsafe code globally @@ -148,8 +145,9 @@ path = "src/main.rs" [dependencies] byteorder = "1.5.0" cfg-if = "1.0.4" -clap = { version = "4.5.54", features = ["derive"] } +clap = { version = "4.5.57", features = ["derive"] } clap-stdin = "0.8.0" +memchr = "2.7.6" memmap2 = "0.9.9" nom = "8.0.0" serde = { version = "1.0.228", features = ["derive"] } @@ -162,13 +160,28 @@ serde = { version = "1.0.228", features = ["derive"] } thiserror = "2.0.18" [dev-dependencies] -criterion = "0.8.1" -insta = { version = "1.46.1", features = ["json"] } +criterion = "0.8.2" +insta = { version = "1.46.3", features = ["json"] } nix = { version = "0.31.1", features = ["fs"] } -proptest = "1.9.0" -regex = "1.12.2" -temp-env = "0.3.6" -tempfile = "3.24.0" +proptest = "1.10.0" +regex = "1.12.3" +tempfile = "3.25.0" + +[[bench]] +name = "parser_bench" +harness = false + +[[bench]] +name = "evaluation_bench" +harness = false + +[[bench]] +name = "io_bench" +harness = false + +[profile.release] +lto = "thin" +codegen-units = 1 # The profile that 'dist' will build with [profile.dist] diff --git a/benches/evaluation_bench.rs b/benches/evaluation_bench.rs new file mode 100644 index 00000000..6e0e6b58 --- /dev/null +++ b/benches/evaluation_bench.rs @@ -0,0 +1,207 @@ +//! Evaluation benchmarks for libmagic-rs +//! +//! Benchmarks rule evaluation performance against various file types: +//! - ELF binary detection +//! - ZIP archive detection +//! - PDF document detection +//! - Unknown data handling + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use libmagic_rs::{EvaluationConfig, MagicDatabase}; +use std::hint::black_box; + +/// Create a minimal ELF 64-bit header for testing +fn create_elf64_header() -> Vec { + let mut header = vec![0u8; 64]; + + // ELF magic number + header[0] = 0x7f; + header[1] = b'E'; + header[2] = b'L'; + header[3] = b'F'; + + // 64-bit + header[4] = 2; + + // Little endian + header[5] = 1; + + // ELF version + header[6] = 1; + + // System V ABI + header[7] = 0; + + // Type: executable + header[16] = 2; + + // Machine: x86-64 + header[18] = 0x3e; + + // ELF version + header[20] = 1; + + header +} + +/// Create a minimal ZIP header for testing +fn create_zip_header() -> Vec { + vec![ + 0x50, 0x4b, 0x03, 0x04, // PK signature + 0x14, 0x00, // Version needed + 0x00, 0x00, // General purpose flags + 0x08, 0x00, // Compression method (deflate) + 0x00, 0x00, // Last mod time + 0x00, 0x00, // Last mod date + 0x00, 0x00, 0x00, 0x00, // CRC-32 + 0x00, 0x00, 0x00, 0x00, // Compressed size + 0x00, 0x00, 0x00, 0x00, // Uncompressed size + 0x04, 0x00, // Filename length + 0x00, 0x00, // Extra field length + b't', b'e', b's', b't', // Filename + ] +} + +/// Create a minimal PDF header for testing +fn create_pdf_header() -> Vec { + b"%PDF-1.7\n%\xe2\xe3\xcf\xd3\n".to_vec() +} + +/// Create random unknown data for testing fallback behavior +fn create_unknown_data(size: usize) -> Vec { + (0..size).map(|i| (i % 256) as u8).collect() +} + +/// Benchmark evaluation of different file types +fn bench_file_type_detection(c: &mut Criterion) { + let db = MagicDatabase::with_builtin_rules().expect("should load"); + + let mut group = c.benchmark_group("file_type_detection"); + + // ELF detection + let elf_data = create_elf64_header(); + group.throughput(Throughput::Bytes(elf_data.len() as u64)); + group.bench_function("detect_elf", |b| { + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&elf_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + // ZIP detection + let zip_data = create_zip_header(); + group.throughput(Throughput::Bytes(zip_data.len() as u64)); + group.bench_function("detect_zip", |b| { + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&zip_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + // PDF detection + let pdf_data = create_pdf_header(); + group.throughput(Throughput::Bytes(pdf_data.len() as u64)); + group.bench_function("detect_pdf", |b| { + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&pdf_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + // Unknown data (fallback to "data") + let unknown_data = create_unknown_data(64); + group.throughput(Throughput::Bytes(unknown_data.len() as u64)); + group.bench_function("detect_unknown", |b| { + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&unknown_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + group.finish(); +} + +/// Benchmark evaluation with different buffer sizes +fn bench_buffer_sizes(c: &mut Criterion) { + let db = MagicDatabase::with_builtin_rules().expect("should load"); + + let mut group = c.benchmark_group("buffer_sizes"); + + // Test with various buffer sizes + for size in [64, 256, 1024, 4096, 16384, 65536] { + let data = create_unknown_data(size); + group.throughput(Throughput::Bytes(size as u64)); + group.bench_with_input(BenchmarkId::new("unknown", size), &data, |b, data| { + b.iter(|| { + let result = db + .evaluate_buffer(black_box(data)) + .expect("should evaluate"); + black_box(result) + }) + }); + } + + group.finish(); +} + +/// Benchmark evaluation with different configurations +fn bench_evaluation_configs(c: &mut Criterion) { + let elf_data = create_elf64_header(); + + let mut group = c.benchmark_group("evaluation_configs"); + group.throughput(Throughput::Bytes(elf_data.len() as u64)); + + // Default configuration + group.bench_function("config_default", |b| { + let db = MagicDatabase::with_builtin_rules_and_config(EvaluationConfig::default()) + .expect("should load"); + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&elf_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + // Performance configuration + group.bench_function("config_performance", |b| { + let db = MagicDatabase::with_builtin_rules_and_config(EvaluationConfig::performance()) + .expect("should load"); + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&elf_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + // Comprehensive configuration + group.bench_function("config_comprehensive", |b| { + let db = MagicDatabase::with_builtin_rules_and_config(EvaluationConfig::comprehensive()) + .expect("should load"); + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&elf_data)) + .expect("should evaluate"); + black_box(result) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_file_type_detection, + bench_buffer_sizes, + bench_evaluation_configs +); +criterion_main!(benches); diff --git a/benches/io_bench.rs b/benches/io_bench.rs new file mode 100644 index 00000000..81f2b9aa --- /dev/null +++ b/benches/io_bench.rs @@ -0,0 +1,131 @@ +//! I/O benchmarks for libmagic-rs +//! +//! Benchmarks file I/O operations including: +//! - FileBuffer creation +//! - Memory-mapped file access +//! - Buffer slice operations + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use libmagic_rs::io::FileBuffer; +use std::hint::black_box; +use std::io::Write; +use tempfile::NamedTempFile; + +/// Create a temporary file with random data +fn create_temp_file(size: usize) -> NamedTempFile { + let mut file = NamedTempFile::new().expect("create temp file"); + let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + file.write_all(&data).expect("write data"); + file.flush().expect("flush"); + file +} + +/// Benchmark FileBuffer creation from files of various sizes +fn bench_file_buffer_creation(c: &mut Criterion) { + let mut group = c.benchmark_group("file_buffer_creation"); + + for size in [64, 1024, 4096, 65536, 262144, 1048576] { + let temp_file = create_temp_file(size); + let path = temp_file.path(); + + group.throughput(Throughput::Bytes(size as u64)); + group.bench_with_input(BenchmarkId::new("mmap", size), &path, |b, path| { + b.iter(|| { + let buffer = FileBuffer::new(black_box(path)).expect("should create"); + black_box(buffer) + }) + }); + } + + group.finish(); +} + +/// Benchmark slice access patterns on FileBuffer +fn bench_buffer_access(c: &mut Criterion) { + let temp_file = create_temp_file(1_048_576); // 1MB file + let path = temp_file.path(); + let buffer = FileBuffer::new(path).expect("should create"); + let slice = buffer.as_slice(); + + let mut group = c.benchmark_group("buffer_access"); + + // Sequential read + group.bench_function("sequential_read_1mb", |b| { + b.iter(|| { + let mut sum: u64 = 0; + for &byte in slice { + sum = sum.wrapping_add(u64::from(byte)); + } + black_box(sum) + }) + }); + + // Random access pattern (simulating rule evaluation) + group.bench_function("random_access_pattern", |b| { + let offsets = [0, 4, 16, 64, 256, 1024, 4096, 65536, 262144]; + b.iter(|| { + let mut sum: u64 = 0; + for &offset in &offsets { + if offset < slice.len() { + sum = sum.wrapping_add(u64::from(slice[offset])); + } + } + black_box(sum) + }) + }); + + // Slice extraction (common in rule evaluation) + group.bench_function("slice_extraction", |b| { + b.iter(|| { + let slices = [ + slice.get(0..4), + slice.get(4..8), + slice.get(16..32), + slice.get(64..128), + ]; + black_box(slices) + }) + }); + + group.finish(); +} + +/// Benchmark small file handling (common case) +fn bench_small_files(c: &mut Criterion) { + let mut group = c.benchmark_group("small_files"); + + // Common small file sizes + for size in [8, 16, 32, 64, 128, 256, 512] { + let temp_file = create_temp_file(size); + let path = temp_file.path(); + + group.throughput(Throughput::Bytes(size as u64)); + group.bench_with_input( + BenchmarkId::new("create_and_access", size), + &path, + |b, path| { + b.iter(|| { + let buffer = FileBuffer::new(black_box(path)).expect("should create"); + let slice = buffer.as_slice(); + // Simulate typical magic number check + let result = if slice.len() >= 4 { + (slice.get(0..4).map(<[u8]>::to_vec), slice.len()) + } else { + (None, slice.len()) + }; + black_box(result) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_file_buffer_creation, + bench_buffer_access, + bench_small_files +); +criterion_main!(benches); diff --git a/benches/parser_bench.rs b/benches/parser_bench.rs new file mode 100644 index 00000000..3a19359d --- /dev/null +++ b/benches/parser_bench.rs @@ -0,0 +1,108 @@ +//! Parser benchmarks for libmagic-rs +//! +//! Benchmarks magic file parsing performance including: +//! - Single line parsing +//! - Multi-line rule parsing +//! - Built-in rules loading +//! - Directory loading + +use criterion::{Criterion, Throughput, criterion_group, criterion_main}; +use libmagic_rs::MagicDatabase; +use std::hint::black_box; + +/// Benchmark loading built-in magic rules +fn bench_builtin_rules_loading(c: &mut Criterion) { + c.bench_function("load_builtin_rules", |b| { + b.iter(|| { + let db = MagicDatabase::with_builtin_rules().expect("should load"); + black_box(db) + }) + }); +} + +/// Benchmark loading magic file from disk +fn bench_magic_file_loading(c: &mut Criterion) { + let magic_path = "src/builtin_rules.magic"; + + // Skip if file doesn't exist + if !std::path::Path::new(magic_path).exists() { + return; + } + + let mut group = c.benchmark_group("magic_file_loading"); + + // Measure throughput based on file size + let file_size = std::fs::metadata(magic_path).map(|m| m.len()).unwrap_or(0); + group.throughput(Throughput::Bytes(file_size)); + + group.bench_function("load_builtin_magic_file", |b| { + b.iter(|| { + let db = MagicDatabase::load_from_file(black_box(magic_path)).expect("should load"); + black_box(db) + }) + }); + + group.finish(); +} + +/// Benchmark parsing individual magic rule patterns +fn bench_rule_parsing(c: &mut Criterion) { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut group = c.benchmark_group("rule_parsing"); + + // Simple string match rule + let simple_rule = "0 string \"test\" Test file\n"; + group.bench_function("parse_simple_string_rule", |b| { + let mut temp = NamedTempFile::new().expect("temp file"); + temp.write_all(simple_rule.as_bytes()).expect("write temp"); + let path = temp.path().to_owned(); + + b.iter(|| { + let db = MagicDatabase::load_from_file(black_box(&path)).expect("should load"); + black_box(db) + }) + }); + + // Numeric rule with endianness + let numeric_rule = "0 belong 0x7f454c46 ELF executable\n"; + group.bench_function("parse_numeric_rule", |b| { + let mut temp = NamedTempFile::new().expect("temp file"); + temp.write_all(numeric_rule.as_bytes()).expect("write temp"); + let path = temp.path().to_owned(); + + b.iter(|| { + let db = MagicDatabase::load_from_file(black_box(&path)).expect("should load"); + black_box(db) + }) + }); + + // Complex nested rule + let nested_rule = r#"0 string \x7fELF ELF +>4 byte 1 32-bit +>4 byte 2 64-bit +>5 byte 1 LSB +>5 byte 2 MSB +"#; + group.bench_function("parse_nested_rules", |b| { + let mut temp = NamedTempFile::new().expect("temp file"); + temp.write_all(nested_rule.as_bytes()).expect("write temp"); + let path = temp.path().to_owned(); + + b.iter(|| { + let db = MagicDatabase::load_from_file(black_box(&path)).expect("should load"); + black_box(db) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_builtin_rules_loading, + bench_magic_file_loading, + bench_rule_parsing +); +criterion_main!(benches); diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index b2aec50b..9cfb7576 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -229,7 +229,7 @@ let config = EvaluationConfig { ..Default::default() }; -let result = evaluate_rules_with_config(&rules, buffer, config)?; +let result = evaluate_rules_with_config(&rules, buffer, &config)?; ``` ## API Reference @@ -247,7 +247,7 @@ pub fn evaluate_rules( pub fn evaluate_rules_with_config( rules: &[MagicRule], buffer: &[u8], - config: EvaluationConfig, + config: &EvaluationConfig, ) -> Result, EvaluationError>; /// Evaluate a single rule (used internally and for testing) diff --git a/docs/src/getting-started.md b/docs/src/getting-started.md index 37f4264b..d7aa2925 100644 --- a/docs/src/getting-started.md +++ b/docs/src/getting-started.md @@ -256,7 +256,7 @@ let rule = MagicRule { // Evaluate against a buffer let buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes let config = EvaluationConfig::default(); -let matches = evaluate_rules_with_config(&[rule], buffer, config)?; +let matches = evaluate_rules_with_config(&[rule], buffer, &config)?; assert_eq!(matches.len(), 1); assert_eq!(matches[0].message, "ELF magic"); diff --git a/docs/src/testing.md b/docs/src/testing.md index 642fa8bf..d3bd3d8e 100644 --- a/docs/src/testing.md +++ b/docs/src/testing.md @@ -195,44 +195,55 @@ fn test_hierarchical_rules() { } ``` -### Property Tests (Planned) +### Property Tests -Using `proptest` for fuzz-like testing: +Property-based testing using `proptest` is implemented in `tests/property_tests.rs`: + +```bash +# Run property tests +cargo test --test property_tests + +# Run with more test cases +PROPTEST_CASES=1000 cargo test --test property_tests +``` + +The property tests verify: + +- **Serialization roundtrips**: AST types serialize and deserialize correctly +- **Evaluation safety**: Evaluation never panics on arbitrary input +- **Configuration validation**: Invalid configurations are rejected +- **Known pattern detection**: ELF, ZIP, PDF patterns are correctly detected + +Example property test: ```rust use proptest::prelude::*; proptest! { #[test] - fn test_number_parsing_roundtrip(n in any::()) { - let s = n.to_string(); - let (remaining, parsed) = parse_number(&s).unwrap(); - assert_eq!(remaining, ""); - assert_eq!(parsed, n); - } - - #[test] - fn test_offset_parsing_never_panics(s in ".*") { - // Should never panic, even on invalid input - let _ = parse_offset(&s); + fn prop_evaluation_never_panics(buffer in prop::collection::vec(any::(), 0..1024)) { + let db = MagicDatabase::with_builtin_rules().expect("should load"); + // Should not panic regardless of buffer contents + let result = db.evaluate_buffer(&buffer); + prop_assert!(result.is_ok()); } } ``` -### Compatibility Tests (Planned) +### Compatibility Tests -Validate against GNU `file` command: +Compatibility tests validate against GNU `file` command using the canonical test suite from the file project. Test data is located in `third_party/tests/`. -```rust -#[test] -fn test_elf_detection_compatibility() { - let gnu_result = run_gnu_file("third_party/tests/elf64.testfile"); - let our_result = evaluate_file("third_party/tests/elf64.testfile"); +```bash +# Run compatibility tests (requires test files) +cargo test test_compatibility_with_original_libmagic -- --ignored - assert_eq!(extract_file_type(&gnu_result), our_result.description); -} +# Or use the just recipe +just test-compatibility ``` +The compatibility workflow runs automatically in CI on pushes to main/develop. + ## Test Utilities and Helpers ### Common Test Patterns @@ -507,21 +518,44 @@ cargo insta accept 3. **Test Cross-Platform**: Verify tests pass on both Windows and Unix 4. **Keep Snapshots Small**: Use focused tests for specific CLI features -## Future Testing Plans +## Benchmarks -### Integration Testing +Performance benchmarks are implemented using Criterion in the `benches/` directory: -- **Complete Workflow Tests**: End-to-end magic file parsing and evaluation -- **File Format Tests**: Comprehensive testing against known file formats -- **Error Recovery Tests**: Graceful handling of malformed inputs +```bash +# Run all benchmarks +cargo bench + +# Run specific benchmark group +cargo bench parser +cargo bench evaluation +cargo bench io + +# Generate HTML benchmark report +cargo bench -- --noplot +``` -### Compatibility Testing +### Available Benchmarks -- **GNU file Compatibility**: Validate results against original implementation -- **Magic File Compatibility**: Test with real-world magic databases -- **Performance Parity**: Ensure comparable performance to libmagic +| Benchmark | Description | +|-----------|-------------| +| `parser_bench` | Magic file parsing performance | +| `evaluation_bench` | Rule evaluation against various file types | +| `io_bench` | Memory-mapped I/O operations | + +### Benchmark CI + +Benchmarks run automatically: + +- **Weekly**: Scheduled runs on Sunday at 3 AM UTC +- **On PR**: When performance-critical code changes (src/evaluator, src/parser, src/io, benches) +- **Manual**: Via workflow_dispatch + +The CI compares PR benchmarks against the main branch and reports regressions. + +## Future Testing Plans -### Fuzzing Integration +### Fuzzing Integration (Phase 2) - **Parser Fuzzing**: Use cargo-fuzz for parser robustness - **Evaluator Fuzzing**: Test evaluation engine with malformed files diff --git a/justfile b/justfile index d251ee81..ec45e614 100644 --- a/justfile +++ b/justfile @@ -1,8 +1,8 @@ # Cross-platform justfile using OS annotations # Windows uses PowerShell, Unix uses bash -set shell := ["bash", "-cu"] -set windows-shell := ["powershell", "-NoProfile", "-Command"] +set windows-shell := ["powershell.exe", "-c"] +set shell := ["bash", "-c"] set dotenv-load := true set ignore-comments := true @@ -90,7 +90,7 @@ lint: lint-rust lint-actions lint-docs lint-justfile # Individual lint recipes lint-actions: - @{{ mise_exec }} actionlint .github/workflows/audit.yml .github/workflows/ci.yml .github/workflows/codeql.yml .github/workflows/compatibility.yml .github/workflows/copilot-setup-steps.yml .github/workflows/docs.yml .github/workflows/release.yml .github/workflows/security.yml + @{{ mise_exec }} actionlint .github/workflows/audit.yml .github/workflows/benchmarks.yml .github/workflows/ci.yml .github/workflows/codeql.yml .github/workflows/compatibility.yml .github/workflows/copilot-setup-steps.yml .github/workflows/docs.yml .github/workflows/release.yml .github/workflows/security.yml lint-docs: @{{ mise_exec }} markdownlint-cli2 docs/**/*.md README.md @@ -197,6 +197,30 @@ coverage: coverage-check: @{{ mise_exec }} just _coverage --fail-under-lines 9.7 +# Generate HTML coverage report for local viewing +[unix] +coverage-report: + #!/usr/bin/env bash + set -euo pipefail + rm -rf target/llvm-cov-target + RUSTFLAGS="--cfg coverage" {{ mise_exec }} cargo llvm-cov --workspace --html --open + +[windows] +coverage-report: + $env:RUSTFLAGS = "--cfg coverage"; {{ mise_exec }} cargo llvm-cov --workspace --html --open + +# Show coverage summary by file +[unix] +coverage-summary: + #!/usr/bin/env bash + set -euo pipefail + rm -rf target/llvm-cov-target + RUSTFLAGS="--cfg coverage" {{ mise_exec }} cargo llvm-cov --workspace + +[windows] +coverage-summary: + $env:RUSTFLAGS = "--cfg coverage"; {{ mise_exec }} cargo llvm-cov --workspace + # Full local CI parity check ci-check: pre-commit-run fmt-check lint-rust lint-rust-min test-ci build-release audit coverage-check dist-plan diff --git a/mise.toml b/mise.toml index 6b351feb..e082f365 100644 --- a/mise.toml +++ b/mise.toml @@ -23,6 +23,8 @@ rust = { version = "1.91.0", components = "llvm-tools,car "cargo:cargo-cyclonedx" = "0.5.7" "pipx:mdformat" = { version = "0.7.21", uvx_args = "--with mdformat-gfm --with mdformat-frontmatter --with mdformat-footnote --with mdformat-simple-breaks --with mdformat-gfm-alerts --with mdformat-toc --with mdformat-wikilink --with mdformat-tables" } prettier = "3.8.1" -actionlint = "1.7.10" -lychee = "0.22.0" -markdownlint-cli2 = "0.20.0" +actionlint = "1.7.10" +lychee = "0.22.0" +markdownlint-cli2 = "0.20.0" +pre-commit = "latest" +"cargo:cargo-machete" = "0.9.1" diff --git a/src/error.rs b/src/error.rs index 180ee1a8..5d1bdee8 100644 --- a/src/error.rs +++ b/src/error.rs @@ -27,6 +27,41 @@ pub enum LibmagicError { /// The timeout duration in milliseconds timeout_ms: u64, }, + + /// Invalid configuration parameter. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::LibmagicError; + /// + /// let error = LibmagicError::ConfigError { + /// reason: "invalid timeout value".to_string(), + /// }; + /// assert!(matches!(error, LibmagicError::ConfigError { .. })); + /// ``` + #[error("Configuration error: {reason}")] + ConfigError { + /// Description of the configuration issue + reason: String, + }, + + /// File I/O error with structured context (path, operation). + /// + /// Unlike `IoError` which wraps a generic `std::io::Error`, this variant + /// preserves the structured error information from file I/O operations + /// (e.g., file path, operation type). + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::LibmagicError; + /// + /// let error = LibmagicError::FileError("failed to read /path/to/file".to_string()); + /// assert!(matches!(error, LibmagicError::FileError(_))); + /// ``` + #[error("File error: {0}")] + FileError(String), } /// Errors that can occur during magic file parsing. diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index 1f1c2ccd..56fd9f67 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -6,9 +6,6 @@ use crate::parser::ast::MagicRule; use crate::{EvaluationConfig, LibmagicError}; use serde::{Deserialize, Serialize}; -use std::sync::{Arc, mpsc}; -use std::thread; -use std::time::Duration; #[cfg(test)] use crate::parser::ast::{Endianness, OffsetSpec, Operator, TypeKind, Value}; @@ -260,8 +257,9 @@ impl MatchResult { /// /// # Returns /// -/// Returns `Ok(true)` if the rule matches, `Ok(false)` if it doesn't match, -/// or `Err(LibmagicError)` if evaluation fails due to buffer access issues or other errors. +/// Returns `Ok(Some((offset, value)))` if the rule matches (with the resolved offset and +/// read value), `Ok(None)` if it doesn't match, or `Err(LibmagicError)` if evaluation +/// fails due to buffer access issues or other errors. /// /// # Examples /// @@ -283,18 +281,21 @@ impl MatchResult { /// /// let elf_buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes /// let result = evaluate_single_rule(&rule, elf_buffer).unwrap(); -/// assert!(result); // Should match +/// assert!(result.is_some()); // Should match /// /// let non_elf_buffer = &[0x50, 0x4b, 0x03, 0x04]; // ZIP magic bytes /// let result = evaluate_single_rule(&rule, non_elf_buffer).unwrap(); -/// assert!(!result); // Should not match +/// assert!(result.is_none()); // Should not match /// ``` /// /// # Errors /// /// * `LibmagicError::EvaluationError` - If offset resolution fails, buffer access is out of bounds, /// or type interpretation fails -pub fn evaluate_single_rule(rule: &MagicRule, buffer: &[u8]) -> Result { +pub fn evaluate_single_rule( + rule: &MagicRule, + buffer: &[u8], +) -> Result, LibmagicError> { // Step 1: Resolve the offset specification to an absolute position let absolute_offset = offset::resolve_offset(&rule.offset, buffer)?; @@ -303,29 +304,10 @@ pub fn evaluate_single_rule(rule: &MagicRule, buffer: &[u8]) -> Result bool { - match error { - LibmagicError::EvaluationError(eval_error) => match eval_error { - crate::error::EvaluationError::BufferOverrun { .. } => true, - crate::error::EvaluationError::TypeReadError(type_error) => { - matches!( - type_error, - crate::evaluator::types::TypeReadError::BufferOverrun { .. } - ) - } - _ => false, - }, - _ => false, - } + Ok( + operators::apply_operator(&rule.op, &read_value, &rule.value) + .then_some((absolute_offset, read_value)), + ) } /// Evaluate a list of magic rules against a file buffer with hierarchical processing @@ -407,65 +389,31 @@ pub fn evaluate_rules( buffer: &[u8], context: &mut EvaluationContext, ) -> Result, LibmagicError> { - let mut matches = Vec::with_capacity(rules.len()); + let mut matches = Vec::with_capacity(8); let start_time = std::time::Instant::now(); + let mut rule_count = 0u32; for rule in rules { - // Check timeout if configured - if let Some(timeout_ms) = context.timeout_ms() { - if start_time.elapsed().as_millis() > u128::from(timeout_ms) { - return Err(LibmagicError::Timeout { timeout_ms }); + // Check timeout periodically (every 16 rules) to reduce syscall overhead + rule_count = rule_count.wrapping_add(1); + if rule_count.trailing_zeros() >= 4 { + if let Some(timeout_ms) = context.timeout_ms() { + if start_time.elapsed().as_millis() > u128::from(timeout_ms) { + return Err(LibmagicError::Timeout { timeout_ms }); + } } } // Evaluate the current rule with graceful error handling - let rule_matches = match evaluate_single_rule(rule, buffer) { - Ok(matches) => matches, - Err(e) => { - // Skip buffer overruns silently (expected for short buffers) - // Log other errors and continue with next rule (graceful degradation) - if !is_buffer_overrun_error(&e) { - eprintln!( - "Warning: Skipping rule '{}' due to error: {}", - rule.message, e - ); - } + let match_data = match evaluate_single_rule(rule, buffer) { + Ok(data) => data, + Err(_e) => { + // Skip rules with evaluation errors (graceful degradation) continue; } }; - if rule_matches { - // Create match result for this rule with graceful error handling - let absolute_offset = match offset::resolve_offset(&rule.offset, buffer) { - Ok(offset) => offset, - Err(e) => { - // Skip buffer overruns silently (expected for short buffers) - if !is_buffer_overrun_error(&e) { - eprintln!( - "Warning: Skipping rule '{}' due to offset resolution error: {}", - rule.message, e - ); - } - continue; - } - }; - - let read_value = match types::read_typed_value(buffer, absolute_offset, &rule.typ) { - Ok(value) => value, - Err(e) => { - // Skip buffer overruns silently (expected for short buffers) - let eval_error: crate::error::EvaluationError = e.into(); - let lib_error: LibmagicError = eval_error.into(); - if !is_buffer_overrun_error(&lib_error) { - eprintln!( - "Warning: Skipping rule '{}' due to type reading error: {}", - rule.message, lib_error - ); - } - continue; - } - }; - + if let Some((absolute_offset, read_value)) = match_data { let match_result = MatchResult { message: rule.message.clone(), offset: absolute_offset, @@ -503,15 +451,8 @@ pub fn evaluate_rules( }, )); } - Err(e) => { - // Skip buffer overruns silently (expected for short buffers) - // Other errors in child evaluation are logged but don't stop parent evaluation - if !is_buffer_overrun_error(&e) { - eprintln!( - "Warning: Error evaluating children of rule '{}': {}", - rule.message, e - ); - } + Err(_e) => { + // Non-critical child evaluation errors are skipped (graceful degradation) } } @@ -567,7 +508,7 @@ pub fn evaluate_rules( /// let buffer = &[0x7f, 0x45, 0x4c, 0x46]; /// let config = EvaluationConfig::default(); /// -/// let matches = evaluate_rules_with_config(&rules, buffer, config).unwrap(); +/// let matches = evaluate_rules_with_config(&rules, buffer, &config).unwrap(); /// assert_eq!(matches.len(), 1); /// assert_eq!(matches[0].message, "ELF magic"); /// ``` @@ -579,52 +520,10 @@ pub fn evaluate_rules( pub fn evaluate_rules_with_config( rules: &[MagicRule], buffer: &[u8], - config: EvaluationConfig, + config: &EvaluationConfig, ) -> Result, LibmagicError> { - // If no timeout is configured, evaluate normally - let Some(timeout_ms) = config.timeout_ms else { - let mut context = EvaluationContext::new(config); - return evaluate_rules(rules, buffer, &mut context); - }; - - // With timeout: spawn evaluation in a thread and wait with timeout - // Use Arc to share data without cloning the potentially large rules/buffer - let rules_arc = Arc::new(rules.to_vec()); - let buffer_arc = Arc::new(buffer.to_vec()); - let config_clone = config.clone(); - - let (tx, rx) = mpsc::channel(); - - // Clone Arcs for the thread (cheap reference count increment) - let rules_thread = Arc::clone(&rules_arc); - let buffer_thread = Arc::clone(&buffer_arc); - - // Spawn evaluation in separate thread - // Note: The thread will run to completion even if we return early on timeout. - // True cancellation would require cooperative cancellation (checking a flag - // periodically during evaluation) or running in a separate process. - // For most use cases, the thread will complete quickly or the process will - // exit, cleaning up the thread automatically. - thread::spawn(move || { - let mut context = EvaluationContext::new(config_clone); - let result = evaluate_rules(&rules_thread, &buffer_thread, &mut context); - // Send result; ignore error if receiver was dropped (timeout occurred) - let _ = tx.send(result); - }); - - // Wait for result with timeout - match rx.recv_timeout(Duration::from_millis(timeout_ms)) { - Ok(result) => result, - Err(mpsc::RecvTimeoutError::Timeout) => Err(LibmagicError::Timeout { timeout_ms }), - Err(mpsc::RecvTimeoutError::Disconnected) => { - // Thread panicked or dropped sender - Err(LibmagicError::EvaluationError( - crate::error::EvaluationError::internal_error( - "Evaluation thread terminated unexpectedly", - ), - )) - } - } + let mut context = EvaluationContext::new(config.clone()); + evaluate_rules(rules, buffer, &mut context) } #[cfg(test)] @@ -647,7 +546,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -665,7 +564,7 @@ mod tests { let buffer = &[0x50, 0x4b, 0x03, 0x04]; // ZIP magic bytes let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(!result); + assert!(result.is_none()); } #[test] @@ -683,7 +582,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // 0x7f != 0x00 + assert!(result.is_some()); // 0x7f != 0x00 } #[test] @@ -701,7 +600,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(!result); // 0x7f == 0x7f, so NotEqual is false + assert!(result.is_none()); // 0x7f == 0x7f, so NotEqual is false } #[test] @@ -719,7 +618,7 @@ mod tests { let buffer = &[0xff, 0x45, 0x4c, 0x46]; // 0xff has high bit set let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // 0xff & 0x80 = 0x80 (non-zero) + assert!(result.is_some()); // 0xff & 0x80 = 0x80 (non-zero) } #[test] @@ -737,7 +636,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; // 0x7f has high bit clear let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(!result); // 0x7f & 0x80 = 0x00 (zero) + assert!(result.is_none()); // 0x7f & 0x80 = 0x00 (zero) } #[test] @@ -758,7 +657,7 @@ mod tests { let buffer = &[0x34, 0x12, 0x56, 0x78]; // 0x1234 in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -779,7 +678,7 @@ mod tests { let buffer = &[0x12, 0x34, 0x56, 0x78]; // 0x1234 in big-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -800,7 +699,7 @@ mod tests { let buffer = &[0xff, 0x7f, 0x00, 0x00]; // 0x7fff in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -821,7 +720,7 @@ mod tests { let buffer = &[0xff, 0xff, 0x00, 0x00]; // 0xffff in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -842,7 +741,7 @@ mod tests { let buffer = &[0x78, 0x56, 0x34, 0x12, 0x00]; // 0x12345678 in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -863,7 +762,7 @@ mod tests { let buffer = &[0x12, 0x34, 0x56, 0x78, 0x00]; // 0x12345678 in big-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -884,7 +783,7 @@ mod tests { let buffer = &[0xff, 0xff, 0xff, 0x7f, 0x00]; // 0x7fffffff in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -905,7 +804,7 @@ mod tests { let buffer = &[0xff, 0xff, 0xff, 0xff, 0x00]; // 0xffffffff in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -923,7 +822,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // buffer[2] == 0x4c + assert!(result.is_some()); // buffer[2] == 0x4c } #[test] @@ -941,7 +840,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // Last byte is 0x46 + assert!(result.is_some()); // Last byte is 0x46 } #[test] @@ -959,7 +858,7 @@ mod tests { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // buffer[2] == 0x4c (second to last) + assert!(result.is_some()); // buffer[2] == 0x4c (second to last) } #[test] @@ -1090,7 +989,7 @@ mod tests { let result = evaluate_single_rule(&rule, buffer); assert!(result.is_ok()); let matches = result.unwrap(); - assert!(matches); // Should match + assert!(matches.is_some()); // Should match // Test non-matching string let rule_no_match = MagicRule { @@ -1107,13 +1006,13 @@ mod tests { let result = evaluate_single_rule(&rule_no_match, buffer); assert!(result.is_ok()); let matches = result.unwrap(); - assert!(!matches); // Should not match + assert!(matches.is_none()); // Should not match } } #[test] fn test_evaluate_single_rule_cross_type_comparison() { - // Test that cross-type comparisons work correctly (should not match) + // Test that cross-type integer comparisons use coercion (Uint(42) == Int(42)) let rule = MagicRule { offset: OffsetSpec::Absolute(0), typ: TypeKind::Byte, @@ -1127,7 +1026,7 @@ fn test_evaluate_single_rule_cross_type_comparison() { let buffer = &[42]; // Byte value 42 let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(!result); // Should not match due to type mismatch (Uint vs Int) + assert!(result.is_some()); // Should match via cross-type integer coercion } #[test] @@ -1148,7 +1047,7 @@ fn test_evaluate_single_rule_bitwise_and_with_shorts() { let buffer = &[0x34, 0x12]; // 0x1234 in little-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // 0x1234 & 0xff00 = 0x1200 (non-zero) + assert!(result.is_some()); // 0x1234 & 0xff00 = 0x1200 (non-zero) } #[test] @@ -1169,7 +1068,7 @@ fn test_evaluate_single_rule_bitwise_and_with_longs() { let buffer = &[0x12, 0x34, 0x56, 0x78]; // 0x12345678 in big-endian let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // 0x12345678 & 0xffff0000 = 0x12340000 (non-zero) + assert!(result.is_some()); // 0x12345678 & 0xffff0000 = 0x12340000 (non-zero) } #[test] @@ -1191,11 +1090,11 @@ fn test_evaluate_single_rule_comprehensive_elf_check() { let elf_buffer = &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01]; // ELF64 header start let result = evaluate_single_rule(&rule, elf_buffer).unwrap(); - assert!(result); + assert!(result.is_some()); let non_elf_buffer = &[0x50, 0x4b, 0x03, 0x04, 0x14, 0x00]; // ZIP header let result = evaluate_single_rule(&rule, non_elf_buffer).unwrap(); - assert!(!result); + assert!(result.is_none()); } #[test] @@ -1216,7 +1115,7 @@ fn test_evaluate_single_rule_native_endianness() { let buffer = &[0x01, 0x02]; // Non-zero bytes let result = evaluate_single_rule(&rule, buffer).unwrap(); - assert!(result); // Should be non-zero regardless of endianness + assert!(result.is_some()); // Should be non-zero regardless of endianness } #[test] @@ -1234,7 +1133,7 @@ fn test_evaluate_single_rule_all_operators() { level: 0, strength_modifier: None, }; - assert!(evaluate_single_rule(&equal_rule, buffer).unwrap()); + assert!(evaluate_single_rule(&equal_rule, buffer).unwrap().is_some()); // Test NotEqual operator let not_equal_rule = MagicRule { @@ -1247,7 +1146,11 @@ fn test_evaluate_single_rule_all_operators() { level: 0, strength_modifier: None, }; - assert!(evaluate_single_rule(¬_equal_rule, buffer).unwrap()); // 0x00 != 0x42 + assert!( + evaluate_single_rule(¬_equal_rule, buffer) + .unwrap() + .is_some() + ); // 0x00 != 0x42 // Test BitwiseAnd operator let bitwise_and_rule = MagicRule { @@ -1260,7 +1163,11 @@ fn test_evaluate_single_rule_all_operators() { level: 0, strength_modifier: None, }; - assert!(evaluate_single_rule(&bitwise_and_rule, buffer).unwrap()); // 0x80 & 0x80 = 0x80 + assert!( + evaluate_single_rule(&bitwise_and_rule, buffer) + .unwrap() + .is_some() + ); // 0x80 & 0x80 = 0x80 } #[test] @@ -1282,7 +1189,7 @@ fn test_evaluate_single_rule_edge_case_values() { let max_buffer = &[0xff, 0xff, 0xff, 0xff]; let result = evaluate_single_rule(&max_uint_rule, max_buffer).unwrap(); - assert!(result); + assert!(result.is_some()); // Test with minimum signed value let min_int_rule = MagicRule { @@ -1301,7 +1208,7 @@ fn test_evaluate_single_rule_edge_case_values() { let min_buffer = &[0x00, 0x00, 0x00, 0x80]; // 0x80000000 in little-endian let result = evaluate_single_rule(&min_int_rule, min_buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } #[test] @@ -1320,7 +1227,7 @@ fn test_evaluate_single_rule_various_buffer_sizes() { let single_buffer = &[0xaa]; let result = evaluate_single_rule(&single_byte_rule, single_buffer).unwrap(); - assert!(result); + assert!(result.is_some()); // Test with large buffer #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] @@ -1337,7 +1244,7 @@ fn test_evaluate_single_rule_various_buffer_sizes() { }; let result = evaluate_single_rule(&large_rule, &large_buffer).unwrap(); - assert!(result); + assert!(result.is_some()); } // Tests for EvaluationContext @@ -2116,7 +2023,7 @@ fn test_evaluate_rules_with_config_convenience() { let buffer = &[0x7f, 0x45, 0x4c, 0x46]; let config = EvaluationConfig::default(); - let matches = evaluate_rules_with_config(&rules, buffer, config).unwrap(); + let matches = evaluate_rules_with_config(&rules, buffer, &config).unwrap(); assert_eq!(matches.len(), 1); assert_eq!(matches[0].message, "ELF magic"); } diff --git a/src/evaluator/operators.rs b/src/evaluator/operators.rs index 93b5a8c6..e307dcc1 100644 --- a/src/evaluator/operators.rs +++ b/src/evaluator/operators.rs @@ -32,8 +32,8 @@ use crate::parser::ast::{Operator, Value}; /// // Same type, different value /// assert!(!apply_equal(&Value::Uint(42), &Value::Uint(24))); /// -/// // Different types, same numeric value -/// assert!(!apply_equal(&Value::Uint(42), &Value::Int(42))); +/// // Cross-type integer coercion +/// assert!(apply_equal(&Value::Uint(42), &Value::Int(42))); /// /// // String comparison /// assert!(apply_equal( @@ -56,7 +56,11 @@ pub fn apply_equal(left: &Value, right: &Value) -> bool { // String comparison (Value::String(a), Value::String(b)) => a == b, - // Different types are never equal + // Cross-type integer coercion (safe via i128 to avoid overflow) + (Value::Uint(a), Value::Int(b)) => i128::from(*a) == i128::from(*b), + (Value::Int(a), Value::Uint(b)) => i128::from(*a) == i128::from(*b), + + // Different non-integer types are never equal _ => false, } } @@ -88,8 +92,8 @@ pub fn apply_equal(left: &Value, right: &Value) -> bool { /// // Same type, same value /// assert!(!apply_not_equal(&Value::Uint(42), &Value::Uint(42))); /// -/// // Different types (always not equal) -/// assert!(apply_not_equal(&Value::Uint(42), &Value::Int(42))); +/// // Cross-type integers with same numeric value (equal via coercion) +/// assert!(!apply_not_equal(&Value::Uint(42), &Value::Int(42))); /// /// // String comparison /// assert!(apply_not_equal( @@ -201,8 +205,8 @@ pub fn apply_bitwise_and(left: &Value, right: &Value) -> bool { /// &Value::Uint(0x0F) /// )); /// -/// // Cross-type comparisons -/// assert!(!apply_operator( +/// // Cross-type integer coercion +/// assert!(apply_operator( /// &Operator::Equal, /// &Value::Uint(42), /// &Value::Int(42) @@ -219,13 +223,9 @@ pub fn apply_operator(operator: &Operator, left: &Value, right: &Value) -> bool let masked_left = match left { Value::Uint(val) => Value::Uint(val & mask), Value::Int(val) => { - // Convert u64 mask to i64 safely - let i64_mask = if i64::try_from(*mask).is_ok() { - i64::try_from(*mask).unwrap_or(0) - } else { - // For values > i64::MAX, use bitwise representation - i64::from_ne_bytes(mask.to_ne_bytes()) - }; + // Convert u64 mask to i64, using bitwise representation for values > i64::MAX + let i64_mask = i64::try_from(*mask) + .unwrap_or_else(|_| i64::from_ne_bytes(mask.to_ne_bytes())); Value::Int(val & i64_mask) } _ => return false, // Can't apply bitwise operations to non-numeric values @@ -398,12 +398,23 @@ mod tests { // Cross-type comparison tests (should all return false) #[test] fn test_apply_equal_uint_vs_int() { + // Same numeric value across types should match let left = Value::Uint(42); let right = Value::Int(42); - assert!(!apply_equal(&left, &right)); + assert!(apply_equal(&left, &right)); let left = Value::Uint(0); let right = Value::Int(0); + assert!(apply_equal(&left, &right)); + + // Negative Int cannot equal Uint + let left = Value::Uint(42); + let right = Value::Int(-42); + assert!(!apply_equal(&left, &right)); + + // Large Uint that doesn't fit in i64 cannot equal Int + let left = Value::Uint(u64::MAX); + let right = Value::Int(-1); assert!(!apply_equal(&left, &right)); } @@ -451,14 +462,23 @@ mod tests { Value::String("42".to_string()), ]; - // Test that no cross-type comparisons return true + // Test cross-type comparisons for (i, left) in values.iter().enumerate() { for (j, right) in values.iter().enumerate() { if i != j { - assert!( - !apply_equal(left, right), - "Cross-type comparison should be false: {left:?} vs {right:?}" - ); + let result = apply_equal(left, right); + // Uint(42) and Int(42) should be equal (cross-type coercion) + if (i <= 1) && (j <= 1) { + assert!( + result, + "Integer cross-type comparison should be true: {left:?} vs {right:?}" + ); + } else { + assert!( + !result, + "Non-integer cross-type comparison should be false: {left:?} vs {right:?}" + ); + } } } } @@ -528,6 +548,12 @@ mod tests { assert!(apply_equal(&max_signed, &max_signed)); assert!(apply_equal(&min_int, &min_int)); + // Cross-type edge cases + // u64::MAX != -1 in i64 (different mathematical values) + assert!(!apply_equal(&max_unsigned, &Value::Int(-1))); + // i64::MAX can be represented as u64, so should match + assert!(apply_equal(&Value::Uint(i64::MAX as u64), &max_signed)); + // Test with empty collections let empty_bytes = Value::Bytes(vec![]); let empty_string = Value::String(String::new()); @@ -709,12 +735,18 @@ mod tests { // Cross-type comparison tests for not_equal (should all return true) #[test] fn test_apply_not_equal_uint_vs_int() { + // Same numeric value across types should be equal (not not-equal) let left = Value::Uint(42); let right = Value::Int(42); - assert!(apply_not_equal(&left, &right)); + assert!(!apply_not_equal(&left, &right)); let left = Value::Uint(0); let right = Value::Int(0); + assert!(!apply_not_equal(&left, &right)); + + // Different numeric values should be not-equal + let left = Value::Uint(42); + let right = Value::Int(-42); assert!(apply_not_equal(&left, &right)); } @@ -762,14 +794,23 @@ mod tests { Value::String("42".to_string()), ]; - // Test that all cross-type comparisons return true for not_equal + // Test cross-type comparisons for not_equal for (i, left) in values.iter().enumerate() { for (j, right) in values.iter().enumerate() { if i != j { - assert!( - apply_not_equal(left, right), - "Cross-type comparison should be true for not_equal: {left:?} vs {right:?}" - ); + let result = apply_not_equal(left, right); + // Uint(42) and Int(42) should be equal, so not_equal is false + if (i <= 1) && (j <= 1) { + assert!( + !result, + "Integer cross-type not_equal should be false: {left:?} vs {right:?}" + ); + } else { + assert!( + result, + "Non-integer cross-type not_equal should be true: {left:?} vs {right:?}" + ); + } } } } @@ -1152,8 +1193,8 @@ mod tests { &Value::String("world".to_string()) )); - // Cross-type should be false - assert!(!apply_operator( + // Cross-type integer coercion + assert!(apply_operator( &Operator::Equal, &Value::Uint(42), &Value::Int(42) @@ -1186,8 +1227,8 @@ mod tests { &Value::String("world".to_string()) )); - // Cross-type should be true (not equal) - assert!(apply_operator( + // Cross-type integer coercion: same value, so not-equal is false + assert!(!apply_operator( &Operator::NotEqual, &Value::Uint(42), &Value::Int(42) @@ -1302,20 +1343,19 @@ mod tests { Value::String("world".to_string()), ), (Value::Bytes(vec![1, 2, 3]), Value::Bytes(vec![4, 5, 6])), - // Different types - (Value::Uint(42), Value::Int(42)), + // Different types (non-coercible) (Value::Uint(42), Value::String("42".to_string())), (Value::Int(42), Value::Bytes(vec![42])), ]; for (left, right) in test_cases { - // Equal should always be false for different values + // Equal should always be false for truly different values assert!( !apply_operator(&Operator::Equal, &left, &right), "Equal should be false for different values: {left:?} == {right:?}" ); - // NotEqual should always be true for different values + // NotEqual should always be true for truly different values assert!( apply_operator(&Operator::NotEqual, &left, &right), "NotEqual should be true for different values: {left:?} != {right:?}" @@ -1514,11 +1554,11 @@ mod tests { &zero_signed, &Value::Int(0xFF) )); - assert!(apply_operator( + assert!(!apply_operator( &Operator::NotEqual, &zero_uint, &zero_signed - )); // Different types + )); // Cross-type integer coercion: 0 == 0 } #[test] diff --git a/src/evaluator/types.rs b/src/evaluator/types.rs index c18cd087..448e2752 100644 --- a/src/evaluator/types.rs +++ b/src/evaluator/types.rs @@ -72,18 +72,9 @@ pub enum TypeReadError { /// Returns `TypeReadError::BufferOverrun` if the offset is greater than or equal to /// the buffer length. pub fn read_byte(buffer: &[u8], offset: usize) -> Result { - // Additional security check: ensure offset doesn't cause integer overflow - if offset >= buffer.len() { - return Err(TypeReadError::BufferOverrun { - offset, - buffer_len: buffer.len(), - }); - } - buffer .get(offset) - .copied() - .map(|byte| Value::Uint(u64::from(byte))) + .map(|&byte| Value::Uint(u64::from(byte))) .ok_or(TypeReadError::BufferOverrun { offset, buffer_len: buffer.len(), @@ -294,20 +285,14 @@ pub fn read_string( // Get the slice starting from offset let remaining_buffer = &buffer[offset..]; - // Determine the actual length to read + // Determine the actual length to read (uses SIMD-accelerated memchr for null scan) let read_length = if let Some(max_len) = max_length { // Find null terminator within max_length, or use max_length if no null found let search_len = std::cmp::min(max_len, remaining_buffer.len()); - remaining_buffer[..search_len] - .iter() - .position(|&b| b == 0) - .unwrap_or(search_len) + memchr::memchr(0, &remaining_buffer[..search_len]).unwrap_or(search_len) } else { // Find null terminator in entire remaining buffer - remaining_buffer - .iter() - .position(|&b| b == 0) - .unwrap_or(remaining_buffer.len()) + memchr::memchr(0, remaining_buffer).unwrap_or(remaining_buffer.len()) }; // Extract the string bytes (excluding null terminator) diff --git a/src/lib.rs b/src/lib.rs index b56e4675..5e77f34a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -133,10 +133,10 @@ pub use error::{EvaluationError, LibmagicError, ParseError}; /// Result type for library operations pub type Result = std::result::Result; -// Implement From for LibmagicError impl From for LibmagicError { fn from(err: crate::io::IoError) -> Self { - LibmagicError::IoError(std::io::Error::other(err.to_string())) + // Preserve the structured error message (includes path and operation context) + LibmagicError::FileError(err.to_string()) } } @@ -338,19 +338,17 @@ impl EvaluationConfig { const MAX_SAFE_RECURSION_DEPTH: u32 = 1000; if self.max_recursion_depth == 0 { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - "max_recursion_depth must be greater than 0", - ))); + return Err(LibmagicError::ConfigError { + reason: "max_recursion_depth must be greater than 0".to_string(), + }); } if self.max_recursion_depth > MAX_SAFE_RECURSION_DEPTH { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - format!( + return Err(LibmagicError::ConfigError { + reason: format!( "max_recursion_depth must not exceed {MAX_SAFE_RECURSION_DEPTH} to prevent stack overflow" ), - ))); + }); } Ok(()) @@ -361,19 +359,17 @@ impl EvaluationConfig { const MAX_SAFE_STRING_LENGTH: usize = 1_048_576; // 1MB if self.max_string_length == 0 { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - "max_string_length must be greater than 0", - ))); + return Err(LibmagicError::ConfigError { + reason: "max_string_length must be greater than 0".to_string(), + }); } if self.max_string_length > MAX_SAFE_STRING_LENGTH { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - format!( + return Err(LibmagicError::ConfigError { + reason: format!( "max_string_length must not exceed {MAX_SAFE_STRING_LENGTH} bytes to prevent memory exhaustion" ), - ))); + }); } Ok(()) @@ -385,19 +381,17 @@ impl EvaluationConfig { if let Some(timeout) = self.timeout_ms { if timeout == 0 { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - "timeout_ms must be greater than 0 if specified", - ))); + return Err(LibmagicError::ConfigError { + reason: "timeout_ms must be greater than 0 if specified".to_string(), + }); } if timeout > MAX_SAFE_TIMEOUT_MS { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - format!( + return Err(LibmagicError::ConfigError { + reason: format!( "timeout_ms must not exceed {MAX_SAFE_TIMEOUT_MS} (5 minutes) to prevent denial of service" ), - ))); + }); } } @@ -412,12 +406,11 @@ impl EvaluationConfig { if self.max_recursion_depth > HIGH_RECURSION_THRESHOLD && self.max_string_length > LARGE_STRING_THRESHOLD { - return Err(LibmagicError::ParseError(ParseError::invalid_syntax( - 0, - format!( + return Err(LibmagicError::ConfigError { + reason: format!( "High recursion depth (>{HIGH_RECURSION_THRESHOLD}) combined with large string length (>{LARGE_STRING_THRESHOLD}) may cause resource exhaustion" ), - ))); + }); } Ok(()) @@ -432,6 +425,8 @@ pub struct MagicDatabase { /// Optional path to the source magic file or directory from which rules were loaded. /// This is used for debugging and logging purposes. source_path: Option, + /// Cached MIME type mapper to avoid rebuilding the lookup table on every evaluation + mime_mapper: mime::MimeMapper, } impl MagicDatabase { @@ -492,6 +487,7 @@ impl MagicDatabase { rules: crate::builtin_rules::get_builtin_rules(), config, source_path: None, + mime_mapper: mime::MimeMapper::new(), }) } @@ -537,6 +533,7 @@ impl MagicDatabase { rules, config, source_path: Some(path.as_ref().to_path_buf()), + mime_mapper: mime::MimeMapper::new(), }) } @@ -564,7 +561,6 @@ impl MagicDatabase { pub fn evaluate_file>(&self, path: P) -> Result { use crate::evaluator::evaluate_rules_with_config; use crate::io::FileBuffer; - use crate::mime::MimeMapper; use std::fs; use std::time::Instant; @@ -588,56 +584,14 @@ impl MagicDatabase { let file_buffer = FileBuffer::new(path)?; let buffer = file_buffer.as_slice(); - // If we have no rules, return "data" as fallback - if self.rules.is_empty() { - return Ok(EvaluationResult { - description: "data".to_string(), - mime_type: None, - confidence: 0.0, - matches: vec![], - metadata: EvaluationMetadata { - file_size, - evaluation_time_ms: start_time.elapsed().as_secs_f64() * 1000.0, - rules_evaluated: 0, - magic_file: self.source_path.clone(), - timed_out: false, - }, - }); - } - - // Evaluate rules against the file buffer - let matches = evaluate_rules_with_config(&self.rules, buffer, self.config.clone())?; - - // Build the result - let (description, confidence) = if matches.is_empty() { - ("data".to_string(), 0.0) + // Evaluate rules against the file buffer (build_result handles empty rules/matches) + let matches = if self.rules.is_empty() { + vec![] } else { - ( - Self::concatenate_messages(&matches), - matches.first().map_or(0.0, |m| m.confidence), - ) - }; - - // Get MIME type if enabled - let mime_type = if self.config.enable_mime_types { - MimeMapper::new().get_mime_type(&description) - } else { - None + evaluate_rules_with_config(&self.rules, buffer, &self.config)? }; - Ok(EvaluationResult { - description, - mime_type, - confidence, - matches, - metadata: EvaluationMetadata { - file_size, - evaluation_time_ms: start_time.elapsed().as_secs_f64() * 1000.0, - rules_evaluated: self.rules.len(), - magic_file: self.source_path.clone(), - timed_out: false, - }, - }) + Ok(self.build_result(matches, file_size, start_time)) } /// Evaluate magic rules against an in-memory buffer @@ -676,29 +630,28 @@ impl MagicDatabase { start_time: std::time::Instant, ) -> Result { use crate::evaluator::evaluate_rules_with_config; - use crate::mime::MimeMapper; let file_size = buffer.len() as u64; - if self.rules.is_empty() { - return Ok(EvaluationResult { - description: "data".to_string(), - mime_type: None, - confidence: 0.0, - matches: vec![], - metadata: EvaluationMetadata { - file_size, - evaluation_time_ms: start_time.elapsed().as_secs_f64() * 1000.0, - rules_evaluated: 0, - magic_file: self.source_path.clone(), - timed_out: false, - }, - }); - } + let matches = if self.rules.is_empty() { + vec![] + } else { + evaluate_rules_with_config(&self.rules, buffer, &self.config)? + }; - let matches = evaluate_rules_with_config(&self.rules, buffer, self.config.clone())?; + Ok(self.build_result(matches, file_size, start_time)) + } - // Build the result + /// Build an `EvaluationResult` from match results, file size, and start time. + /// + /// This is shared between `evaluate_file` and `evaluate_buffer_internal` to + /// avoid duplicating the result-construction logic. + fn build_result( + &self, + matches: Vec, + file_size: u64, + start_time: std::time::Instant, + ) -> EvaluationResult { let (description, confidence) = if matches.is_empty() { ("data".to_string(), 0.0) } else { @@ -708,14 +661,13 @@ impl MagicDatabase { ) }; - // Get MIME type if enabled let mime_type = if self.config.enable_mime_types { - MimeMapper::new().get_mime_type(&description) + self.mime_mapper.get_mime_type(&description) } else { None }; - Ok(EvaluationResult { + EvaluationResult { description, mime_type, confidence, @@ -727,7 +679,7 @@ impl MagicDatabase { magic_file: self.source_path.clone(), timed_out: false, }, - }) + } } /// Concatenate match messages following libmagic behavior @@ -735,7 +687,8 @@ impl MagicDatabase { /// Messages are joined with spaces, except when a message starts with /// backspace character (\\b) which suppresses the space. fn concatenate_messages(matches: &[evaluator::MatchResult]) -> String { - let mut result = String::new(); + let capacity: usize = matches.iter().map(|m| m.message.len() + 1).sum(); + let mut result = String::with_capacity(capacity); for m in matches { if let Some(rest) = m.message.strip_prefix('\u{0008}') { // Backspace suppresses the space and the character itself @@ -952,10 +905,10 @@ mod tests { assert!(result.is_err()); match result.unwrap_err() { - LibmagicError::ParseError(ParseError::InvalidSyntax { message, .. }) => { + LibmagicError::ConfigError { reason: message } => { assert!(message.contains("max_recursion_depth must be greater than 0")); } - _ => panic!("Expected ParseError with InvalidSyntax"), + _ => panic!("Expected ConfigError"), } } @@ -970,10 +923,10 @@ mod tests { assert!(result.is_err()); match result.unwrap_err() { - LibmagicError::ParseError(ParseError::InvalidSyntax { message, .. }) => { + LibmagicError::ConfigError { reason: message } => { assert!(message.contains("max_recursion_depth must not exceed 1000")); } - _ => panic!("Expected ParseError with InvalidSyntax"), + _ => panic!("Expected ConfigError"), } } @@ -988,10 +941,10 @@ mod tests { assert!(result.is_err()); match result.unwrap_err() { - LibmagicError::ParseError(ParseError::InvalidSyntax { message, .. }) => { + LibmagicError::ConfigError { reason: message } => { assert!(message.contains("max_string_length must be greater than 0")); } - _ => panic!("Expected ParseError with InvalidSyntax"), + _ => panic!("Expected ConfigError"), } } @@ -1006,11 +959,11 @@ mod tests { assert!(result.is_err()); match result.unwrap_err() { - LibmagicError::ParseError(ParseError::InvalidSyntax { message, .. }) => { + LibmagicError::ConfigError { reason: message } => { assert!(message.contains("max_string_length must not exceed")); assert!(message.contains("bytes to prevent memory exhaustion")); } - _ => panic!("Expected ParseError with InvalidSyntax"), + _ => panic!("Expected ConfigError"), } } @@ -1025,10 +978,10 @@ mod tests { assert!(result.is_err()); match result.unwrap_err() { - LibmagicError::ParseError(ParseError::InvalidSyntax { message, .. }) => { + LibmagicError::ConfigError { reason: message } => { assert!(message.contains("timeout_ms must be greater than 0 if specified")); } - _ => panic!("Expected ParseError with InvalidSyntax"), + _ => panic!("Expected ConfigError"), } } @@ -1043,10 +996,10 @@ mod tests { assert!(result.is_err()); match result.unwrap_err() { - LibmagicError::ParseError(ParseError::InvalidSyntax { message, .. }) => { + LibmagicError::ConfigError { reason: message } => { assert!(message.contains("timeout_ms must not exceed 300000")); } - _ => panic!("Expected ParseError with InvalidSyntax"), + _ => panic!("Expected ConfigError"), } } diff --git a/src/main.rs b/src/main.rs index 8b75b363..ed10a902 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,11 +5,8 @@ use clap::Parser; use clap_stdin::FileOrStdin; -use libmagic_rs::output::MatchResult; use libmagic_rs::output::json::{format_json_line_output, format_json_output}; -use libmagic_rs::parser::ast::Value; use libmagic_rs::parser::{MagicFileFormat, detect_format}; -use libmagic_rs::tags::TagExtractor; use libmagic_rs::{LibmagicError, MagicDatabase}; use std::fs; use std::path::{Path, PathBuf}; @@ -263,6 +260,14 @@ fn handle_error(error: LibmagicError) -> i32 { LibmagicError::ParseError(ref parse_err) => handle_parse_error_new(parse_err), LibmagicError::EvaluationError(ref eval_err) => handle_evaluation_error_new(eval_err), LibmagicError::Timeout { timeout_ms } => handle_timeout_error(timeout_ms), + LibmagicError::ConfigError { ref reason } => { + eprintln!("Configuration error: {reason}"); + 1 + } + LibmagicError::FileError(ref msg) => { + eprintln!("File error: {msg}"); + 3 + } } } @@ -377,72 +382,15 @@ fn output_result( match args.output_format() { OutputFormat::Json => { - // Extract tags from the description - let tag_extractor = TagExtractor::new(); - let tags = tag_extractor.extract_tags(&result.description); - - // Convert evaluator::MatchResult to output::MatchResult - let match_results: Vec = if result.matches.is_empty() { - vec![] - } else { - result - .matches - .iter() - .map(|m| { - // Build rule_path from match messages - let rule_path = - tag_extractor.extract_rule_path(std::iter::once(m.message.as_str())); - - // Convert confidence from 0.0-1.0 to 0-100 scale - let confidence_score = (m.confidence * 100.0).min(100.0) as u8; - - // Estimate length from value type - let length = match &m.value { - Value::Bytes(b) => b.len(), - Value::String(s) => s.len(), - _ => 4, // Default for numeric types - }; - - MatchResult::with_metadata( - m.message.clone(), - m.offset, - length, - m.value.clone(), - rule_path, - confidence_score, - result.mime_type.clone(), - ) - }) - .collect() - }; - - // Add tags to the first match if present - let match_results = if !match_results.is_empty() && !tags.is_empty() { - let mut results = match_results; - // Tags are extracted from description, associate with primary result - results[0] = MatchResult::with_metadata( - results[0].message.clone(), - results[0].offset, - results[0].length, - results[0].value.clone(), - if results[0].rule_path.is_empty() { - tags - } else { - results[0].rule_path.clone() - }, - results[0].confidence, - results[0].mime_type.clone(), - ); - results - } else { - match_results - }; + // Convert library result to output format (handles match conversion + tag enrichment) + let output_result = + libmagic_rs::output::EvaluationResult::from_library_result(result, file_path); // Use JSON Lines format for multiple files, pretty JSON for single file let json_result = if is_multiple_files { - format_json_line_output(file_path, &match_results) + format_json_line_output(file_path, &output_result.matches) } else { - format_json_output(&match_results) + format_json_output(&output_result.matches) }; match json_result { diff --git a/src/output/mod.rs b/src/output/mod.rs index 48518131..161a0612 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -12,8 +12,16 @@ pub mod text; use serde::{Deserialize, Serialize}; use std::path::PathBuf; +use std::sync::LazyLock; + use crate::parser::ast::Value; +/// Shared `TagExtractor` instance, initialized once on first use. +/// Avoids allocating the 16-keyword `HashSet` on every call to +/// `from_evaluator_match` or `from_library_result`. +static DEFAULT_TAG_EXTRACTOR: LazyLock = + LazyLock::new(crate::tags::TagExtractor::new); + /// Result of a single magic rule match /// /// Contains all information about a successful rule match, including the matched @@ -245,6 +253,44 @@ impl MatchResult { } } + /// Convert from an evaluator `MatchResult` to an output `MatchResult` + /// + /// This adapts the internal evaluation result format to the richer output format + /// used for JSON and structured output. It extracts rule paths from match messages + /// and converts confidence from 0.0-1.0 to 0-100 scale. + /// + /// # Arguments + /// + /// * `m` - The evaluator match result to convert + /// * `mime_type` - Optional MIME type to associate with this match + #[must_use] + pub fn from_evaluator_match( + m: &crate::evaluator::MatchResult, + mime_type: Option<&str>, + ) -> Self { + let rule_path = + DEFAULT_TAG_EXTRACTOR.extract_rule_path(std::iter::once(m.message.as_str())); + + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let confidence = (m.confidence * 100.0).min(100.0) as u8; + + let length = match &m.value { + Value::Bytes(b) => b.len(), + Value::String(s) => s.len(), + Value::Uint(_) | Value::Int(_) => 4, + }; + + Self::with_metadata( + m.message.clone(), + m.offset, + length, + m.value.clone(), + rule_path, + confidence, + mime_type.map(String::from), + ) + } + /// Set the confidence score for this match /// /// The confidence score is automatically clamped to the range 0-100. @@ -269,12 +315,6 @@ impl MatchResult { /// assert_eq!(result.confidence, 100); /// ``` pub fn set_confidence(&mut self, confidence: u8) { - // Only warn in debug builds to avoid performance impact - #[cfg(debug_assertions)] - if confidence > 100 { - eprintln!("Warning: Confidence score {confidence} clamped to 100"); - } - self.confidence = confidence.min(100); } @@ -366,6 +406,51 @@ impl EvaluationResult { } } + /// Convert from a library `EvaluationResult` to an output `EvaluationResult` + /// + /// This adapts the library's evaluation result into the output format used for + /// JSON and structured output. Converts all matches and metadata, and enriches + /// the first match's rule path with tags extracted from the overall description. + /// + /// # Arguments + /// + /// * `result` - The library evaluation result to convert + /// * `filename` - Path to the file that was evaluated + #[must_use] + pub fn from_library_result( + result: &crate::EvaluationResult, + filename: &std::path::Path, + ) -> Self { + let mut output_matches: Vec = result + .matches + .iter() + .map(|m| MatchResult::from_evaluator_match(m, result.mime_type.as_deref())) + .collect(); + + // Enrich the first match with tags from the overall description + if let Some(first) = output_matches.first_mut() { + if first.rule_path.is_empty() { + first.rule_path = DEFAULT_TAG_EXTRACTOR.extract_tags(&result.description); + } + } + + #[allow(clippy::cast_possible_truncation)] + let rules_evaluated = result.metadata.rules_evaluated as u32; + #[allow(clippy::cast_possible_truncation)] + let rules_matched = output_matches.len() as u32; + + Self::new( + filename.to_path_buf(), + output_matches, + EvaluationMetadata::new( + result.metadata.file_size, + result.metadata.evaluation_time_ms, + rules_evaluated, + rules_matched, + ), + ) + } + /// Create an evaluation result with an error /// /// # Arguments diff --git a/src/parser/format.rs b/src/parser/format.rs new file mode 100644 index 00000000..a813d946 --- /dev/null +++ b/src/parser/format.rs @@ -0,0 +1,228 @@ +//! Format detection for magic files. +//! +//! Detects whether a path points to a text magic file, a directory of magic files, +//! or a binary compiled magic file (.mgc format). + +use crate::error::ParseError; +use std::io::Read; +use std::path::Path; + +/// Represents the format of a magic file or directory +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MagicFileFormat { + /// Text-based magic file (human-readable) + Text, + /// Directory containing multiple magic files (Magdir pattern) + Directory, + /// Binary compiled magic file (.mgc format) + Binary, +} + +/// Detect the format of a magic file or directory +/// +/// This function examines the filesystem metadata and file contents to determine +/// whether the path points to a text magic file, a directory, or a binary .mgc file. +/// +/// # Detection Logic +/// +/// 1. Check if path is a directory -> `MagicFileFormat::Directory` +/// 2. Read first 4 bytes and check for binary magic number `0xF11E041C` -> `MagicFileFormat::Binary` +/// 3. Otherwise -> `MagicFileFormat::Text` +/// +/// # Arguments +/// +/// * `path` - Path to the magic file or directory to detect +/// +/// # Errors +/// +/// Returns `ParseError::IoError` if the path doesn't exist or cannot be read. +/// +/// # Notes +/// +/// This function only detects the format and returns it. It does not validate whether +/// the format is supported by the parser. Higher-level code should check the returned +/// format and decide how to handle unsupported formats (e.g., binary .mgc files). +/// +/// # Examples +/// +/// ```rust,no_run +/// use libmagic_rs::parser::detect_format; +/// use std::path::Path; +/// +/// let format = detect_format(Path::new("/usr/share/file/magic"))?; +/// # Ok::<(), libmagic_rs::ParseError>(()) +/// ``` +pub fn detect_format(path: &Path) -> Result { + // Check if path exists and is accessible + let metadata = std::fs::metadata(path)?; + + // Check if it's a directory + if metadata.is_dir() { + return Ok(MagicFileFormat::Directory); + } + + // Read first 4 bytes to check for binary magic number + let mut file = std::fs::File::open(path)?; + + let mut magic_bytes = [0u8; 4]; + + match file.read_exact(&mut magic_bytes) { + Ok(()) => { + // Check for binary magic number 0xF11E041C in little-endian format + let magic_number = u32::from_le_bytes(magic_bytes); + if magic_number == 0xF11E_041C { + return Ok(MagicFileFormat::Binary); + } + // Not a binary magic file, assume text + Ok(MagicFileFormat::Text) + } + Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => { + // File is too small to be a binary magic file, assume text + Ok(MagicFileFormat::Text) + } + Err(e) => Err(ParseError::IoError(e)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::io::Write; + + #[test] + fn test_detect_format_text_file() { + let temp_dir = std::env::temp_dir(); + let text_file = temp_dir.join("test_text_magic.txt"); + fs::write(&text_file, "# Magic file\n0 string test Test").unwrap(); + + let format = detect_format(&text_file).unwrap(); + assert_eq!(format, MagicFileFormat::Text); + + fs::remove_file(&text_file).unwrap(); + } + + #[test] + fn test_detect_format_directory() { + let temp_dir = std::env::temp_dir().join("test_magic_dir"); + fs::create_dir_all(&temp_dir).unwrap(); + + let format = detect_format(&temp_dir).unwrap(); + assert_eq!(format, MagicFileFormat::Directory); + + fs::remove_dir_all(&temp_dir).unwrap(); + } + + #[test] + fn test_detect_format_binary_mgc() { + let temp_dir = std::env::temp_dir(); + let binary_file = temp_dir.join("test_binary.mgc"); + + // Write binary magic number 0xF11E041C in little-endian + let mut file = fs::File::create(&binary_file).unwrap(); + file.write_all(&[0x1C, 0x04, 0x1E, 0xF1]).unwrap(); + file.write_all(b"additional binary data").unwrap(); + + let result = detect_format(&binary_file); + assert!(result.is_ok()); + + match result.unwrap() { + MagicFileFormat::Binary => { + // Expected result + } + other => panic!("Expected Binary format, got {other:?}"), + } + + fs::remove_file(&binary_file).unwrap(); + } + + #[test] + fn test_detect_format_nonexistent_path() { + let nonexistent = std::env::temp_dir().join("nonexistent_magic_file.txt"); + + let result = detect_format(&nonexistent); + assert!(result.is_err()); + + match result.unwrap_err() { + ParseError::IoError(e) => { + assert_eq!(e.kind(), std::io::ErrorKind::NotFound); + } + other => panic!("Expected IoError, got: {other:?}"), + } + } + + #[test] + fn test_detect_format_empty_file() { + let temp_dir = std::env::temp_dir(); + let empty_file = temp_dir.join("test_empty_magic.txt"); + fs::write(&empty_file, "").unwrap(); + + // Empty files should be detected as text (too small for binary magic) + let format = detect_format(&empty_file).unwrap(); + assert_eq!(format, MagicFileFormat::Text); + + fs::remove_file(&empty_file).unwrap(); + } + + #[test] + fn test_detect_format_small_file() { + let temp_dir = std::env::temp_dir(); + let small_file = temp_dir.join("test_small_magic.txt"); + fs::write(&small_file, "ab").unwrap(); // Only 2 bytes + + // Small files should be detected as text + let format = detect_format(&small_file).unwrap(); + assert_eq!(format, MagicFileFormat::Text); + + fs::remove_file(&small_file).unwrap(); + } + + #[test] + fn test_detect_format_text_with_binary_content() { + let temp_dir = std::env::temp_dir(); + let binary_text_file = temp_dir.join("test_binary_text.txt"); + + // Write binary data that's NOT the magic number + let mut file = fs::File::create(&binary_text_file).unwrap(); + file.write_all(&[0xFF, 0xFE, 0xFD, 0xFC]).unwrap(); + file.write_all(b"some text").unwrap(); + + // Should be detected as text (wrong magic number) + let format = detect_format(&binary_text_file).unwrap(); + assert_eq!(format, MagicFileFormat::Text); + + fs::remove_file(&binary_text_file).unwrap(); + } + + #[test] + fn test_magic_file_format_enum_equality() { + assert_eq!(MagicFileFormat::Text, MagicFileFormat::Text); + assert_eq!(MagicFileFormat::Directory, MagicFileFormat::Directory); + assert_eq!(MagicFileFormat::Binary, MagicFileFormat::Binary); + + assert_ne!(MagicFileFormat::Text, MagicFileFormat::Directory); + assert_ne!(MagicFileFormat::Text, MagicFileFormat::Binary); + assert_ne!(MagicFileFormat::Directory, MagicFileFormat::Binary); + } + + #[test] + fn test_magic_file_format_debug() { + let text_format = MagicFileFormat::Text; + let debug_str = format!("{text_format:?}"); + assert!(debug_str.contains("Text")); + } + + #[test] + fn test_magic_file_format_clone() { + let original = MagicFileFormat::Directory; + let cloned = original; + assert_eq!(original, cloned); + } + + #[test] + fn test_magic_file_format_copy() { + let original = MagicFileFormat::Binary; + let copied = original; // Copy trait allows this + assert_eq!(original, copied); + } +} diff --git a/src/parser/hierarchy.rs b/src/parser/hierarchy.rs new file mode 100644 index 00000000..4a9496dd --- /dev/null +++ b/src/parser/hierarchy.rs @@ -0,0 +1,227 @@ +//! Rule hierarchy building for magic file parsing. +//! +//! Constructs parent-child relationships between magic rules based on +//! indentation levels, implementing a stack-based algorithm for hierarchy construction. + +use super::preprocessing::{LineInfo, parse_magic_rule_line}; +use crate::error::ParseError; +use crate::parser::ast::MagicRule; + +/// Builds a hierarchical structure from a flat list of parsed magic rules. +/// +/// This function establishes parent-child relationships based on indentation levels. +/// Rules at deeper indentation levels become children of the most recent rule at a +/// shallower level. This implements a stack-based algorithm for hierarchy construction. +/// +/// # Arguments +/// +/// * `lines` - A vector of preprocessed `LineInfo` structs +/// +/// # Returns +/// +/// `Result, ParseError>` - Root-level rules with children attached +/// +/// # Behavior +/// +/// - Rules with `level=0` are root rules +/// - Rules with `level=1` become children of the most recent `level=0` rule +/// - Rules with `level=2` become children of the most recent `level=1` rule +/// - When indentation decreases, the stack is unwound and completed rules are attached +/// - Orphaned child rules (starting with '>' but with no preceding parent) are +/// added to the root list with their hierarchy level preserved +/// +/// # Errors +/// +/// Returns an error if: +/// - Any line contains invalid magic rule syntax +/// - Rule parsing fails (propagated from `parse_magic_rule_line`) +pub(crate) fn build_rule_hierarchy(lines: Vec) -> Result, ParseError> { + /// Helper to pop a rule from the stack and attach it to its parent or roots + fn pop_and_attach(stack: &mut Vec, roots: &mut Vec) { + if let Some(completed) = stack.pop() { + if let Some(parent) = stack.last_mut() { + parent.children.push(completed); + } else { + roots.push(completed); + } + } + } + + let mut stack: Vec = Vec::new(); + let mut roots: Vec = Vec::new(); + let mut pending_strength: Option = None; + + for line in lines { + if line.is_comment { + continue; + } + + // Handle strength directive: store modifier for next rule + if line.strength_modifier.is_some() { + pending_strength = line.strength_modifier; + continue; + } + + let mut rule = parse_magic_rule_line(&line)?; + + // Apply pending strength modifier to this rule + if pending_strength.is_some() { + rule.strength_modifier = pending_strength.take(); + } + + // Unwind stack until we find a parent with lower level + while stack.last().is_some_and(|top| top.level >= rule.level) { + pop_and_attach(&mut stack, &mut roots); + } + + stack.push(rule); + } + + // Unwind remaining stack + while !stack.is_empty() { + pop_and_attach(&mut stack, &mut roots); + } + + Ok(roots) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::preprocessing::LineInfo; + + fn li(line_number: usize, content: &str) -> LineInfo { + LineInfo { + content: content.to_string(), + line_number, + is_comment: false, + strength_modifier: None, + } + } + + // ============================================================ + // Tests for build_rule_hierarchy (10+ test cases) + // ============================================================ + + #[test] + fn test_build_rule_hierarchy_single_root() { + let lines = vec![li(1, "0 string \\x7fELF ELF executable")]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 1); + assert_eq!(roots[0].level, 0); + } + + #[test] + fn test_build_rule_hierarchy_root_with_one_child() { + let lines = vec![ + li(1, "0 string \\x7fELF ELF executable"), + li(2, ">4 byte 1 32-bit"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 1); + assert_eq!(roots[0].children.len(), 1); + } + + #[test] + fn test_build_rule_hierarchy_root_with_multiple_children() { + let lines = vec![ + li(1, "0 string \\x7fELF ELF executable"), + li(2, ">4 byte 1 32-bit"), + li(3, ">4 byte 2 64-bit"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 1); + assert_eq!(roots[0].children.len(), 2); + } + + #[test] + fn test_build_rule_hierarchy_nested_three_levels() { + let lines = vec![ + li(1, "0 string \\x7fELF ELF executable"), + li(2, ">4 byte 1 class"), + li(3, ">>5 byte 1 subtype"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots[0].children.len(), 1); + assert_eq!(roots[0].children[0].children.len(), 1); + assert_eq!(roots[0].children[0].children[0].level, 2); + } + + #[test] + fn test_build_rule_hierarchy_multiple_roots() { + let lines = vec![ + li(1, r#"0 string "ELF" "ELF executable""#), + li(2, r#"0 string "%PDF" "PDF document""#), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 2); + } + + #[test] + fn test_build_rule_hierarchy_sibling_rules() { + let lines = vec![ + li(1, "0 byte 1 Root"), + li(2, ">4 byte 1 Child1"), + li(3, ">4 byte 2 Child2"), + li(4, "0 byte 2 Root2"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 2); + assert_eq!(roots[0].children.len(), 2); + } + + #[test] + fn test_build_rule_hierarchy_deep_nesting() { + let lines = vec![ + li(1, "0 byte 1 L0"), + li(2, ">4 byte 1 L1"), + li(3, ">>5 byte 2 L2"), + li(4, ">>>6 byte 3 L3"), + li(5, ">>>>7 byte 4 L4"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 1); + assert_eq!( + roots[0].children[0].children[0].children[0].children.len(), + 1 + ); + } + + #[test] + fn test_build_rule_hierarchy_return_to_root_level() { + let lines = vec![ + li(1, "0 byte 1 Root1"), + li(2, ">4 byte 1 Child"), + li(3, "0 byte 2 Root2"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 2); + assert_eq!(roots[0].children.len(), 1); + assert_eq!(roots[1].children.len(), 0); + } + + #[test] + fn test_build_rule_hierarchy_orphaned_child() { + let lines = vec![li(1, ">4 byte 1 Orphaned child")]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 1); + assert_eq!(roots[0].level, 1); + } + + #[test] + fn test_build_rule_hierarchy_complex_structure() { + let lines = vec![ + li(1, "0 byte 1 Root1"), + li(2, ">4 byte 1 C1"), + li(3, ">4 byte 2 C2"), + li(4, ">>6 byte 3 GC1"), + li(5, "0 byte 2 Root2"), + li(6, ">4 byte 4 C3"), + ]; + let roots = build_rule_hierarchy(lines).unwrap(); + assert_eq!(roots.len(), 2); + assert_eq!(roots[0].children.len(), 2); + assert_eq!(roots[0].children[1].children.len(), 1); + assert_eq!(roots[1].children.len(), 1); + } +} diff --git a/src/parser/loader.rs b/src/parser/loader.rs new file mode 100644 index 00000000..22db03b6 --- /dev/null +++ b/src/parser/loader.rs @@ -0,0 +1,586 @@ +//! File and directory loading for magic files. +//! +//! Provides functions for loading magic rules from individual files and +//! directories, with automatic format detection and error handling. + +use crate::error::ParseError; +use crate::parser::ast::MagicRule; +use std::path::{Path, PathBuf}; + +use super::format::{MagicFileFormat, detect_format}; + +/// Loads and parses all magic files from a directory, merging them into a single rule set. +/// +/// This function reads all regular files in the specified directory, parses each as a magic file, +/// and combines the resulting rules into a single `Vec`. Files are processed in +/// alphabetical order by filename to ensure deterministic results. +/// +/// # Error Handling Strategy +/// +/// This function distinguishes between critical and non-critical errors: +/// +/// - **Critical errors** (I/O failures, directory access issues, encoding problems): +/// These cause immediate failure and return a `ParseError`. The function stops processing +/// and propagates the error to the caller. +/// +/// - **Non-critical errors** (individual file parse failures): +/// These are logged to stderr with a warning message and the file is skipped. Processing +/// continues with remaining files. +/// +/// # Behavior +/// +/// - Subdirectories are skipped (not recursively processed) +/// - Symbolic links are skipped +/// - Empty directories return an empty rules vector +/// - Files are processed in alphabetical order by filename +/// - All successfully parsed rules are merged in order +/// +/// # Examples +/// +/// Loading a directory of magic files: +/// +/// ```rust,no_run +/// use libmagic_rs::parser::load_magic_directory; +/// use std::path::Path; +/// +/// let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?; +/// println!("Loaded {} rules from directory", rules.len()); +/// # Ok::<(), libmagic_rs::ParseError>(()) +/// ``` +/// +/// Creating a Magdir-style directory structure: +/// +/// ```rust,no_run +/// use libmagic_rs::parser::load_magic_directory; +/// use std::path::Path; +/// +/// // Directory structure: +/// // magic.d/ +/// // ├── 01-elf +/// // ├── 02-archive +/// // └── 03-text +/// +/// let rules = load_magic_directory(Path::new("./magic.d"))?; +/// // Rules from all three files are merged in alphabetical order +/// # Ok::<(), libmagic_rs::ParseError>(()) +/// ``` +/// +/// # Errors +/// +/// Returns `ParseError` if: +/// - The directory does not exist or cannot be accessed +/// - Directory entries cannot be read +/// - A file cannot be read due to I/O errors +/// - A file contains invalid UTF-8 encoding +/// +/// # Panics +/// +/// This function does not panic under normal operation. +#[allow(clippy::print_stderr)] +pub fn load_magic_directory(dir_path: &Path) -> Result, ParseError> { + use std::fs; + + // Read directory entries + let entries = fs::read_dir(dir_path).map_err(|e| { + ParseError::invalid_syntax( + 0, + format!("Failed to read directory '{}': {}", dir_path.display(), e), + ) + })?; + + // Collect and sort entries by filename for deterministic ordering + let mut file_paths: Vec = Vec::new(); + for entry in entries { + let entry = entry.map_err(|e| { + ParseError::invalid_syntax( + 0, + format!( + "Failed to read directory entry in '{}': {}", + dir_path.display(), + e + ), + ) + })?; + + let path = entry.path(); + let file_type = entry.file_type().map_err(|e| { + ParseError::invalid_syntax( + 0, + format!("Failed to read file type for '{}': {}", path.display(), e), + ) + })?; + + // Only process regular files, skip directories and symlinks + if file_type.is_file() && !file_type.is_symlink() { + file_paths.push(path); + } + } + + // Sort by filename for deterministic ordering + file_paths.sort_by_key(|path| path.file_name().map(std::ffi::OsStr::to_os_string)); + + // Accumulate rules from all files + let mut all_rules = Vec::new(); + let mut parse_failures: Vec<(PathBuf, ParseError)> = Vec::new(); + let file_count = file_paths.len(); + + for path in file_paths { + // Read file contents + let contents = match fs::read_to_string(&path) { + Ok(contents) => contents, + Err(e) => { + // I/O errors are critical + return Err(ParseError::invalid_syntax( + 0, + format!("Failed to read file '{}': {}", path.display(), e), + )); + } + }; + + // Parse the file + match super::parse_text_magic_file(&contents) { + Ok(rules) => { + // Successfully parsed - merge rules + all_rules.extend(rules); + } + Err(e) => { + // Track parse failures for reporting + parse_failures.push((path, e)); + } + } + } + + // If all files failed to parse, return an error + if all_rules.is_empty() && !parse_failures.is_empty() { + use std::fmt::Write; + + let failure_details: Vec = parse_failures + .iter() + .take(3) // Limit to first 3 failures for brevity + .map(|(path, e)| format!(" - {}: {}", path.display(), e)) + .collect(); + + let mut message = format!("All {file_count} magic file(s) in directory failed to parse"); + if !failure_details.is_empty() { + message.push_str(":\n"); + message.push_str(&failure_details.join("\n")); + if parse_failures.len() > 3 { + let _ = write!(message, "\n ... and {} more", parse_failures.len() - 3); + } + } + + return Err(ParseError::invalid_syntax(0, message)); + } + + // Log warnings for partial failures (some files parsed, some failed) + // Note: Using eprintln for now; consider a logging framework in the future + #[allow(clippy::print_stderr)] + for (path, e) in &parse_failures { + eprintln!("Warning: Failed to parse '{}': {}", path.display(), e); + } + + Ok(all_rules) +} + +/// Loads magic rules from a file or directory, automatically detecting the format. +/// +/// This is the unified entry point for loading magic rules from the filesystem. It +/// automatically detects whether the path points to a text magic file, a directory +/// containing magic files, or a binary compiled magic file, and dispatches to the +/// appropriate handler. +/// +/// # Format Detection and Handling +/// +/// The function uses [`detect_format()`] to determine the file type and handles each +/// format as follows: +/// +/// - **Text format**: Reads the file contents and parses using [`parse_text_magic_file()`] +/// - **Directory format**: Loads all magic files from the directory using [`load_magic_directory()`] +/// - **Binary format**: Returns an error with guidance to use the `--use-builtin` option +/// +/// # Arguments +/// +/// * `path` - Path to a magic file or directory. Can be absolute or relative. +/// +/// # Returns +/// +/// Returns `Ok(Vec)` containing all successfully parsed magic rules. For +/// directories, rules from all files are merged in alphabetical order by filename. +/// +/// # Errors +/// +/// This function returns a [`ParseError`] in the following cases: +/// +/// - **File not found**: The specified path does not exist +/// - **Unsupported format**: The file is a binary compiled magic file (`.mgc`) +/// - **Parse errors**: The magic file contains syntax errors or invalid rules +/// - **I/O errors**: File system errors during reading (permissions, disk errors, etc.) +/// +/// # Examples +/// +/// ## Loading a text magic file +/// +/// ```no_run +/// use libmagic_rs::parser::load_magic_file; +/// use std::path::Path; +/// +/// let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?; +/// println!("Loaded {} magic rules", rules.len()); +/// # Ok::<(), libmagic_rs::ParseError>(()) +/// ``` +/// +/// ## Loading a directory of magic files +/// +/// ```no_run +/// use libmagic_rs::parser::load_magic_file; +/// use std::path::Path; +/// +/// let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?; +/// println!("Loaded {} rules from directory", rules.len()); +/// # Ok::<(), libmagic_rs::ParseError>(()) +/// ``` +/// +/// ## Handling binary format errors +/// +/// ```no_run +/// use libmagic_rs::parser::load_magic_file; +/// use std::path::Path; +/// +/// match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) { +/// Ok(rules) => println!("Loaded {} rules", rules.len()), +/// Err(e) => { +/// eprintln!("Error loading magic file: {}", e); +/// eprintln!("Hint: Use --use-builtin for binary files"); +/// } +/// } +/// # Ok::<(), libmagic_rs::ParseError>(()) +/// ``` +/// +/// # Security +/// +/// This function delegates to [`parse_text_magic_file()`] or [`load_magic_directory()`] +/// based on format detection. Security considerations are handled by those functions: +/// +/// - Rule hierarchy depth is bounded during parsing +/// - Invalid syntax is rejected with descriptive errors +/// - Binary `.mgc` files are rejected (not parsed) +/// +/// Note: File size limits and memory exhaustion protection are not currently implemented. +/// Large magic files will be loaded entirely into memory. +/// +/// # See Also +/// +/// - [`detect_format()`] - Format detection logic +/// - [`parse_text_magic_file()`] - Text file parser +/// - [`load_magic_directory()`] - Directory loader +pub fn load_magic_file(path: &Path) -> Result, ParseError> { + // Detect the magic file format + let format = detect_format(path)?; + + // Dispatch to appropriate handler based on format + match format { + MagicFileFormat::Text => { + // Read file contents and parse as text magic file + let content = std::fs::read_to_string(path)?; + super::parse_text_magic_file(&content) + } + MagicFileFormat::Directory => { + // Load all magic files from directory + load_magic_directory(path) + } + MagicFileFormat::Binary => { + // Binary compiled magic files are not supported + Err(ParseError::unsupported_format( + 0, + "binary .mgc file", + "Binary compiled magic files (.mgc) are not supported for parsing.\n\ + Use the --use-builtin option to use the built-in magic rules instead,\n\ + or provide a text-based magic file or directory.", + )) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ============================================================ + // Tests for load_magic_directory (6+ test cases) + // ============================================================ + + #[test] + fn test_load_directory_critical_error_io() { + use std::path::Path; + + let non_existent = Path::new("/this/should/not/exist/anywhere/at/all"); + let result = load_magic_directory(non_existent); + + assert!( + result.is_err(), + "Should return error for non-existent directory" + ); + let err = result.unwrap_err(); + assert!(err.to_string().contains("Failed to read directory")); + } + + #[test] + fn test_load_directory_non_critical_error_parse() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + + // Create a valid file + let valid_path = temp_dir.path().join("valid.magic"); + fs::write(&valid_path, "0 string \\x01\\x02 valid\n").expect("Failed to write valid file"); + + // Create an invalid file + let invalid_path = temp_dir.path().join("invalid.magic"); + fs::write(&invalid_path, "this is invalid syntax\n").expect("Failed to write invalid file"); + + // Should succeed, loading only the valid file + let rules = load_magic_directory(temp_dir.path()).expect("Should load valid files"); + + assert_eq!(rules.len(), 1, "Should load only valid file"); + assert_eq!(rules[0].message, "valid"); + } + + #[test] + fn test_load_directory_empty_files() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + + // Create an empty file + let empty_path = temp_dir.path().join("empty.magic"); + fs::write(&empty_path, "").expect("Failed to write empty file"); + + // Create a file with only comments + let comments_path = temp_dir.path().join("comments.magic"); + fs::write(&comments_path, "# Just comments\n# Nothing else\n") + .expect("Failed to write comments file"); + + // Should succeed with no rules + let rules = load_magic_directory(temp_dir.path()).expect("Should handle empty files"); + + assert_eq!(rules.len(), 0, "Empty files should contribute no rules"); + } + + #[test] + fn test_load_directory_binary_files() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + + // Create a binary file (invalid UTF-8) + let binary_path = temp_dir.path().join("binary.dat"); + fs::write(&binary_path, [0xFF, 0xFE, 0xFF, 0xFE]).expect("Failed to write binary file"); + + // Create a valid text file + let valid_path = temp_dir.path().join("valid.magic"); + fs::write(&valid_path, "0 string \\x01\\x02 valid\n").expect("Failed to write valid file"); + + // Binary file should cause a critical error (invalid UTF-8) + let result = load_magic_directory(temp_dir.path()); + + // The function should fail when encountering binary files (critical I/O error) + assert!( + result.is_err(), + "Binary files should cause critical error due to invalid UTF-8" + ); + } + + #[test] + fn test_load_directory_mixed_extensions() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + + // Create files with different extensions + fs::write( + temp_dir.path().join("file.magic"), + "0 string \\x01\\x02 magic\n", + ) + .expect("Failed to write .magic file"); + fs::write( + temp_dir.path().join("file.txt"), + "0 string \\x03\\x04 txt\n", + ) + .expect("Failed to write .txt file"); + fs::write(temp_dir.path().join("noext"), "0 string \\x05\\x06 noext\n") + .expect("Failed to write no-ext file"); + + let rules = load_magic_directory(temp_dir.path()) + .expect("Should load all files regardless of extension"); + + assert_eq!( + rules.len(), + 3, + "Should process all files regardless of extension" + ); + + let messages: Vec<&str> = rules.iter().map(|r| r.message.as_str()).collect(); + assert!(messages.contains(&"magic")); + assert!(messages.contains(&"txt")); + assert!(messages.contains(&"noext")); + } + + #[test] + fn test_load_directory_alphabetical_ordering() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + + // Create files in non-alphabetical order - using valid magic syntax with hex escapes + fs::write( + temp_dir.path().join("03-third"), + "0 string \\x07\\x08\\x09 third\n", + ) + .expect("Failed to write third file"); + fs::write( + temp_dir.path().join("01-first"), + "0 string \\x01\\x02\\x03 first\n", + ) + .expect("Failed to write first file"); + fs::write( + temp_dir.path().join("02-second"), + "0 string \\x04\\x05\\x06 second\n", + ) + .expect("Failed to write second file"); + + let rules = load_magic_directory(temp_dir.path()).expect("Should load directory in order"); + + assert_eq!(rules.len(), 3); + // Should be sorted alphabetically by filename + assert_eq!(rules[0].message, "first"); + assert_eq!(rules[1].message, "second"); + assert_eq!(rules[2].message, "third"); + } + + // ============================================================ + // Tests for load_magic_file (5+ test cases) + // ============================================================ + + #[test] + fn test_load_magic_file_text_format() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let magic_file = temp_dir.path().join("magic.txt"); + + // Create text magic file with valid content + fs::write(&magic_file, "0 string \\x7fELF ELF executable\n") + .expect("Failed to write magic file"); + + // Load using load_magic_file + let rules = load_magic_file(&magic_file).expect("Failed to load text magic file"); + + assert_eq!(rules.len(), 1); + assert_eq!(rules[0].message, "ELF executable"); + } + + #[test] + fn test_load_magic_file_directory_format() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let magic_dir = temp_dir.path().join("magic.d"); + fs::create_dir(&magic_dir).expect("Failed to create magic directory"); + + // Create multiple files in directory + fs::write( + magic_dir.join("00_elf"), + "0 string \\x7fELF ELF executable\n", + ) + .expect("Failed to write elf file"); + fs::write( + magic_dir.join("01_zip"), + "0 string \\x50\\x4b\\x03\\x04 ZIP archive\n", + ) + .expect("Failed to write zip file"); + + // Load using load_magic_file + let rules = load_magic_file(&magic_dir).expect("Failed to load directory"); + + assert_eq!(rules.len(), 2); + assert_eq!(rules[0].message, "ELF executable"); + assert_eq!(rules[1].message, "ZIP archive"); + } + + #[test] + fn test_load_magic_file_binary_format_error() { + use std::fs::File; + use std::io::Write; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let binary_file = temp_dir.path().join("magic.mgc"); + + // Create binary file with .mgc magic number + let mut file = File::create(&binary_file).expect("Failed to create binary file"); + let magic_number: [u8; 4] = [0x1C, 0x04, 0x1E, 0xF1]; // Little-endian 0xF11E041C + file.write_all(&magic_number) + .expect("Failed to write magic number"); + + // Attempt to load binary file + let result = load_magic_file(&binary_file); + + assert!(result.is_err(), "Should fail to load binary .mgc file"); + + let error = result.unwrap_err(); + let error_msg = error.to_string(); + + // Verify error mentions unsupported format and --use-builtin + assert!( + error_msg.contains("Binary") || error_msg.contains("binary"), + "Error should mention binary format: {error_msg}", + ); + assert!( + error_msg.contains("--use-builtin") || error_msg.contains("built-in"), + "Error should mention --use-builtin option: {error_msg}", + ); + } + + #[test] + fn test_load_magic_file_io_error() { + use std::path::Path; + + // Try to load non-existent file + let non_existent = Path::new("/this/path/should/not/exist/magic.txt"); + let result = load_magic_file(non_existent); + + assert!(result.is_err(), "Should fail for non-existent file"); + } + + #[test] + fn test_load_magic_file_parse_error_propagation() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let invalid_file = temp_dir.path().join("invalid.magic"); + + // Create file with invalid syntax (missing offset) + fs::write(&invalid_file, "string test invalid\n").expect("Failed to write invalid file"); + + // Attempt to load file with parse errors + let result = load_magic_file(&invalid_file); + + assert!(result.is_err(), "Should fail for file with parse errors"); + + // Error should be a parse error (not I/O error) + let error = result.unwrap_err(); + let error_msg = format!("{error:?}"); + assert!( + error_msg.contains("InvalidSyntax") || error_msg.contains("syntax"), + "Error should be parse error: {error_msg}", + ); + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 9d0d47b4..5ecf3994 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -49,9 +49,9 @@ //! //! 1. **Format Detection**: [`detect_format()`] examines the path to determine the file type //! 2. **Dispatch to Handler**: -//! - Text files → [`parse_text_magic_file()`] after reading contents -//! - Directories → [`load_magic_directory()`] to load and merge all files -//! - Binary files → Returns error suggesting `--use-builtin` option +//! - Text files -> [`parse_text_magic_file()`] after reading contents +//! - Directories -> [`load_magic_directory()`] to load and merge all files +//! - Binary files -> Returns error suggesting `--use-builtin` option //! 3. **Return Merged Rules**: All rules are returned in a single `Vec` //! //! # Examples @@ -128,7 +128,11 @@ //! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues pub mod ast; +mod format; pub mod grammar; +mod hierarchy; +mod loader; +pub(crate) mod preprocessing; // Re-export AST types for convenience pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value}; @@ -136,1051 +140,59 @@ pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, Typ // Re-export parser functions for convenience pub use grammar::{parse_number, parse_offset}; -use crate::{ - error::ParseError, - parser::grammar::{ - has_continuation, is_comment_line, is_empty_line, is_strength_directive, parse_comment, - parse_magic_rule, parse_strength_directive, - }, -}; -use std::io::Read; -use std::path::{Path, PathBuf}; +// Re-export format detection and loading +pub use format::{MagicFileFormat, detect_format}; +pub use loader::{load_magic_directory, load_magic_file}; -/// Represents the format of a magic file or directory -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum MagicFileFormat { - /// Text-based magic file (human-readable) - Text, - /// Directory containing multiple magic files (Magdir pattern) - Directory, - /// Binary compiled magic file (.mgc format) - Binary, -} - -/// Detect the format of a magic file or directory -/// -/// This function examines the filesystem metadata and file contents to determine -/// whether the path points to a text magic file, a directory, or a binary .mgc file. -/// -/// # Detection Logic -/// -/// 1. Check if path is a directory → `MagicFileFormat::Directory` -/// 2. Read first 4 bytes and check for binary magic number `0xF11E041C` → `MagicFileFormat::Binary` -/// 3. Otherwise → `MagicFileFormat::Text` -/// -/// # Arguments -/// -/// * `path` - Path to the magic file or directory to detect -/// -/// # Errors -/// -/// Returns `ParseError::IoError` if the path doesn't exist or cannot be read. -/// -/// # Notes -/// -/// This function only detects the format and returns it. It does not validate whether -/// the format is supported by the parser. Higher-level code should check the returned -/// format and decide how to handle unsupported formats (e.g., binary .mgc files). -/// -/// # Examples -/// -/// ```rust,no_run -/// use libmagic_rs::parser::detect_format; -/// use std::path::Path; -/// -/// let format = detect_format(Path::new("/usr/share/file/magic"))?; -/// # Ok::<(), libmagic_rs::ParseError>(()) -/// ``` -pub fn detect_format(path: &Path) -> Result { - // Check if path exists and is accessible - let metadata = std::fs::metadata(path)?; - - // Check if it's a directory - if metadata.is_dir() { - return Ok(MagicFileFormat::Directory); - } - - // Read first 4 bytes to check for binary magic number - let mut file = std::fs::File::open(path)?; - - let mut magic_bytes = [0u8; 4]; - - match file.read_exact(&mut magic_bytes) { - Ok(()) => { - // Check for binary magic number 0xF11E041C in little-endian format - let magic_number = u32::from_le_bytes(magic_bytes); - if magic_number == 0xF11E_041C { - return Ok(MagicFileFormat::Binary); - } - // Not a binary magic file, assume text - Ok(MagicFileFormat::Text) - } - Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => { - // File is too small to be a binary magic file, assume text - Ok(MagicFileFormat::Text) - } - Err(e) => Err(ParseError::invalid_syntax( - 0, - format!("Failed to read magic file: {e}"), - )), - } -} - -/// Internal structure to track line metadata during preprocessing. -/// -/// Stores the processed content, original line number, and flags for comment -/// and strength directive lines in the input magic file. -#[derive(Debug)] -struct LineInfo { - content: String, - line_number: usize, - is_comment: bool, - /// Optional strength modifier parsed from `!:strength` directive - strength_modifier: Option, -} - -impl LineInfo { - fn new(content: String, line_number: usize, is_comment: bool) -> Self { - Self { - content, - line_number, - is_comment, - strength_modifier: None, - } - } - - fn with_strength( - content: String, - line_number: usize, - strength_modifier: StrengthModifier, - ) -> Self { - Self { - content, - line_number, - is_comment: false, - strength_modifier: Some(strength_modifier), - } - } -} - -/// Preprocesses raw magic file input by handling comments, empty lines, and continuations. -/// -/// This function performs the following transformations: -/// - Removes empty lines from the input -/// - Handles comment lines (lines starting with '#') -/// - Processes line continuations (lines ending with '\') -/// - Concatenates continued lines into single entries -/// - Preserves original line numbers for error reporting (continued lines -/// are assigned the line number of the first line in the continuation sequence) -/// -/// # Arguments -/// -/// * `input` - The raw magic file content as a string -/// -/// # Returns -/// -/// `Result, ParseError>` - A vector of processed lines or a parse error -/// -/// # Errors -/// -/// Returns an error if: -/// - Comment lines cannot be parsed -/// - Input ends with an unterminated line continuation -/// - The input is malformed -/// -/// # Examples -/// -/// ```ignore -/// let input = r#"0 string 0 Test -/// >4 byte 1 Child"#; -/// let lines = preprocess_lines(input)?; -/// assert_eq!(lines.len(), 2); -/// # Ok::<(), Box>(()) -/// ``` -fn preprocess_lines(input: &str) -> Result, ParseError> { - let mut lines_info: Vec = Vec::new(); - let mut line_buf = String::new(); - let mut start_line_number: Option = None; - for (i, mut line) in input.lines().enumerate() { - if is_empty_line(line) { - continue; - } - if is_comment_line(line) { - // Bug 1 fix: If we have an ongoing continuation, discard it before processing comment - if !line_buf.is_empty() { - line_buf.clear(); - start_line_number = None; - } - let parsed_comment = parse_comment(line) - .map_err(|_| ParseError::invalid_syntax(i + 1, "Unable to parse comment"))?; - line = parsed_comment.1.as_str(); - lines_info.push(LineInfo::new(line.trim().to_string(), i + 1, true)); - continue; - } - // Handle strength directives (!:strength ...) - if is_strength_directive(line) { - // If we have an ongoing continuation, discard it before processing directive - if !line_buf.is_empty() { - line_buf.clear(); - start_line_number = None; - } - let strength_modifier = parse_strength_directive(line) - .map_err(|e| { - ParseError::invalid_syntax( - i + 1, - format!("Failed to parse strength directive: {e}"), - ) - })? - .1; - lines_info.push(LineInfo::with_strength( - line.trim().to_string(), - i + 1, - strength_modifier, - )); - continue; - } - // Track the starting line number when we begin accumulating a rule - if start_line_number.is_none() { - start_line_number = Some(i + 1); - } - line_buf.push_str(line.trim()); - if has_continuation(line) { - if let Some(stripped) = line_buf.strip_suffix('\\') { - line_buf = stripped.to_string(); - } - continue; - } - // Bug 2 fix: Use the stored starting line number instead of calculating from cont_ctr - let rule_line_number = start_line_number.unwrap_or(i + 1); - lines_info.push(LineInfo::new( - std::mem::take(&mut line_buf), - rule_line_number, - false, - )); - start_line_number = None; - } - - // Handle unterminated continuation at end of input - if !line_buf.is_empty() { - let last_line = input.lines().count(); - return Err(ParseError::invalid_syntax( - last_line, - "Unterminated line continuation", - )); - } - - Ok(lines_info) -} - -/// Parses a single magic rule line into a `MagicRule` AST node. -/// -/// This function takes a preprocessed `LineInfo` and converts it into a `MagicRule` -/// by delegating to the grammar parser. It handles error mapping to include -/// context about which line failed. -/// -/// # Arguments -/// -/// * `line` - The `LineInfo` struct containing the rule text and metadata -/// -/// # Returns -/// -/// `Result` - The parsed rule or a parse error -/// -/// # Errors -/// -/// Returns an error if: -/// - The line is marked as a comment -/// - The rule syntax is invalid -/// - Required fields are missing -/// - Value parsing fails -/// -/// # Examples -/// -/// ```ignore -/// let line = LineInfo::new("0 string 0 Test".to_string(), 1, false); -/// let rule = parse_magic_rule_line(&line)?; -/// assert_eq!(rule.level, 0); -/// # Ok::<(), Box>(()) -/// ``` -fn parse_magic_rule_line(line: &LineInfo) -> Result { - if line.is_comment { - return Err(ParseError::invalid_syntax( - line.line_number, - "Comment lines cannot be parsed as rules", - )); - } - parse_magic_rule(&line.content) - .map_err(|e| { - ParseError::invalid_syntax(line.line_number, format!("Failed to parse rule: {e}")) - }) - .map(|(_, rule)| rule) -} - -/// Builds a hierarchical structure from a flat list of parsed magic rules. -/// -/// This function establishes parent-child relationships based on indentation levels. -/// Rules at deeper indentation levels become children of the most recent rule at a -/// shallower level. This implements a stack-based algorithm for hierarchy construction. -/// -/// # Arguments -/// -/// * `lines` - A vector of preprocessed `LineInfo` structs -/// -/// # Returns -/// -/// `Result, ParseError>` - Root-level rules with children attached -/// -/// # Behavior -/// -/// - Rules with `level=0` are root rules -/// - Rules with `level=1` become children of the most recent `level=0` rule -/// - Rules with `level=2` become children of the most recent `level=1` rule -/// - When indentation decreases, the stack is unwound and completed rules are attached -/// - Orphaned child rules (starting with '>' but with no preceding parent) are -/// added to the root list with their hierarchy level preserved -/// -/// # Errors -/// -/// Returns an error if: -/// - Any line contains invalid magic rule syntax -/// - Rule parsing fails (propagated from `parse_magic_rule_line`) -/// -/// # Examples -/// -/// ```ignore -/// let lines = vec![ -/// LineInfo::new("0 string 0 ELF".to_string(), 1, false), -/// LineInfo::new(">4 byte 1 32-bit".to_string(), 2, false), -/// ]; -/// let rules = build_rule_hierarchy(lines)?; -/// assert_eq!(rules[0].children.len(), 1); -/// # Ok::<(), Box>(()) -/// ``` -fn build_rule_hierarchy(lines: Vec) -> Result, ParseError> { - /// Helper to pop a rule from the stack and attach it to its parent or roots - fn pop_and_attach(stack: &mut Vec, roots: &mut Vec) { - if let Some(completed) = stack.pop() { - if let Some(parent) = stack.last_mut() { - parent.children.push(completed); - } else { - roots.push(completed); - } - } - } - - let mut stack: Vec = Vec::new(); - let mut roots: Vec = Vec::new(); - let mut pending_strength: Option = None; - - for line in lines { - if line.is_comment { - continue; - } - - // Handle strength directive: store modifier for next rule - if line.strength_modifier.is_some() { - pending_strength = line.strength_modifier; - continue; - } - - let mut rule = parse_magic_rule_line(&line)?; - - // Apply pending strength modifier to this rule - if pending_strength.is_some() { - rule.strength_modifier = pending_strength.take(); - } - - // Unwind stack until we find a parent with lower level - while stack.last().is_some_and(|top| top.level >= rule.level) { - pop_and_attach(&mut stack, &mut roots); - } - - stack.push(rule); - } - - // Unwind remaining stack - while !stack.is_empty() { - pop_and_attach(&mut stack, &mut roots); - } - - Ok(roots) -} - -/// Parses a complete magic file from raw text input. -/// -/// This is the main public-facing parser function that orchestrates the complete -/// parsing pipeline: preprocessing, parsing individual rules, and building the -/// hierarchical structure. -/// -/// # Arguments -/// -/// * `input` - The raw magic file content as a string -/// -/// # Returns -/// -/// `Result, ParseError>` - A vector of root rules with nested children -/// -/// # Errors -/// -/// Returns an error if any stage of parsing fails: -/// - Preprocessing errors -/// - Rule parsing errors -/// - Hierarchy building errors -/// -/// # Example -/// -/// ```ignore -/// use libmagic_rs::parser::parse_text_magic_file; -/// -/// let magic = r#"0 string \x7fELF ELF file -/// >4 byte 1 32-bit -/// >4 byte 2 64-bit"#; -/// -/// let rules = parse_text_magic_file(magic)?; -/// assert_eq!(rules.len(), 1); -/// assert_eq!(rules[0].message, "ELF file"); -/// # Ok::<(), Box>(()) -/// ``` -pub fn parse_text_magic_file(input: &str) -> Result, ParseError> { - let lines = preprocess_lines(input)?; - build_rule_hierarchy(lines) -} - -/// Loads and parses all magic files from a directory, merging them into a single rule set. -/// -/// This function reads all regular files in the specified directory, parses each as a magic file, -/// and combines the resulting rules into a single `Vec`. Files are processed in -/// alphabetical order by filename to ensure deterministic results. -/// -/// # Error Handling Strategy -/// -/// This function distinguishes between critical and non-critical errors: -/// -/// - **Critical errors** (I/O failures, directory access issues, encoding problems): -/// These cause immediate failure and return a `ParseError`. The function stops processing -/// and propagates the error to the caller. -/// -/// - **Non-critical errors** (individual file parse failures): -/// These are logged to stderr with a warning message and the file is skipped. Processing -/// continues with remaining files. -/// -/// # Behavior -/// -/// - Subdirectories are skipped (not recursively processed) -/// - Symbolic links are skipped -/// - Empty directories return an empty rules vector -/// - Files are processed in alphabetical order by filename -/// - All successfully parsed rules are merged in order -/// -/// # Examples -/// -/// Loading a directory of magic files: -/// -/// ```rust,no_run -/// use libmagic_rs::parser::load_magic_directory; -/// use std::path::Path; -/// -/// let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?; -/// println!("Loaded {} rules from directory", rules.len()); -/// # Ok::<(), libmagic_rs::ParseError>(()) -/// ``` -/// -/// Creating a Magdir-style directory structure: -/// -/// ```rust,no_run -/// use libmagic_rs::parser::load_magic_directory; -/// use std::path::Path; -/// -/// // Directory structure: -/// // magic.d/ -/// // ├── 01-elf -/// // ├── 02-archive -/// // └── 03-text -/// -/// let rules = load_magic_directory(Path::new("./magic.d"))?; -/// // Rules from all three files are merged in alphabetical order -/// # Ok::<(), libmagic_rs::ParseError>(()) -/// ``` -/// -/// # Errors -/// -/// Returns `ParseError` if: -/// - The directory does not exist or cannot be accessed -/// - Directory entries cannot be read -/// - A file cannot be read due to I/O errors -/// - A file contains invalid UTF-8 encoding -/// -/// # Panics -/// -/// This function does not panic under normal operation. -#[allow(clippy::print_stderr)] -pub fn load_magic_directory(dir_path: &Path) -> Result, ParseError> { - use std::fs; - - // Read directory entries - let entries = fs::read_dir(dir_path).map_err(|e| { - ParseError::invalid_syntax( - 0, - format!("Failed to read directory '{}': {}", dir_path.display(), e), - ) - })?; - - // Collect and sort entries by filename for deterministic ordering - let mut file_paths: Vec = Vec::new(); - for entry in entries { - let entry = entry.map_err(|e| { - ParseError::invalid_syntax( - 0, - format!( - "Failed to read directory entry in '{}': {}", - dir_path.display(), - e - ), - ) - })?; - - let path = entry.path(); - let file_type = entry.file_type().map_err(|e| { - ParseError::invalid_syntax( - 0, - format!("Failed to read file type for '{}': {}", path.display(), e), - ) - })?; - - // Only process regular files, skip directories and symlinks - if file_type.is_file() && !file_type.is_symlink() { - file_paths.push(path); - } - } - - // Sort by filename for deterministic ordering - file_paths.sort_by_key(|path| path.file_name().map(std::ffi::OsStr::to_os_string)); - - // Accumulate rules from all files - let mut all_rules = Vec::new(); - let mut parse_failures: Vec<(PathBuf, ParseError)> = Vec::new(); - let file_count = file_paths.len(); - - for path in file_paths { - // Read file contents - let contents = match fs::read_to_string(&path) { - Ok(contents) => contents, - Err(e) => { - // I/O errors are critical - return Err(ParseError::invalid_syntax( - 0, - format!("Failed to read file '{}': {}", path.display(), e), - )); - } - }; - - // Parse the file - match parse_text_magic_file(&contents) { - Ok(rules) => { - // Successfully parsed - merge rules - all_rules.extend(rules); - } - Err(e) => { - // Track parse failures for reporting - parse_failures.push((path, e)); - } - } - } - - // If all files failed to parse, return an error - if all_rules.is_empty() && !parse_failures.is_empty() { - use std::fmt::Write; - - let failure_details: Vec = parse_failures - .iter() - .take(3) // Limit to first 3 failures for brevity - .map(|(path, e)| format!(" - {}: {}", path.display(), e)) - .collect(); - - let mut message = format!("All {file_count} magic file(s) in directory failed to parse"); - if !failure_details.is_empty() { - message.push_str(":\n"); - message.push_str(&failure_details.join("\n")); - if parse_failures.len() > 3 { - let _ = write!(message, "\n ... and {} more", parse_failures.len() - 3); - } - } - - return Err(ParseError::invalid_syntax(0, message)); - } - - // Log warnings for partial failures (some files parsed, some failed) - // Note: Using eprintln for now; consider a logging framework in the future - #[allow(clippy::print_stderr)] - for (path, e) in &parse_failures { - eprintln!("Warning: Failed to parse '{}': {}", path.display(), e); - } - - Ok(all_rules) -} - -/// Loads magic rules from a file or directory, automatically detecting the format. -/// -/// This is the unified entry point for loading magic rules from the filesystem. It -/// automatically detects whether the path points to a text magic file, a directory -/// containing magic files, or a binary compiled magic file, and dispatches to the -/// appropriate handler. -/// -/// # Format Detection and Handling -/// -/// The function uses [`detect_format()`] to determine the file type and handles each -/// format as follows: -/// -/// - **Text format**: Reads the file contents and parses using [`parse_text_magic_file()`] -/// - **Directory format**: Loads all magic files from the directory using [`load_magic_directory()`] -/// - **Binary format**: Returns an error with guidance to use the `--use-builtin` option -/// -/// # Arguments -/// -/// * `path` - Path to a magic file or directory. Can be absolute or relative. -/// -/// # Returns -/// -/// Returns `Ok(Vec)` containing all successfully parsed magic rules. For -/// directories, rules from all files are merged in alphabetical order by filename. -/// -/// # Errors -/// -/// This function returns a [`ParseError`] in the following cases: -/// -/// - **File not found**: The specified path does not exist -/// - **Unsupported format**: The file is a binary compiled magic file (`.mgc`) -/// - **Parse errors**: The magic file contains syntax errors or invalid rules -/// - **I/O errors**: File system errors during reading (permissions, disk errors, etc.) -/// -/// # Examples -/// -/// ## Loading a text magic file -/// -/// ```no_run -/// use libmagic_rs::parser::load_magic_file; -/// use std::path::Path; -/// -/// let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?; -/// println!("Loaded {} magic rules", rules.len()); -/// # Ok::<(), libmagic_rs::ParseError>(()) -/// ``` -/// -/// ## Loading a directory of magic files -/// -/// ```no_run -/// use libmagic_rs::parser::load_magic_file; -/// use std::path::Path; -/// -/// let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?; -/// println!("Loaded {} rules from directory", rules.len()); -/// # Ok::<(), libmagic_rs::ParseError>(()) -/// ``` -/// -/// ## Handling binary format errors -/// -/// ```no_run -/// use libmagic_rs::parser::load_magic_file; -/// use std::path::Path; -/// -/// match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) { -/// Ok(rules) => println!("Loaded {} rules", rules.len()), -/// Err(e) => { -/// eprintln!("Error loading magic file: {}", e); -/// eprintln!("Hint: Use --use-builtin for binary files"); -/// } -/// } -/// # Ok::<(), libmagic_rs::ParseError>(()) -/// ``` -/// -/// # Security -/// -/// This function delegates to [`parse_text_magic_file()`] or [`load_magic_directory()`] -/// based on format detection. Security considerations are handled by those functions: -/// -/// - Rule hierarchy depth is bounded during parsing -/// - Invalid syntax is rejected with descriptive errors -/// - Binary `.mgc` files are rejected (not parsed) -/// -/// Note: File size limits and memory exhaustion protection are not currently implemented. -/// Large magic files will be loaded entirely into memory. -/// -/// # See Also -/// -/// - [`detect_format()`] - Format detection logic -/// - [`parse_text_magic_file()`] - Text file parser -/// - [`load_magic_directory()`] - Directory loader -pub fn load_magic_file(path: &Path) -> Result, ParseError> { - // Detect the magic file format - let format = detect_format(path)?; - - // Dispatch to appropriate handler based on format - match format { - MagicFileFormat::Text => { - // Read file contents and parse as text magic file - let content = std::fs::read_to_string(path)?; - parse_text_magic_file(&content) - } - MagicFileFormat::Directory => { - // Load all magic files from directory - load_magic_directory(path) - } - MagicFileFormat::Binary => { - // Binary compiled magic files are not supported - Err(ParseError::unsupported_format( - 0, - "binary .mgc file", - "Binary compiled magic files (.mgc) are not supported for parsing.\n\ - Use the --use-builtin option to use the built-in magic rules instead,\n\ - or provide a text-based magic file or directory.", - )) - } - } -} - -#[cfg(test)] -mod unit_tests { - use super::*; - - fn li(line_number: usize, content: &str) -> LineInfo { - LineInfo { - content: content.to_string(), - line_number, - is_comment: false, - strength_modifier: None, - } - } - - fn li_comment(line_number: usize, content: &str) -> LineInfo { - LineInfo { - content: content.to_string(), - line_number, - is_comment: true, - strength_modifier: None, - } - } - - // ============================================================ - // Tests for parse_magic_rule_line (10+ test cases) - // ============================================================ - - #[test] - fn test_parse_magic_rule_line_simple_string() { - let line = li(1, "0 string \\x7fELF ELF executable"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.level, 0); - assert_eq!(rule.message, "ELF executable"); - } - - #[test] - fn test_parse_magic_rule_line_byte_type() { - let line = li(1, "0 byte 1 ELF"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.level, 0); - assert!(matches!(rule.typ, TypeKind::Byte)); - } - - #[test] - fn test_parse_magic_rule_line_with_child_indentation() { - let line = li(2, ">4 byte 1 32-bit"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.level, 1); - } - - #[test] - fn test_parse_magic_rule_line_deep_indentation() { - let line = li(3, ">>>8 long = 0x12345678 Complex match"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.level, 3); - } - - #[test] - fn test_parse_magic_rule_line_not_equal_operator() { - let line = li(1, "0 byte != 0 Non-zero"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.op, Operator::NotEqual); - } - - #[test] - fn test_parse_magic_rule_line_greater_operator() { - let line = li(1, "0 long = 1000 Number"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.op, Operator::Equal); - } - - #[test] - fn test_parse_magic_rule_line_less_operator() { - let line = li(1, "0 long != 256 Not equal"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.op, Operator::NotEqual); - } - - #[test] - fn test_parse_magic_rule_line_bitwise_and_operator() { - let line = li(1, "0 byte & 0xFF Bitmask"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.op, Operator::BitwiseAnd); - } - - #[test] - fn test_parse_magic_rule_line_comment_line_error() { - let line = li_comment(1, "This is a comment"); - let result = parse_magic_rule_line(&line); - assert!(result.is_err()); - } - - #[test] - fn test_parse_magic_rule_line_hex_offset() { - let line = li(1, "0x100 byte 1 PDF document"); - let rule = parse_magic_rule_line(&line).unwrap(); - match rule.offset { - OffsetSpec::Absolute(offset) => assert_eq!(offset, 0x100), - _ => panic!("Expected absolute offset"), - } - } - - #[test] - fn test_parse_magic_rule_line_string_with_spaces() { - let line = li(1, "0 byte 1 Long message with multiple words"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert_eq!(rule.message, "Long message with multiple words"); - } - - #[test] - fn test_parse_magic_rule_line_short_type() { - let line = li(1, "0 short 0x4d5a MS-DOS executable"); - let rule = parse_magic_rule_line(&line).unwrap(); - assert!(matches!(rule.typ, TypeKind::Short { .. })); - } - - // ============================================================ - // Tests for preprocess_lines (10+ test cases) - // ============================================================ - - #[test] - fn test_preprocess_lines_single_rule() { - let input = "0 string 0 Test"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 1); - assert_eq!(lines[0].content, "0 string 0 Test"); - assert!(!lines[0].is_comment); - } - - #[test] - fn test_preprocess_lines_multiple_rules() { - let input = "0 string 0 Test\n0 byte 1 Byte"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 2); - assert_eq!(lines[0].content, "0 string 0 Test"); - assert_eq!(lines[1].content, "0 byte 1 Byte"); - } - - #[test] - fn test_preprocess_lines_with_comments() { - let input = "# Comment\n0 string 0 Test"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 2); - assert!(lines[0].is_comment); - assert!(!lines[1].is_comment); - } - - #[test] - fn test_preprocess_lines_empty_lines() { - let input = "0 string 0 Test\n\n0 byte 1 Byte"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 2); - } - - #[test] - fn test_preprocess_lines_leading_empty_lines() { - let input = "\n\n0 string 0 Test"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 1); - assert_eq!(lines[0].content, "0 string 0 Test"); - } - - #[test] - fn test_preprocess_lines_trailing_empty_lines() { - let input = "0 string 0 Test\n\n"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 1); - } - - #[test] - fn test_preprocess_lines_line_continuation() { - let input = "0 string 0 Long message \\\ncontinued here"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 1); - assert_eq!(lines[0].content, "0 string 0 Long message continued here"); - } - - #[test] - fn test_preprocess_lines_multiple_continuations() { - let input = "0 string 0 Multi \\\nline \\\ncontinuation"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 1); - assert_eq!(lines[0].content, "0 string 0 Multi line continuation"); - } - - #[test] - fn test_preprocess_lines_mixed_comments_and_rules() { - let input = "# Header\n0 string 0 Test\n# Another comment\n>4 byte 1 Child"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 4); - assert!(lines[0].is_comment); - assert!(!lines[1].is_comment); - assert!(lines[2].is_comment); - assert!(!lines[3].is_comment); - } - - #[test] - fn test_preprocess_lines_preserves_line_numbers() { - let input = "0 string 0 Test\n>4 byte 1 Child"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines[0].line_number, 1); - assert_eq!(lines[1].line_number, 2); - } - - #[test] - fn test_preprocess_lines_empty_input() { - let input = ""; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 0); - } - - #[test] - fn test_preprocess_lines_only_comments() { - let input = "# Comment 1\n# Comment 2\n# Comment 3"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 3); - assert!(lines.iter().all(|l| l.is_comment)); - } - - // ============================================================ - // Tests for build_rule_hierarchy (10+ test cases) - // ============================================================ - - #[test] - fn test_build_rule_hierarchy_single_root() { - let lines = vec![li(1, "0 string \\x7fELF ELF executable")]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 1); - assert_eq!(roots[0].level, 0); - } - - #[test] - fn test_build_rule_hierarchy_root_with_one_child() { - let lines = vec![ - li(1, "0 string \\x7fELF ELF executable"), - li(2, ">4 byte 1 32-bit"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 1); - assert_eq!(roots[0].children.len(), 1); - } - - #[test] - fn test_build_rule_hierarchy_root_with_multiple_children() { - let lines = vec![ - li(1, "0 string \\x7fELF ELF executable"), - li(2, ">4 byte 1 32-bit"), - li(3, ">4 byte 2 64-bit"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 1); - assert_eq!(roots[0].children.len(), 2); - } - - #[test] - fn test_build_rule_hierarchy_nested_three_levels() { - let lines = vec![ - li(1, "0 string \\x7fELF ELF executable"), - li(2, ">4 byte 1 class"), - li(3, ">>5 byte 1 subtype"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots[0].children.len(), 1); - assert_eq!(roots[0].children[0].children.len(), 1); - assert_eq!(roots[0].children[0].children[0].level, 2); - } - - #[test] - fn test_build_rule_hierarchy_multiple_roots() { - let lines = vec![ - li(1, r#"0 string "ELF" "ELF executable""#), - li(2, r#"0 string "%PDF" "PDF document""#), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 2); - } - - #[test] - fn test_build_rule_hierarchy_sibling_rules() { - let lines = vec![ - li(1, "0 byte 1 Root"), - li(2, ">4 byte 1 Child1"), - li(3, ">4 byte 2 Child2"), - li(4, "0 byte 2 Root2"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 2); - assert_eq!(roots[0].children.len(), 2); - } - - #[test] - fn test_build_rule_hierarchy_deep_nesting() { - let lines = vec![ - li(1, "0 byte 1 L0"), - li(2, ">4 byte 1 L1"), - li(3, ">>5 byte 2 L2"), - li(4, ">>>6 byte 3 L3"), - li(5, ">>>>7 byte 4 L4"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 1); - assert_eq!( - roots[0].children[0].children[0].children[0].children.len(), - 1 - ); - } +// Internal re-exports for sibling modules and tests +pub(crate) use hierarchy::build_rule_hierarchy; +pub(crate) use preprocessing::preprocess_lines; - #[test] - fn test_build_rule_hierarchy_return_to_root_level() { - let lines = vec![ - li(1, "0 byte 1 Root1"), - li(2, ">4 byte 1 Child"), - li(3, "0 byte 2 Root2"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 2); - assert_eq!(roots[0].children.len(), 1); - assert_eq!(roots[1].children.len(), 0); - } +use crate::error::ParseError; - #[test] - fn test_build_rule_hierarchy_orphaned_child() { - let lines = vec![li(1, ">4 byte 1 Orphaned child")]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 1); - assert_eq!(roots[0].level, 1); - } +/// Parses a complete magic file from raw text input. +/// +/// This is the main public-facing parser function that orchestrates the complete +/// parsing pipeline: preprocessing, parsing individual rules, and building the +/// hierarchical structure. +/// +/// # Arguments +/// +/// * `input` - The raw magic file content as a string +/// +/// # Returns +/// +/// `Result, ParseError>` - A vector of root rules with nested children +/// +/// # Errors +/// +/// Returns an error if any stage of parsing fails: +/// - Preprocessing errors +/// - Rule parsing errors +/// - Hierarchy building errors +/// +/// # Example +/// +/// ```ignore +/// use libmagic_rs::parser::parse_text_magic_file; +/// +/// let magic = r#"0 string \x7fELF ELF file +/// >4 byte 1 32-bit +/// >4 byte 2 64-bit"#; +/// +/// let rules = parse_text_magic_file(magic)?; +/// assert_eq!(rules.len(), 1); +/// assert_eq!(rules[0].message, "ELF file"); +/// # Ok::<(), Box>(()) +/// ``` +pub fn parse_text_magic_file(input: &str) -> Result, ParseError> { + let lines = preprocess_lines(input)?; + build_rule_hierarchy(lines) +} - #[test] - fn test_build_rule_hierarchy_complex_structure() { - let lines = vec![ - li(1, "0 byte 1 Root1"), - li(2, ">4 byte 1 C1"), - li(3, ">4 byte 2 C2"), - li(4, ">>6 byte 3 GC1"), - li(5, "0 byte 2 Root2"), - li(6, ">4 byte 4 C3"), - ]; - let roots = build_rule_hierarchy(lines).unwrap(); - assert_eq!(roots.len(), 2); - assert_eq!(roots[0].children.len(), 2); - assert_eq!(roots[0].children[1].children.len(), 1); - assert_eq!(roots[1].children.len(), 1); - } +#[cfg(test)] +mod unit_tests { + use super::*; // ============================================================ // Tests for parse_text_magic_file (10+ test cases) @@ -1427,7 +439,7 @@ continued"; } // ============================================================ - // Overflow protection tests (from pr-test-analyzer) + // Overflow protection tests // ============================================================ #[test] @@ -1463,63 +475,9 @@ continued"; } // ============================================================ - // Continuation edge case tests (from pr-test-analyzer) - // ============================================================ - - #[test] - fn test_continuation_at_eof() { - // Continuation on last line with no following line - should error - let input = "0 string 0 Test \\"; - let result = preprocess_lines(input); - assert!( - result.is_err(), - "Should error on unterminated continuation at EOF" - ); - let err = result.unwrap_err(); - assert!( - format!("{err:?}").contains("Unterminated"), - "Error should mention unterminated continuation" - ); - } - - #[test] - fn test_continuation_with_empty_next() { - // Empty line after continuation causes unterminated continuation - // (empty lines are skipped but continuation state persists) - let input = "0 string 0 Test \\\n\n0 byte 1 Next"; - let lines = preprocess_lines(input).unwrap(); - // The continuation carries through the empty line, so "Next" gets appended - assert_eq!(lines.len(), 1); - assert_eq!(lines[0].content, "0 string 0 Test 0 byte 1 Next"); - } - - #[test] - fn test_continuation_into_empty_then_rule() { - let input = "0 string 0 First \\\n\ncontinued"; - let lines = preprocess_lines(input).unwrap(); - assert_eq!(lines.len(), 1); - assert_eq!(lines[0].content, "0 string 0 First continued"); - } - - // ============================================================ - // Line number accuracy tests (from pr-test-analyzer) + // Line number accuracy test (uses parse_text_magic_file) // ============================================================ - #[test] - fn test_line_numbers_with_continuations() { - let input = "0 string 0 test1\n0 string 0 multi \\\nline \\\ntest\n0 string 0 test2"; - let lines = preprocess_lines(input).unwrap(); - - // Line 1: "0 string 0 test1" should report line 1 - assert_eq!(lines[0].line_number, 1); - - // Line 2-4 continuation should report line 2 (first line of continuation) - assert_eq!(lines[1].line_number, 2); - - // Line 5: "0 string 0 test2" should report line 5 - assert_eq!(lines[2].line_number, 5); - } - #[test] fn test_error_reports_correct_line_for_continuation() { // When a continued rule fails to parse, error should show the starting line @@ -1538,66 +496,6 @@ continued"; Ok(_) => panic!("Expected InvalidSyntax error"), } } - - #[test] - fn test_line_numbers_with_mixed_content() { - let input = "# Comment line 1\n0 string 0 rule1\n\n# Another comment\n0 string 0 rule2 \\\ncontinued"; - let lines = preprocess_lines(input).unwrap(); - - assert_eq!(lines.len(), 4); - assert_eq!(lines[0].line_number, 1); // Comment - assert_eq!(lines[1].line_number, 2); // rule1 - assert_eq!(lines[2].line_number, 4); // Another comment - assert_eq!(lines[3].line_number, 5); // rule2 (continued on line 6) - } - - // ============================================================ - // Bug reproduction tests - // ============================================================ - - #[test] - fn test_bug1_comment_during_continuation() { - // Bug 1: Comment during continuation should not corrupt line_buf - // The partial rule should be discarded, leaving only the comment and new rule - let input = "0 string 0 Partial rule \\\n# This is a comment\n0 byte 1 New rule"; - let lines = preprocess_lines(input).unwrap(); - - // The partial rule is discarded, so we should have 2 lines: comment and new rule - assert_eq!(lines.len(), 2); - // The comment should be separate and not contain rule content - let comment_line = lines.iter().find(|l| l.is_comment).unwrap(); - assert!(!comment_line.content.contains("Partial rule")); - assert_eq!(comment_line.content, "This is a comment"); - // The new rule should be intact - let rule_line = lines - .iter() - .find(|l| !l.is_comment && l.content.contains("New rule")) - .unwrap(); - assert_eq!(rule_line.content, "0 byte 1 New rule"); - } - - #[test] - fn test_bug2_empty_line_in_continuation() { - // Bug 2: Empty line in continuation should not break line number calculation - let input = "0 string 0 Test \\\n\ncontinued here"; - let lines = preprocess_lines(input).unwrap(); - - assert_eq!(lines.len(), 1); - // Line number should point to line 1 (where the rule started), not line 3 - assert_eq!(lines[0].line_number, 1); - assert_eq!(lines[0].content, "0 string 0 Test continued here"); - } - - #[test] - fn test_bug2_multiple_empty_lines_in_continuation() { - // Multiple empty lines in continuation - let input = "0 string 0 Test \\\n\n\ncontinued here"; - let lines = preprocess_lines(input).unwrap(); - - assert_eq!(lines.len(), 1); - // Line number should still point to line 1 - assert_eq!(lines[0].line_number, 1); - } } #[cfg(test)] @@ -1672,425 +570,3 @@ mod output_test { } } } - -#[cfg(test)] -mod format_detection_tests { - use super::*; - use std::fs; - use std::io::Write; - - #[test] - fn test_detect_format_text_file() { - let temp_dir = std::env::temp_dir(); - let text_file = temp_dir.join("test_text_magic.txt"); - fs::write(&text_file, "# Magic file\n0 string test Test").unwrap(); - - let format = detect_format(&text_file).unwrap(); - assert_eq!(format, MagicFileFormat::Text); - - fs::remove_file(&text_file).unwrap(); - } - - #[test] - fn test_detect_format_directory() { - let temp_dir = std::env::temp_dir().join("test_magic_dir"); - fs::create_dir_all(&temp_dir).unwrap(); - - let format = detect_format(&temp_dir).unwrap(); - assert_eq!(format, MagicFileFormat::Directory); - - fs::remove_dir_all(&temp_dir).unwrap(); - } - - #[test] - fn test_detect_format_binary_mgc() { - let temp_dir = std::env::temp_dir(); - let binary_file = temp_dir.join("test_binary.mgc"); - - // Write binary magic number 0xF11E041C in little-endian - let mut file = fs::File::create(&binary_file).unwrap(); - file.write_all(&[0x1C, 0x04, 0x1E, 0xF1]).unwrap(); - file.write_all(b"additional binary data").unwrap(); - - let result = detect_format(&binary_file); - assert!(result.is_ok()); - - match result.unwrap() { - MagicFileFormat::Binary => { - // Expected result - } - other => panic!("Expected Binary format, got {other:?}"), - } - - fs::remove_file(&binary_file).unwrap(); - } - - #[test] - fn test_detect_format_nonexistent_path() { - let nonexistent = std::env::temp_dir().join("nonexistent_magic_file.txt"); - - let result = detect_format(&nonexistent); - assert!(result.is_err()); - - match result.unwrap_err() { - ParseError::IoError(e) => { - assert_eq!(e.kind(), std::io::ErrorKind::NotFound); - } - other => panic!("Expected IoError, got: {other:?}"), - } - } - - #[test] - fn test_detect_format_empty_file() { - let temp_dir = std::env::temp_dir(); - let empty_file = temp_dir.join("test_empty_magic.txt"); - fs::write(&empty_file, "").unwrap(); - - // Empty files should be detected as text (too small for binary magic) - let format = detect_format(&empty_file).unwrap(); - assert_eq!(format, MagicFileFormat::Text); - - fs::remove_file(&empty_file).unwrap(); - } - - #[test] - fn test_detect_format_small_file() { - let temp_dir = std::env::temp_dir(); - let small_file = temp_dir.join("test_small_magic.txt"); - fs::write(&small_file, "ab").unwrap(); // Only 2 bytes - - // Small files should be detected as text - let format = detect_format(&small_file).unwrap(); - assert_eq!(format, MagicFileFormat::Text); - - fs::remove_file(&small_file).unwrap(); - } - - #[test] - fn test_detect_format_text_with_binary_content() { - let temp_dir = std::env::temp_dir(); - let binary_text_file = temp_dir.join("test_binary_text.txt"); - - // Write binary data that's NOT the magic number - let mut file = fs::File::create(&binary_text_file).unwrap(); - file.write_all(&[0xFF, 0xFE, 0xFD, 0xFC]).unwrap(); - file.write_all(b"some text").unwrap(); - - // Should be detected as text (wrong magic number) - let format = detect_format(&binary_text_file).unwrap(); - assert_eq!(format, MagicFileFormat::Text); - - fs::remove_file(&binary_text_file).unwrap(); - } - - #[test] - fn test_magic_file_format_enum_equality() { - assert_eq!(MagicFileFormat::Text, MagicFileFormat::Text); - assert_eq!(MagicFileFormat::Directory, MagicFileFormat::Directory); - assert_eq!(MagicFileFormat::Binary, MagicFileFormat::Binary); - - assert_ne!(MagicFileFormat::Text, MagicFileFormat::Directory); - assert_ne!(MagicFileFormat::Text, MagicFileFormat::Binary); - assert_ne!(MagicFileFormat::Directory, MagicFileFormat::Binary); - } - - #[test] - fn test_magic_file_format_debug() { - let text_format = MagicFileFormat::Text; - let debug_str = format!("{text_format:?}"); - assert!(debug_str.contains("Text")); - } - - #[test] - fn test_magic_file_format_clone() { - let original = MagicFileFormat::Directory; - let cloned = original; - assert_eq!(original, cloned); - } - - #[test] - fn test_magic_file_format_copy() { - let original = MagicFileFormat::Binary; - let copied = original; // Copy trait allows this - assert_eq!(original, copied); - } - - // ============================================================ - // Tests for load_magic_directory (6+ test cases) - // ============================================================ - - #[test] - fn test_load_directory_critical_error_io() { - use std::path::Path; - - let non_existent = Path::new("/this/should/not/exist/anywhere/at/all"); - let result = load_magic_directory(non_existent); - - assert!( - result.is_err(), - "Should return error for non-existent directory" - ); - let err = result.unwrap_err(); - assert!(err.to_string().contains("Failed to read directory")); - } - - #[test] - fn test_load_directory_non_critical_error_parse() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - - // Create a valid file - let valid_path = temp_dir.path().join("valid.magic"); - fs::write(&valid_path, "0 string \\x01\\x02 valid\n").expect("Failed to write valid file"); - - // Create an invalid file - let invalid_path = temp_dir.path().join("invalid.magic"); - fs::write(&invalid_path, "this is invalid syntax\n").expect("Failed to write invalid file"); - - // Should succeed, loading only the valid file - let rules = load_magic_directory(temp_dir.path()).expect("Should load valid files"); - - assert_eq!(rules.len(), 1, "Should load only valid file"); - assert_eq!(rules[0].message, "valid"); - } - - #[test] - fn test_load_directory_empty_files() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - - // Create an empty file - let empty_path = temp_dir.path().join("empty.magic"); - fs::write(&empty_path, "").expect("Failed to write empty file"); - - // Create a file with only comments - let comments_path = temp_dir.path().join("comments.magic"); - fs::write(&comments_path, "# Just comments\n# Nothing else\n") - .expect("Failed to write comments file"); - - // Should succeed with no rules - let rules = load_magic_directory(temp_dir.path()).expect("Should handle empty files"); - - assert_eq!(rules.len(), 0, "Empty files should contribute no rules"); - } - - #[test] - fn test_load_directory_binary_files() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - - // Create a binary file (invalid UTF-8) - let binary_path = temp_dir.path().join("binary.dat"); - fs::write(&binary_path, [0xFF, 0xFE, 0xFF, 0xFE]).expect("Failed to write binary file"); - - // Create a valid text file - let valid_path = temp_dir.path().join("valid.magic"); - fs::write(&valid_path, "0 string \\x01\\x02 valid\n").expect("Failed to write valid file"); - - // Binary file should cause a critical error (invalid UTF-8) - let result = load_magic_directory(temp_dir.path()); - - // The function should fail when encountering binary files (critical I/O error) - assert!( - result.is_err(), - "Binary files should cause critical error due to invalid UTF-8" - ); - } - - #[test] - fn test_load_directory_mixed_extensions() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - - // Create files with different extensions - fs::write( - temp_dir.path().join("file.magic"), - "0 string \\x01\\x02 magic\n", - ) - .expect("Failed to write .magic file"); - fs::write( - temp_dir.path().join("file.txt"), - "0 string \\x03\\x04 txt\n", - ) - .expect("Failed to write .txt file"); - fs::write(temp_dir.path().join("noext"), "0 string \\x05\\x06 noext\n") - .expect("Failed to write no-ext file"); - - let rules = load_magic_directory(temp_dir.path()) - .expect("Should load all files regardless of extension"); - - assert_eq!( - rules.len(), - 3, - "Should process all files regardless of extension" - ); - - let messages: Vec<&str> = rules.iter().map(|r| r.message.as_str()).collect(); - assert!(messages.contains(&"magic")); - assert!(messages.contains(&"txt")); - assert!(messages.contains(&"noext")); - } - - #[test] - fn test_load_directory_alphabetical_ordering() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - - // Create files in non-alphabetical order - using valid magic syntax with hex escapes - fs::write( - temp_dir.path().join("03-third"), - "0 string \\x07\\x08\\x09 third\n", - ) - .expect("Failed to write third file"); - fs::write( - temp_dir.path().join("01-first"), - "0 string \\x01\\x02\\x03 first\n", - ) - .expect("Failed to write first file"); - fs::write( - temp_dir.path().join("02-second"), - "0 string \\x04\\x05\\x06 second\n", - ) - .expect("Failed to write second file"); - - let rules = load_magic_directory(temp_dir.path()).expect("Should load directory in order"); - - assert_eq!(rules.len(), 3); - // Should be sorted alphabetically by filename - assert_eq!(rules[0].message, "first"); - assert_eq!(rules[1].message, "second"); - assert_eq!(rules[2].message, "third"); - } - - // ============================================================ - // Tests for load_magic_file (5+ test cases) - // ============================================================ - - #[test] - fn test_load_magic_file_text_format() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - let magic_file = temp_dir.path().join("magic.txt"); - - // Create text magic file with valid content - fs::write(&magic_file, "0 string \\x7fELF ELF executable\n") - .expect("Failed to write magic file"); - - // Load using load_magic_file - let rules = load_magic_file(&magic_file).expect("Failed to load text magic file"); - - assert_eq!(rules.len(), 1); - assert_eq!(rules[0].message, "ELF executable"); - } - - #[test] - fn test_load_magic_file_directory_format() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - let magic_dir = temp_dir.path().join("magic.d"); - fs::create_dir(&magic_dir).expect("Failed to create magic directory"); - - // Create multiple files in directory - fs::write( - magic_dir.join("00_elf"), - "0 string \\x7fELF ELF executable\n", - ) - .expect("Failed to write elf file"); - fs::write( - magic_dir.join("01_zip"), - "0 string \\x50\\x4b\\x03\\x04 ZIP archive\n", - ) - .expect("Failed to write zip file"); - - // Load using load_magic_file - let rules = load_magic_file(&magic_dir).expect("Failed to load directory"); - - assert_eq!(rules.len(), 2); - assert_eq!(rules[0].message, "ELF executable"); - assert_eq!(rules[1].message, "ZIP archive"); - } - - #[test] - fn test_load_magic_file_binary_format_error() { - use std::fs::File; - use std::io::Write; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - let binary_file = temp_dir.path().join("magic.mgc"); - - // Create binary file with .mgc magic number - let mut file = File::create(&binary_file).expect("Failed to create binary file"); - let magic_number: [u8; 4] = [0x1C, 0x04, 0x1E, 0xF1]; // Little-endian 0xF11E041C - file.write_all(&magic_number) - .expect("Failed to write magic number"); - - // Attempt to load binary file - let result = load_magic_file(&binary_file); - - assert!(result.is_err(), "Should fail to load binary .mgc file"); - - let error = result.unwrap_err(); - let error_msg = error.to_string(); - - // Verify error mentions unsupported format and --use-builtin - assert!( - error_msg.contains("Binary") || error_msg.contains("binary"), - "Error should mention binary format: {error_msg}", - ); - assert!( - error_msg.contains("--use-builtin") || error_msg.contains("built-in"), - "Error should mention --use-builtin option: {error_msg}", - ); - } - - #[test] - fn test_load_magic_file_io_error() { - use std::path::Path; - - // Try to load non-existent file - let non_existent = Path::new("/this/path/should/not/exist/magic.txt"); - let result = load_magic_file(non_existent); - - assert!(result.is_err(), "Should fail for non-existent file"); - } - - #[test] - fn test_load_magic_file_parse_error_propagation() { - use std::fs; - use tempfile::TempDir; - - let temp_dir = TempDir::new().expect("Failed to create temp dir"); - let invalid_file = temp_dir.path().join("invalid.magic"); - - // Create file with invalid syntax (missing offset) - fs::write(&invalid_file, "string test invalid\n").expect("Failed to write invalid file"); - - // Attempt to load file with parse errors - let result = load_magic_file(&invalid_file); - - assert!(result.is_err(), "Should fail for file with parse errors"); - - // Error should be a parse error (not I/O error) - let error = result.unwrap_err(); - let error_msg = format!("{error:?}"); - assert!( - error_msg.contains("InvalidSyntax") || error_msg.contains("syntax"), - "Error should be parse error: {error_msg}", - ); - } -} diff --git a/src/parser/preprocessing.rs b/src/parser/preprocessing.rs new file mode 100644 index 00000000..1f9e1004 --- /dev/null +++ b/src/parser/preprocessing.rs @@ -0,0 +1,521 @@ +//! Line preprocessing for magic file parsing. +//! +//! Handles comment removal, empty line filtering, line continuations, +//! and strength directive parsing during magic file preprocessing. + +use crate::error::ParseError; +use crate::parser::ast::{MagicRule, StrengthModifier}; +use crate::parser::grammar::{ + has_continuation, is_comment_line, is_empty_line, is_strength_directive, parse_comment, + parse_magic_rule, parse_strength_directive, +}; + +/// Internal structure to track line metadata during preprocessing. +/// +/// Stores the processed content, original line number, and flags for comment +/// and strength directive lines in the input magic file. +#[derive(Debug)] +pub(crate) struct LineInfo { + pub(crate) content: String, + pub(crate) line_number: usize, + pub(crate) is_comment: bool, + /// Optional strength modifier parsed from `!:strength` directive + pub(crate) strength_modifier: Option, +} + +impl LineInfo { + pub(crate) fn new(content: String, line_number: usize, is_comment: bool) -> Self { + Self { + content, + line_number, + is_comment, + strength_modifier: None, + } + } + + pub(crate) fn with_strength( + content: String, + line_number: usize, + strength_modifier: StrengthModifier, + ) -> Self { + Self { + content, + line_number, + is_comment: false, + strength_modifier: Some(strength_modifier), + } + } +} + +/// Preprocesses raw magic file input by handling comments, empty lines, and continuations. +/// +/// This function performs the following transformations: +/// - Removes empty lines from the input +/// - Handles comment lines (lines starting with '#') +/// - Processes line continuations (lines ending with '\') +/// - Concatenates continued lines into single entries +/// - Preserves original line numbers for error reporting (continued lines +/// are assigned the line number of the first line in the continuation sequence) +/// +/// # Arguments +/// +/// * `input` - The raw magic file content as a string +/// +/// # Returns +/// +/// `Result, ParseError>` - A vector of processed lines or a parse error +/// +/// # Errors +/// +/// Returns an error if: +/// - Comment lines cannot be parsed +/// - Input ends with an unterminated line continuation +/// - The input is malformed +pub(crate) fn preprocess_lines(input: &str) -> Result, ParseError> { + let mut lines_info: Vec = Vec::new(); + let mut line_buf = String::new(); + let mut start_line_number: Option = None; + let mut last_line_num = 0usize; + for (i, mut line) in input.lines().enumerate() { + last_line_num = i + 1; + if is_empty_line(line) { + continue; + } + if is_comment_line(line) { + // Bug 1 fix: If we have an ongoing continuation, discard it before processing comment + if !line_buf.is_empty() { + line_buf.clear(); + start_line_number = None; + } + let parsed_comment = parse_comment(line) + .map_err(|_| ParseError::invalid_syntax(i + 1, "Unable to parse comment"))?; + line = parsed_comment.1.as_str(); + lines_info.push(LineInfo::new(line.trim().to_string(), i + 1, true)); + continue; + } + // Handle strength directives (!:strength ...) + if is_strength_directive(line) { + // If we have an ongoing continuation, discard it before processing directive + if !line_buf.is_empty() { + line_buf.clear(); + start_line_number = None; + } + let strength_modifier = parse_strength_directive(line) + .map_err(|e| { + ParseError::invalid_syntax( + i + 1, + format!("Failed to parse strength directive: {e}"), + ) + })? + .1; + lines_info.push(LineInfo::with_strength( + line.trim().to_string(), + i + 1, + strength_modifier, + )); + continue; + } + // Track the starting line number when we begin accumulating a rule + if start_line_number.is_none() { + start_line_number = Some(i + 1); + } + line_buf.push_str(line.trim()); + if has_continuation(line) { + // Remove trailing backslash in-place (O(1)) instead of + // strip_suffix().to_string() which allocates a new String + line_buf.pop(); + continue; + } + // Bug 2 fix: Use the stored starting line number instead of calculating from cont_ctr + let rule_line_number = start_line_number.unwrap_or(i + 1); + lines_info.push(LineInfo::new( + std::mem::take(&mut line_buf), + rule_line_number, + false, + )); + start_line_number = None; + } + + // Handle unterminated continuation at end of input + if !line_buf.is_empty() { + return Err(ParseError::invalid_syntax( + last_line_num, + "Unterminated line continuation", + )); + } + + Ok(lines_info) +} + +/// Parses a single magic rule line into a `MagicRule` AST node. +/// +/// This function takes a preprocessed `LineInfo` and converts it into a `MagicRule` +/// by delegating to the grammar parser. It handles error mapping to include +/// context about which line failed. +/// +/// # Arguments +/// +/// * `line` - The `LineInfo` struct containing the rule text and metadata +/// +/// # Returns +/// +/// `Result` - The parsed rule or a parse error +/// +/// # Errors +/// +/// Returns an error if: +/// - The line is marked as a comment +/// - The rule syntax is invalid +/// - Required fields are missing +/// - Value parsing fails +pub(crate) fn parse_magic_rule_line(line: &LineInfo) -> Result { + if line.is_comment { + return Err(ParseError::invalid_syntax( + line.line_number, + "Comment lines cannot be parsed as rules", + )); + } + parse_magic_rule(&line.content) + .map_err(|e| { + ParseError::invalid_syntax(line.line_number, format!("Failed to parse rule: {e}")) + }) + .map(|(_, rule)| rule) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::ast::{OffsetSpec, Operator, TypeKind}; + + fn li(line_number: usize, content: &str) -> LineInfo { + LineInfo { + content: content.to_string(), + line_number, + is_comment: false, + strength_modifier: None, + } + } + + fn li_comment(line_number: usize, content: &str) -> LineInfo { + LineInfo { + content: content.to_string(), + line_number, + is_comment: true, + strength_modifier: None, + } + } + + // ============================================================ + // Tests for parse_magic_rule_line (12 test cases) + // ============================================================ + + #[test] + fn test_parse_magic_rule_line_simple_string() { + let line = li(1, "0 string \\x7fELF ELF executable"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.level, 0); + assert_eq!(rule.message, "ELF executable"); + } + + #[test] + fn test_parse_magic_rule_line_byte_type() { + let line = li(1, "0 byte 1 ELF"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.level, 0); + assert!(matches!(rule.typ, TypeKind::Byte)); + } + + #[test] + fn test_parse_magic_rule_line_with_child_indentation() { + let line = li(2, ">4 byte 1 32-bit"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.level, 1); + } + + #[test] + fn test_parse_magic_rule_line_deep_indentation() { + let line = li(3, ">>>8 long = 0x12345678 Complex match"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.level, 3); + } + + #[test] + fn test_parse_magic_rule_line_not_equal_operator() { + let line = li(1, "0 byte != 0 Non-zero"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.op, Operator::NotEqual); + } + + #[test] + fn test_parse_magic_rule_line_greater_operator() { + let line = li(1, "0 long = 1000 Number"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.op, Operator::Equal); + } + + #[test] + fn test_parse_magic_rule_line_less_operator() { + let line = li(1, "0 long != 256 Not equal"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.op, Operator::NotEqual); + } + + #[test] + fn test_parse_magic_rule_line_bitwise_and_operator() { + let line = li(1, "0 byte & 0xFF Bitmask"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.op, Operator::BitwiseAnd); + } + + #[test] + fn test_parse_magic_rule_line_comment_line_error() { + let line = li_comment(1, "This is a comment"); + let result = parse_magic_rule_line(&line); + assert!(result.is_err()); + } + + #[test] + fn test_parse_magic_rule_line_hex_offset() { + let line = li(1, "0x100 byte 1 PDF document"); + let rule = parse_magic_rule_line(&line).unwrap(); + match rule.offset { + OffsetSpec::Absolute(offset) => assert_eq!(offset, 0x100), + _ => panic!("Expected absolute offset"), + } + } + + #[test] + fn test_parse_magic_rule_line_string_with_spaces() { + let line = li(1, "0 byte 1 Long message with multiple words"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert_eq!(rule.message, "Long message with multiple words"); + } + + #[test] + fn test_parse_magic_rule_line_short_type() { + let line = li(1, "0 short 0x4d5a MS-DOS executable"); + let rule = parse_magic_rule_line(&line).unwrap(); + assert!(matches!(rule.typ, TypeKind::Short { .. })); + } + + // ============================================================ + // Tests for preprocess_lines (12 test cases) + // ============================================================ + + #[test] + fn test_preprocess_lines_single_rule() { + let input = "0 string 0 Test"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0].content, "0 string 0 Test"); + assert!(!lines[0].is_comment); + } + + #[test] + fn test_preprocess_lines_multiple_rules() { + let input = "0 string 0 Test\n0 byte 1 Byte"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 2); + assert_eq!(lines[0].content, "0 string 0 Test"); + assert_eq!(lines[1].content, "0 byte 1 Byte"); + } + + #[test] + fn test_preprocess_lines_with_comments() { + let input = "# Comment\n0 string 0 Test"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 2); + assert!(lines[0].is_comment); + assert!(!lines[1].is_comment); + } + + #[test] + fn test_preprocess_lines_empty_lines() { + let input = "0 string 0 Test\n\n0 byte 1 Byte"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 2); + } + + #[test] + fn test_preprocess_lines_leading_empty_lines() { + let input = "\n\n0 string 0 Test"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0].content, "0 string 0 Test"); + } + + #[test] + fn test_preprocess_lines_trailing_empty_lines() { + let input = "0 string 0 Test\n\n"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 1); + } + + #[test] + fn test_preprocess_lines_line_continuation() { + let input = "0 string 0 Long message \\\ncontinued here"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0].content, "0 string 0 Long message continued here"); + } + + #[test] + fn test_preprocess_lines_multiple_continuations() { + let input = "0 string 0 Multi \\\nline \\\ncontinuation"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0].content, "0 string 0 Multi line continuation"); + } + + #[test] + fn test_preprocess_lines_mixed_comments_and_rules() { + let input = "# Header\n0 string 0 Test\n# Another comment\n>4 byte 1 Child"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 4); + assert!(lines[0].is_comment); + assert!(!lines[1].is_comment); + assert!(lines[2].is_comment); + assert!(!lines[3].is_comment); + } + + #[test] + fn test_preprocess_lines_preserves_line_numbers() { + let input = "0 string 0 Test\n>4 byte 1 Child"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines[0].line_number, 1); + assert_eq!(lines[1].line_number, 2); + } + + #[test] + fn test_preprocess_lines_empty_input() { + let input = ""; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 0); + } + + #[test] + fn test_preprocess_lines_only_comments() { + let input = "# Comment 1\n# Comment 2\n# Comment 3"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 3); + assert!(lines.iter().all(|l| l.is_comment)); + } + + // ============================================================ + // Continuation edge case tests + // ============================================================ + + #[test] + fn test_continuation_at_eof() { + // Continuation on last line with no following line - should error + let input = "0 string 0 Test \\"; + let result = preprocess_lines(input); + assert!( + result.is_err(), + "Should error on unterminated continuation at EOF" + ); + let err = result.unwrap_err(); + assert!( + format!("{err:?}").contains("Unterminated"), + "Error should mention unterminated continuation" + ); + } + + #[test] + fn test_continuation_with_empty_next() { + // Empty line after continuation causes unterminated continuation + // (empty lines are skipped but continuation state persists) + let input = "0 string 0 Test \\\n\n0 byte 1 Next"; + let lines = preprocess_lines(input).unwrap(); + // The continuation carries through the empty line, so "Next" gets appended + assert_eq!(lines.len(), 1); + assert_eq!(lines[0].content, "0 string 0 Test 0 byte 1 Next"); + } + + #[test] + fn test_continuation_into_empty_then_rule() { + let input = "0 string 0 First \\\n\ncontinued"; + let lines = preprocess_lines(input).unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0].content, "0 string 0 First continued"); + } + + // ============================================================ + // Line number accuracy tests + // ============================================================ + + #[test] + fn test_line_numbers_with_continuations() { + let input = "0 string 0 test1\n0 string 0 multi \\\nline \\\ntest\n0 string 0 test2"; + let lines = preprocess_lines(input).unwrap(); + + // Line 1: "0 string 0 test1" should report line 1 + assert_eq!(lines[0].line_number, 1); + + // Line 2-4 continuation should report line 2 (first line of continuation) + assert_eq!(lines[1].line_number, 2); + + // Line 5: "0 string 0 test2" should report line 5 + assert_eq!(lines[2].line_number, 5); + } + + #[test] + fn test_line_numbers_with_mixed_content() { + let input = "# Comment line 1\n0 string 0 rule1\n\n# Another comment\n0 string 0 rule2 \\\ncontinued"; + let lines = preprocess_lines(input).unwrap(); + + assert_eq!(lines.len(), 4); + assert_eq!(lines[0].line_number, 1); // Comment + assert_eq!(lines[1].line_number, 2); // rule1 + assert_eq!(lines[2].line_number, 4); // Another comment + assert_eq!(lines[3].line_number, 5); // rule2 (continued on line 6) + } + + // ============================================================ + // Bug reproduction tests + // ============================================================ + + #[test] + fn test_bug1_comment_during_continuation() { + // Bug 1: Comment during continuation should not corrupt line_buf + // The partial rule should be discarded, leaving only the comment and new rule + let input = "0 string 0 Partial rule \\\n# This is a comment\n0 byte 1 New rule"; + let lines = preprocess_lines(input).unwrap(); + + // The partial rule is discarded, so we should have 2 lines: comment and new rule + assert_eq!(lines.len(), 2); + // The comment should be separate and not contain rule content + let comment_line = lines.iter().find(|l| l.is_comment).unwrap(); + assert!(!comment_line.content.contains("Partial rule")); + assert_eq!(comment_line.content, "This is a comment"); + // The new rule should be intact + let rule_line = lines + .iter() + .find(|l| !l.is_comment && l.content.contains("New rule")) + .unwrap(); + assert_eq!(rule_line.content, "0 byte 1 New rule"); + } + + #[test] + fn test_bug2_empty_line_in_continuation() { + // Bug 2: Empty line in continuation should not break line number calculation + let input = "0 string 0 Test \\\n\ncontinued here"; + let lines = preprocess_lines(input).unwrap(); + + assert_eq!(lines.len(), 1); + // Line number should point to line 1 (where the rule started), not line 3 + assert_eq!(lines[0].line_number, 1); + assert_eq!(lines[0].content, "0 string 0 Test continued here"); + } + + #[test] + fn test_bug2_multiple_empty_lines_in_continuation() { + // Multiple empty lines in continuation + let input = "0 string 0 Test \\\n\n\ncontinued here"; + let lines = preprocess_lines(input).unwrap(); + + assert_eq!(lines.len(), 1); + // Line number should still point to line 1 + assert_eq!(lines[0].line_number, 1); + } +} diff --git a/tests/compatibility_tests.rs b/tests/compatibility_tests.rs index 2b10c8ab..09930dee 100644 --- a/tests/compatibility_tests.rs +++ b/tests/compatibility_tests.rs @@ -34,14 +34,12 @@ enum TestStatus { /// Compatibility test runner struct CompatibilityTestRunner { test_dir: PathBuf, - magic_file: PathBuf, rmagic_binary: PathBuf, } impl CompatibilityTestRunner { fn new() -> Result> { let test_dir = PathBuf::from("third_party/tests"); - let magic_file = PathBuf::from("third_party/magic.mgc"); let rmagic_binary = find_rmagic_binary()?; if !test_dir.exists() { @@ -51,13 +49,8 @@ impl CompatibilityTestRunner { ); } - if !magic_file.exists() { - return Err("Magic file not found. Ensure third_party/magic.mgc exists.".into()); - } - Ok(Self { test_dir, - magic_file, rmagic_binary, }) } @@ -86,8 +79,7 @@ impl CompatibilityTestRunner { /// Run rmagic against a test file fn run_rmagic(&self, test_file: &Path) -> Result> { let output = Command::new(&self.rmagic_binary) - .arg("--magic-file") - .arg(&self.magic_file) + .arg("--use-builtin") .arg(test_file) .stdout(Stdio::piped()) .stderr(Stdio::piped()) diff --git a/tests/evaluator_tests.rs b/tests/evaluator_tests.rs new file mode 100644 index 00000000..bba2f187 --- /dev/null +++ b/tests/evaluator_tests.rs @@ -0,0 +1,244 @@ +//! Evaluator integration tests +//! +//! Tests for confidence calculation, rule ordering, and evaluation behavior +//! through the public `MagicDatabase` API. + +use libmagic_rs::{EvaluationConfig, MagicDatabase}; + +// ============================================================ +// Confidence Calculation Tests +// ============================================================ + +#[test] +fn test_confidence_nonzero_for_known_type() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!( + result.confidence > 0.0, + "ELF detection should have non-zero confidence, got {}", + result.confidence + ); +} + +#[test] +fn test_confidence_zero_for_unknown_type() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"random unknown content").unwrap(); + assert!( + (result.confidence - 0.0).abs() < f64::EPSILON, + "Unknown type should have zero confidence" + ); +} + +#[test] +fn test_confidence_matches_first_match() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + if let Some(first) = result.matches.first() { + assert!( + (result.confidence - first.confidence).abs() < f64::EPSILON, + "Result confidence should equal first match confidence" + ); + } +} + +// ============================================================ +// Rule Ordering Tests +// ============================================================ + +#[test] +fn test_elf_detected_before_generic() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!( + result.description.contains("ELF"), + "ELF should be detected, got: {}", + result.description + ); +} + +#[test] +fn test_pdf_detected_correctly() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"%PDF-\x00\x00\x00").unwrap(); + assert!( + result.description.contains("PDF"), + "PDF should be detected, got: {}", + result.description + ); +} + +#[test] +fn test_png_detected_correctly() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db + .evaluate_buffer(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR") + .unwrap(); + assert!( + result.description.contains("PNG"), + "PNG should be detected, got: {}", + result.description + ); +} + +#[test] +fn test_jpeg_detected_correctly() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db + .evaluate_buffer(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00") + .unwrap(); + assert!( + result.description.contains("JPEG") || result.description.contains("JFIF"), + "JPEG should be detected, got: {}", + result.description + ); +} + +#[test] +fn test_zip_detected_correctly() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"PK\x03\x04rest of zip").unwrap(); + assert!( + result.description.contains("ZIP") || result.description.contains("Zip"), + "ZIP should be detected, got: {}", + result.description + ); +} + +#[test] +fn test_gzip_detected_correctly() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db + .evaluate_buffer(b"\x1f\x8b\x08\x00\x00\x00\x00\x00") + .unwrap(); + assert!( + result.description.to_lowercase().contains("gzip"), + "GZIP should be detected, got: {}", + result.description + ); +} + +// ============================================================ +// Configuration Tests +// ============================================================ + +#[test] +fn test_evaluate_with_performance_config() { + let config = EvaluationConfig::performance(); + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!(result.description.contains("ELF")); +} + +#[test] +fn test_evaluate_with_comprehensive_config() { + let config = EvaluationConfig::comprehensive(); + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!(result.description.contains("ELF")); +} + +#[test] +fn test_evaluate_with_mime_types_enabled() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!( + result.mime_type.is_some(), + "MIME type should be present when enabled" + ); +} + +#[test] +fn test_evaluate_without_mime_types() { + let config = EvaluationConfig { + enable_mime_types: false, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!( + result.mime_type.is_none(), + "MIME type should be absent when disabled" + ); +} + +#[test] +fn test_invalid_config_rejected() { + let config = EvaluationConfig { + max_recursion_depth: 0, + ..EvaluationConfig::default() + }; + let result = MagicDatabase::with_builtin_rules_and_config(config); + assert!(result.is_err(), "Zero recursion depth should be rejected"); +} + +// ============================================================ +// Metadata Tests +// ============================================================ + +#[test] +fn test_metadata_populated_for_buffer() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + + assert_eq!(result.metadata.file_size, 8); + assert!(result.metadata.evaluation_time_ms >= 0.0); + assert!(result.metadata.rules_evaluated > 0); + assert!(result.metadata.magic_file.is_none()); + assert!(!result.metadata.timed_out); +} + +#[test] +fn test_metadata_for_no_match() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"nothing matches this").unwrap(); + + assert_eq!(result.description, "data"); + assert!(result.metadata.rules_evaluated > 0); +} + +// ============================================================ +// Edge Cases +// ============================================================ + +#[test] +fn test_evaluate_empty_buffer() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"").unwrap(); + assert_eq!(result.description, "data"); +} + +#[test] +fn test_evaluate_single_byte_buffer() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"\x00").unwrap(); + // Should not panic, may or may not match + assert!(!result.description.is_empty()); +} + +#[test] +fn test_evaluate_all_zeros() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(&[0u8; 1024]).unwrap(); + assert!(!result.description.is_empty()); +} + +#[test] +fn test_evaluate_all_ones() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(&[0xFF; 1024]).unwrap(); + assert!(!result.description.is_empty()); +} + +#[test] +fn test_evaluate_partial_magic_header() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + // Only first byte of ELF magic + let result = db.evaluate_buffer(b"\x7f").unwrap(); + // Should not crash, might not match + assert!(!result.description.is_empty()); +} diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs new file mode 100644 index 00000000..dd08776b --- /dev/null +++ b/tests/integration_tests.rs @@ -0,0 +1,374 @@ +//! End-to-end integration tests for core flows +//! +//! Tests the complete workflow: load database, evaluate files/buffers, +//! verify output formatting and metadata. + +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +use libmagic_rs::{EvaluationConfig, MagicDatabase}; +use tempfile::TempDir; + +// ============================================================ +// Built-in Rules End-to-End +// ============================================================ + +#[test] +fn test_builtin_rules_detect_elf() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert!(result.description.contains("ELF")); + assert!(result.confidence > 0.0); +} + +#[test] +fn test_builtin_rules_detect_pdf() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"%PDF-\x00\x00\x00").unwrap(); + assert!(result.description.contains("PDF")); +} + +#[test] +fn test_builtin_rules_detect_zip() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"PK\x03\x04").unwrap(); + assert!( + result.description.contains("ZIP") || result.description.contains("Zip"), + "Expected ZIP detection, got: {}", + result.description + ); +} + +#[test] +fn test_builtin_rules_detect_png() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db + .evaluate_buffer(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR") + .unwrap(); + assert!(result.description.contains("PNG")); +} + +#[test] +fn test_builtin_rules_detect_jpeg() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db + .evaluate_buffer(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00") + .unwrap(); + assert!( + result.description.contains("JPEG") || result.description.contains("JFIF"), + "Expected JPEG detection, got: {}", + result.description + ); +} + +#[test] +fn test_builtin_rules_detect_gif() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"GIF8\x00\x00").unwrap(); + assert!( + result.description.contains("GIF"), + "Expected GIF detection, got: {}", + result.description + ); +} + +#[test] +fn test_builtin_rules_unknown_fallback() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_buffer(b"random bytes").unwrap(); + assert_eq!(result.description, "data"); +} + +// ============================================================ +// Load from File End-to-End +// ============================================================ + +#[test] +fn test_load_from_magic_file_and_evaluate() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("test.magic"); + + // Write a simple magic file using quoted string value + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 string \"TESTMAGIC\" Test file format").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + let result = db.evaluate_buffer(b"TESTMAGIC\x00some data").unwrap(); + assert!( + result.description.contains("Test file format"), + "Expected custom rule match, got: {}", + result.description + ); +} + +#[test] +fn test_load_from_file_with_config() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("test.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 string \"HELLO\" Hello file").unwrap(); + + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::load_from_file_with_config(&magic_path, config).unwrap(); + let result = db.evaluate_buffer(b"HELLO\x00world").unwrap(); + assert!(result.description.contains("Hello file")); +} + +#[test] +fn test_source_path_tracked() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("test.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 string \"X\" Test").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + assert!(db.source_path().is_some()); +} + +#[test] +fn test_source_path_none_for_builtin() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + assert!(db.source_path().is_none()); +} + +#[test] +fn test_load_nonexistent_file_errors() { + let result = MagicDatabase::load_from_file("/nonexistent/magic/file"); + assert!(result.is_err()); +} + +// ============================================================ +// Evaluate File End-to-End +// ============================================================ + +#[test] +fn test_evaluate_file_with_builtin() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.elf"); + + // Write ELF header + fs::write( + &test_file, + b"\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00", + ) + .unwrap(); + + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_file(&test_file).unwrap(); + assert!(result.description.contains("ELF")); + assert!(result.metadata.file_size > 0); +} + +#[test] +fn test_evaluate_empty_file() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("empty.bin"); + fs::write(&test_file, b"").unwrap(); + + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_file(&test_file).unwrap(); + assert_eq!(result.metadata.file_size, 0); +} + +#[test] +fn test_evaluate_nonexistent_file_errors() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_file("/nonexistent/test/file.bin"); + assert!(result.is_err()); +} + +// ============================================================ +// Multiple Format Detection +// ============================================================ + +#[test] +fn test_multiple_formats_detected_correctly() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + + let test_cases: Vec<(&[u8], &str)> = vec![ + (b"\x7fELF\x02\x01\x01\x00", "ELF"), + (b"%PDF-\x00\x00\x00", "PDF"), + (b"PK\x03\x04rest", "ZIP"), + (b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR", "PNG"), + (b"GIF8\x00\x00", "GIF"), + ]; + + for (input, expected) in test_cases { + let result = db.evaluate_buffer(input).unwrap(); + assert!( + result.description.contains(expected), + "Expected '{}' for input {:?}, got: '{}'", + expected, + &input[..4.min(input.len())], + result.description + ); + } +} + +// ============================================================ +// Custom Magic Rules End-to-End +// ============================================================ + +#[test] +fn test_custom_rules_with_hierarchy() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("custom.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 belong 0x7f454c46 ELF").unwrap(); + writeln!(f, ">4 byte 1 32-bit").unwrap(); + writeln!(f, ">4 byte 2 64-bit").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + // Test 32-bit ELF (5 bytes: 4-byte magic + class byte) + let result = db.evaluate_buffer(b"\x7fELF\x01").unwrap(); + assert!( + result.description.contains("32-bit"), + "Expected 32-bit, got: {}", + result.description + ); + + // Test 64-bit ELF + let result = db.evaluate_buffer(b"\x7fELF\x02").unwrap(); + assert!( + result.description.contains("64-bit"), + "Expected 64-bit, got: {}", + result.description + ); +} + +#[test] +fn test_custom_rules_no_match_fallback() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("custom.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 string \"RARE\" Rare format").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + let result = db.evaluate_buffer(b"no match here").unwrap(); + assert_eq!(result.description, "data"); +} + +// ============================================================ +// Load from Directory End-to-End +// ============================================================ + +#[test] +fn test_load_directory_of_magic_files() { + let temp_dir = TempDir::new().unwrap(); + + // Create multiple magic files using belong for binary magic numbers + let mut f1 = fs::File::create(temp_dir.path().join("images.magic")).unwrap(); + writeln!(f1, "0 belong 0x89504e47 PNG image").unwrap(); + + let mut f2 = fs::File::create(temp_dir.path().join("docs.magic")).unwrap(); + writeln!(f2, "0 string \"%PDF-\" PDF document").unwrap(); + + let db = MagicDatabase::load_from_file(temp_dir.path()).unwrap(); + + let png_result = db + .evaluate_buffer(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR") + .unwrap(); + assert!( + png_result.description.contains("PNG"), + "Expected PNG from directory load, got: {}", + png_result.description + ); + + let pdf_result = db.evaluate_buffer(b"%PDF-\x001.4").unwrap(); + assert!( + pdf_result.description.contains("PDF"), + "Expected PDF from directory load, got: {}", + pdf_result.description + ); +} + +// ============================================================ +// Config Accessor +// ============================================================ + +#[test] +fn test_config_accessor() { + let config = EvaluationConfig { + max_recursion_depth: 15, + timeout_ms: Some(5000), + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + assert_eq!(db.config().max_recursion_depth, 15); + assert_eq!(db.config().timeout_ms, Some(5000)); +} + +// ============================================================ +// Metadata in File Evaluation +// ============================================================ + +#[test] +fn test_file_evaluation_metadata() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.pdf"); + let content = b"%PDF-\x00some pdf content here for testing"; + fs::write(&test_file, content).unwrap(); + + let db = MagicDatabase::with_builtin_rules().unwrap(); + let result = db.evaluate_file(&test_file).unwrap(); + + assert_eq!(result.metadata.file_size, content.len() as u64); + assert!(result.metadata.evaluation_time_ms >= 0.0); + assert!(result.metadata.rules_evaluated > 0); + assert!(!result.metadata.timed_out); +} + +// ============================================================ +// Multiple Evaluations on Same Database +// ============================================================ + +#[test] +fn test_reuse_database_multiple_evaluations() { + let db = MagicDatabase::with_builtin_rules().unwrap(); + + let elf = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + let pdf = db.evaluate_buffer(b"%PDF-\x001.4").unwrap(); + let unknown = db.evaluate_buffer(b"nothing here").unwrap(); + + assert!(elf.description.contains("ELF")); + assert!(pdf.description.contains("PDF")); + assert_eq!(unknown.description, "data"); +} + +// ============================================================ +// Test Fixture Helper +// ============================================================ + +fn create_test_file(dir: &std::path::Path, name: &str, content: &[u8]) -> PathBuf { + let path = dir.join(name); + fs::write(&path, content).unwrap(); + path +} + +#[test] +fn test_evaluate_multiple_files() { + let temp_dir = TempDir::new().unwrap(); + let db = MagicDatabase::with_builtin_rules().unwrap(); + + let elf_file = create_test_file( + temp_dir.path(), + "test.elf", + b"\x7fELF\x02\x01\x01\x00\x00\x00", + ); + let pdf_file = create_test_file(temp_dir.path(), "test.pdf", b"%PDF-\x001.4 content"); + + let elf_result = db.evaluate_file(&elf_file).unwrap(); + let pdf_result = db.evaluate_file(&pdf_file).unwrap(); + + assert!(elf_result.description.contains("ELF")); + assert!(pdf_result.description.contains("PDF")); +} diff --git a/tests/mime_tests.rs b/tests/mime_tests.rs new file mode 100644 index 00000000..c917be04 --- /dev/null +++ b/tests/mime_tests.rs @@ -0,0 +1,226 @@ +//! MIME type mapping integration tests +//! +//! Tests for MIME type detection through the public API, +//! including hardcoded mappings, prefix matching, and case insensitivity. + +use libmagic_rs::mime::MimeMapper; +use libmagic_rs::{EvaluationConfig, MagicDatabase}; + +// ============================================================ +// Integration: MIME via MagicDatabase +// ============================================================ + +#[test] +fn test_mime_enabled_returns_type_for_elf() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert_eq!( + result.mime_type, + Some("application/x-executable".to_string()) + ); +} + +#[test] +fn test_mime_enabled_returns_type_for_pdf() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"%PDF-\x00\x00\x00").unwrap(); + assert_eq!(result.mime_type, Some("application/pdf".to_string())); +} + +#[test] +fn test_mime_enabled_returns_type_for_png() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db + .evaluate_buffer(b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR") + .unwrap(); + assert_eq!(result.mime_type, Some("image/png".to_string())); +} + +#[test] +fn test_mime_enabled_returns_type_for_jpeg() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db + .evaluate_buffer(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00") + .unwrap(); + assert_eq!(result.mime_type, Some("image/jpeg".to_string())); +} + +#[test] +fn test_mime_enabled_returns_type_for_zip() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"PK\x03\x04rest of zip").unwrap(); + assert_eq!(result.mime_type, Some("application/zip".to_string())); +} + +#[test] +fn test_mime_disabled_returns_none() { + let config = EvaluationConfig { + enable_mime_types: false, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"\x7fELF\x02\x01\x01\x00").unwrap(); + assert_eq!(result.mime_type, None); +} + +#[test] +fn test_mime_unknown_data_returns_none() { + let config = EvaluationConfig { + enable_mime_types: true, + ..EvaluationConfig::default() + }; + let db = MagicDatabase::with_builtin_rules_and_config(config).unwrap(); + let result = db.evaluate_buffer(b"random unknown content").unwrap(); + assert_eq!(result.mime_type, None); +} + +// ============================================================ +// MimeMapper Direct Tests +// ============================================================ + +#[test] +fn test_mapper_hardcoded_executables() { + let mapper = MimeMapper::new(); + assert_eq!( + mapper.get_mime_type("ELF 64-bit LSB executable"), + Some("application/x-executable".to_string()) + ); + assert_eq!( + mapper.get_mime_type("PE32 executable (DLL)"), + Some("application/vnd.microsoft.portable-executable".to_string()) + ); + assert_eq!( + mapper.get_mime_type("Mach-O 64-bit executable"), + Some("application/x-mach-binary".to_string()) + ); +} + +#[test] +fn test_mapper_hardcoded_archives() { + let mapper = MimeMapper::new(); + assert_eq!( + mapper.get_mime_type("Zip archive data"), + Some("application/zip".to_string()) + ); + assert_eq!( + mapper.get_mime_type("gzip compressed data"), + Some("application/gzip".to_string()) + ); + assert_eq!( + mapper.get_mime_type("POSIX tar archive"), + Some("application/x-tar".to_string()) + ); + assert_eq!( + mapper.get_mime_type("RAR archive data"), + Some("application/vnd.rar".to_string()) + ); + assert_eq!( + mapper.get_mime_type("7-zip archive"), + Some("application/x-7z-compressed".to_string()) + ); +} + +#[test] +fn test_mapper_hardcoded_images() { + let mapper = MimeMapper::new(); + assert_eq!( + mapper.get_mime_type("JPEG image data"), + Some("image/jpeg".to_string()) + ); + assert_eq!( + mapper.get_mime_type("PNG image data, 800x600"), + Some("image/png".to_string()) + ); + assert_eq!( + mapper.get_mime_type("GIF image data, version 89a"), + Some("image/gif".to_string()) + ); + assert_eq!( + mapper.get_mime_type("BMP image data"), + Some("image/bmp".to_string()) + ); + assert_eq!( + mapper.get_mime_type("WebP image data"), + Some("image/webp".to_string()) + ); +} + +#[test] +fn test_mapper_hardcoded_documents() { + let mapper = MimeMapper::new(); + assert_eq!( + mapper.get_mime_type("PDF document, version 1.4"), + Some("application/pdf".to_string()) + ); + assert_eq!( + mapper.get_mime_type("PostScript document"), + Some("application/postscript".to_string()) + ); +} + +#[test] +fn test_mapper_hardcoded_web() { + let mapper = MimeMapper::new(); + assert_eq!( + mapper.get_mime_type("HTML document"), + Some("text/html".to_string()) + ); + assert_eq!( + mapper.get_mime_type("XML document"), + Some("application/xml".to_string()) + ); + assert_eq!( + mapper.get_mime_type("JSON data"), + Some("application/json".to_string()) + ); +} + +#[test] +fn test_mapper_case_insensitive() { + let mapper = MimeMapper::new(); + assert_eq!( + mapper.get_mime_type("elf executable"), + mapper.get_mime_type("ELF EXECUTABLE") + ); + assert_eq!( + mapper.get_mime_type("png image"), + mapper.get_mime_type("PNG IMAGE") + ); +} + +#[test] +fn test_mapper_prefers_longer_match() { + let mapper = MimeMapper::new(); + // "gzip" should match before "zip" for "gzip compressed" + assert_eq!( + mapper.get_mime_type("gzip compressed data"), + Some("application/gzip".to_string()) + ); +} + +#[test] +fn test_mapper_no_match_returns_none() { + let mapper = MimeMapper::new(); + assert_eq!(mapper.get_mime_type("data"), None); + assert_eq!(mapper.get_mime_type("unknown binary format"), None); +} diff --git a/tests/property_tests.rs b/tests/property_tests.rs new file mode 100644 index 00000000..4e486844 --- /dev/null +++ b/tests/property_tests.rs @@ -0,0 +1,217 @@ +//! Property-based tests for libmagic-rs +//! +//! Uses proptest to verify properties that should hold for all valid inputs: +//! - Evaluator never panics on any buffer +//! - Buffer access is always bounds-checked +//! - Metadata is consistent +//! - Serde roundtrips preserve data + +use proptest::prelude::*; + +use libmagic_rs::{ + EvaluationConfig, MagicDatabase, MagicRule, OffsetSpec, Operator, TypeKind, Value, +}; + +/// Generate a valid OffsetSpec for testing +fn arb_offset_spec() -> impl Strategy { + prop_oneof![ + (-1000i64..=1000i64).prop_map(OffsetSpec::Absolute), + (-100i64..=100i64).prop_map(OffsetSpec::Relative), + (-100i64..=0i64).prop_map(OffsetSpec::FromEnd), + ] +} + +/// Generate a valid TypeKind for testing +fn arb_type_kind() -> impl Strategy { + prop_oneof![ + Just(TypeKind::Byte), + (any::(), any::()).prop_map(|(is_big, signed)| { + TypeKind::Short { + endian: if is_big { + libmagic_rs::Endianness::Big + } else { + libmagic_rs::Endianness::Little + }, + signed, + } + }), + (any::(), any::()).prop_map(|(is_big, signed)| { + TypeKind::Long { + endian: if is_big { + libmagic_rs::Endianness::Big + } else { + libmagic_rs::Endianness::Little + }, + signed, + } + }), + (0usize..256usize).prop_map(|len| TypeKind::String { + max_length: Some(len), + }), + ] +} + +/// Generate a valid Operator for testing +fn arb_operator() -> impl Strategy { + prop_oneof![ + Just(Operator::Equal), + Just(Operator::NotEqual), + Just(Operator::BitwiseAnd), + (0u64..=255u64).prop_map(Operator::BitwiseAndMask), + ] +} + +/// Generate a valid Value for testing +fn arb_value() -> impl Strategy { + prop_oneof![ + (0u64..=u32::MAX as u64).prop_map(Value::Uint), + (i32::MIN as i64..=i32::MAX as i64).prop_map(Value::Int), + prop::collection::vec(any::(), 0..32).prop_map(Value::Bytes), + "[a-zA-Z0-9 ]{0,32}".prop_map(Value::String), + ] +} + +/// Generate a valid MagicRule for testing +fn arb_magic_rule() -> impl Strategy { + ( + arb_offset_spec(), + arb_type_kind(), + arb_operator(), + arb_value(), + "[a-zA-Z0-9 _-]{1,64}", + ) + .prop_map(|(offset, typ, op, value, message)| MagicRule { + offset, + typ, + op, + value, + message, + children: vec![], + level: 0, + strength_modifier: None, + }) +} + +/// Generate arbitrary binary data for testing +fn arb_buffer() -> impl Strategy> { + prop::collection::vec(any::(), 0..1024) +} + +// ============================================================================= +// Property Tests +// ============================================================================= + +proptest! { + /// Property: Evaluation should never panic on any valid buffer + #[test] + fn prop_evaluation_never_panics(buffer in arb_buffer()) { + let db = MagicDatabase::with_builtin_rules() + .expect("builtin rules should load"); + + let result = db.evaluate_buffer(&buffer); + + match result { + Ok(eval_result) => { + prop_assert!(!eval_result.description.is_empty()); + prop_assert!(eval_result.confidence >= 0.0); + prop_assert!(eval_result.confidence <= 1.0); + } + Err(e) => { + prop_assert!(!e.to_string().is_empty()); + } + } + } + + /// Property: EvaluationConfig validation accepts reasonable values + #[test] + fn prop_config_validation_consistent( + recursion_depth in 1u32..100u32, + string_length in 1usize..10000usize, + timeout in 1u64..100000u64 + ) { + let config = EvaluationConfig { + max_recursion_depth: recursion_depth, + max_string_length: string_length, + stop_at_first_match: true, + enable_mime_types: false, + timeout_ms: Some(timeout), + }; + + prop_assert!(config.validate().is_ok()); + } + + /// Property: Evaluation result metadata is consistent with input + #[test] + fn prop_metadata_valid(buffer in arb_buffer()) { + let db = MagicDatabase::with_builtin_rules() + .expect("builtin rules should load"); + + let result = db.evaluate_buffer(&buffer) + .expect("should evaluate"); + + prop_assert_eq!(result.metadata.file_size as usize, buffer.len()); + prop_assert!(result.metadata.evaluation_time_ms >= 0.0); + prop_assert!(result.metadata.rules_evaluated > 0); + } + + /// Property: Arbitrary rules should serialize/deserialize consistently + #[test] + fn prop_rule_serde_roundtrip(rule in arb_magic_rule()) { + let json = serde_json::to_string(&rule) + .expect("should serialize"); + + let deserialized: MagicRule = serde_json::from_str(&json) + .expect("should deserialize"); + + prop_assert_eq!(rule.message, deserialized.message); + prop_assert_eq!(rule.level, deserialized.level); + } +} + +// ============================================================================= +// Known-pattern detection (regular tests, not property tests) +// ============================================================================= + +#[test] +fn test_elf_detection() { + let db = MagicDatabase::with_builtin_rules().expect("builtin rules should load"); + let elf_buffer = vec![0x7f, b'E', b'L', b'F', 2, 1, 1, 0]; + + let result = db.evaluate_buffer(&elf_buffer).expect("should evaluate"); + assert!( + result.description.contains("ELF"), + "Expected ELF detection, got: {}", + result.description + ); +} + +#[test] +fn test_zip_detection() { + let db = MagicDatabase::with_builtin_rules().expect("builtin rules should load"); + let zip_buffer = vec![0x50, 0x4b, 0x03, 0x04]; + + let result = db.evaluate_buffer(&zip_buffer).expect("should evaluate"); + assert!( + result.description.contains("ZIP"), + "Expected ZIP detection, got: {}", + result.description + ); +} + +#[test] +fn test_empty_buffer_handled() { + let db = MagicDatabase::with_builtin_rules().expect("builtin rules should load"); + + let result = db.evaluate_buffer(&[]).expect("should evaluate"); + assert!(!result.description.is_empty()); +} + +#[test] +fn test_zero_recursion_fails_validation() { + let config = EvaluationConfig { + max_recursion_depth: 0, + ..EvaluationConfig::default() + }; + + assert!(config.validate().is_err()); +} diff --git a/tests/tags_tests.rs b/tests/tags_tests.rs new file mode 100644 index 00000000..83c7ce51 --- /dev/null +++ b/tests/tags_tests.rs @@ -0,0 +1,159 @@ +//! Tag extraction integration tests +//! +//! Tests for keyword extraction, case insensitivity, rule path tags, +//! and custom keyword support. + +use libmagic_rs::tags::TagExtractor; + +// ============================================================ +// Keyword Extraction +// ============================================================ + +#[test] +fn test_extract_single_keyword() { + let extractor = TagExtractor::new(); + let tags = extractor.extract_tags("ELF 64-bit executable"); + assert!(tags.contains(&"executable".to_string())); +} + +#[test] +fn test_extract_multiple_keywords() { + let extractor = TagExtractor::new(); + let tags = extractor.extract_tags("compressed archive with encrypted data"); + assert!(tags.contains(&"archive".to_string())); + assert!(tags.contains(&"compressed".to_string())); + assert!(tags.contains(&"encrypted".to_string())); + assert!(tags.contains(&"data".to_string())); +} + +#[test] +fn test_extract_all_default_keywords() { + let extractor = TagExtractor::new(); + + let keyword_descriptions = [ + ("ELF executable", "executable"), + ("Zip archive", "archive"), + ("PNG image data", "image"), + ("MPEG video stream", "video"), + ("FLAC audio bitstream", "audio"), + ("PDF document", "document"), + ("gzip compressed", "compressed"), + ("AES encrypted", "encrypted"), + ("ASCII text", "text"), + ("binary data", "binary"), + ("raw data", "data"), + ("Python script", "script"), + ("TrueType font", "font"), + ("SQLite database", "database"), + ("Excel spreadsheet", "spreadsheet"), + ]; + + for (description, expected_tag) in &keyword_descriptions { + let tags = extractor.extract_tags(description); + assert!( + tags.contains(&expected_tag.to_string()), + "Expected tag '{}' from description '{}'", + expected_tag, + description + ); + } +} + +#[test] +fn test_no_match_returns_empty() { + let extractor = TagExtractor::new(); + let tags = extractor.extract_tags("unknown format xyz"); + assert!(tags.is_empty()); +} + +#[test] +fn test_tags_are_sorted() { + let extractor = TagExtractor::new(); + let tags = extractor.extract_tags("compressed archive with encrypted data"); + let mut sorted = tags.clone(); + sorted.sort(); + assert_eq!(tags, sorted); +} + +// ============================================================ +// Case Insensitivity +// ============================================================ + +#[test] +fn test_case_insensitive_matching() { + let extractor = TagExtractor::new(); + assert_eq!( + extractor.extract_tags("EXECUTABLE file"), + extractor.extract_tags("executable file") + ); +} + +#[test] +fn test_mixed_case_matching() { + let extractor = TagExtractor::new(); + let tags = extractor.extract_tags("Executable Archive Compressed"); + assert!(tags.contains(&"executable".to_string())); + assert!(tags.contains(&"archive".to_string())); + assert!(tags.contains(&"compressed".to_string())); +} + +// ============================================================ +// Custom Keywords +// ============================================================ + +#[test] +fn test_custom_keywords() { + let extractor = TagExtractor::with_keywords(vec!["custom", "special"]); + let tags = extractor.extract_tags("This has custom and special content"); + assert!(tags.contains(&"custom".to_string())); + assert!(tags.contains(&"special".to_string())); + assert!(!tags.contains(&"executable".to_string())); +} + +#[test] +fn test_custom_keywords_case_normalized() { + let extractor = TagExtractor::with_keywords(vec!["UPPER", "MiXeD"]); + let tags = extractor.extract_tags("upper and mixed content"); + assert!(tags.contains(&"upper".to_string())); + assert!(tags.contains(&"mixed".to_string())); +} + +#[test] +fn test_keyword_count() { + let extractor = TagExtractor::new(); + assert!( + extractor.keyword_count() >= 15, + "Default extractor should have at least 15 keywords" + ); + + let custom = TagExtractor::with_keywords(vec!["a", "b", "c"]); + assert_eq!(custom.keyword_count(), 3); +} + +// ============================================================ +// Rule Path Tags +// ============================================================ + +#[test] +fn test_extract_rule_path_basic() { + let extractor = TagExtractor::new(); + let messages = ["ELF magic", "64-bit LSB", "executable"]; + let tags = extractor.extract_rule_path(messages.iter().copied()); + assert_eq!(tags, vec!["elf-magic", "64-bit-lsb", "executable"]); +} + +#[test] +fn test_extract_rule_path_removes_special_chars() { + let extractor = TagExtractor::new(); + let messages = ["File (version 1.0)", "Data: test!"]; + let tags = extractor.extract_rule_path(messages.iter().copied()); + assert_eq!(tags, vec!["file-version-10", "data-test"]); +} + +#[test] +fn test_extract_rule_path_empty() { + let extractor = TagExtractor::new(); + let messages: Vec<&str> = vec![]; + let tags = extractor.extract_rule_path(messages.iter().copied()); + assert!(tags.is_empty()); +}