diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 91e5d01a..b857a7a9 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -52,7 +52,6 @@ reviews: [ "src/**", "docs/src/**", - "spec/**", ".kiro/**/*.md", ".cursor/**/*.mdc", ".github/**", @@ -64,6 +63,7 @@ reviews: "*.json", "*.sh", "justfile", + "build.rs", "!target/**", "!dist/**", "!docs/book/**", @@ -84,16 +84,26 @@ reviews: instructions: "Focus on magic file DSL parsing, AST node definitions, and grammar rules. Ensure comprehensive error handling for malformed input, proper escape sequence handling, and robust parsing logic. Include property tests for parser invariants." - path: "src/parser/ast.rs" instructions: "Review AST data structures matching the libmagic spec. Ensure proper serialization support, comprehensive documentation, and type safety. Focus on memory layout and performance characteristics." - - path: "src/parser/grammar.rs" + - path: "src/parser/grammar/**" instructions: "Review nom-based parsing logic, grammar rules, and parser combinators. Ensure robust parsing of magic file syntax, proper error recovery, and comprehensive test coverage for edge cases." + - path: "src/parser/codegen.rs" + instructions: "Review codegen serialization of AST types to Rust source. This module is shared by build.rs and build_helpers.rs. Ensure all TypeKind, Operator, and Value variants are serialized. Verify generated use-statement imports match." + - path: "src/parser/types.rs" + instructions: "Review type keyword parsing and TypeKind conversion. Ensure all libmagic type keywords are handled, case-insensitive matching is applied at entry points, and new types follow established patterns." + - path: "src/parser/loader.rs" + instructions: "Review magic file loading from disk. Ensure proper error handling for missing/invalid files, directory traversal safety, and resource limits." - path: "src/evaluator/**" instructions: "Focus on rule evaluation engine, offset resolution, type interpretation, and comparison operators. Ensure memory safety, bounds checking, and performance optimization. Include comprehensive error handling for malformed files." - - path: "src/evaluator/offset.rs" + - path: "src/evaluator/engine/**" + instructions: "Review the core evaluation loop. Focus on rule matching, recursion depth limits, timeout handling, and early exit optimization. Ensure no panics in library code." + - path: "src/evaluator/offset/**" instructions: "Review offset resolution logic for absolute, indirect, and relative offsets. Ensure proper bounds checking, endianness handling, and performance optimization. Focus on memory safety and edge case handling." - - path: "src/evaluator/types.rs" - instructions: "Review type interpretation logic for byte, short, long, string types with endianness support. Ensure proper bounds checking, overflow handling, and performance optimization." - - path: "src/evaluator/operators.rs" - instructions: "Review comparison and bitwise operators implementation. Ensure proper type coercion, overflow handling, and performance optimization. Focus on correctness and edge case handling." + - path: "src/evaluator/types/**" + instructions: "Review type interpretation logic for byte, short, long, quad, float, double, string, pstring, and date types with endianness support. Ensure proper bounds checking, overflow handling via checked_add, and no unwrap() in library code." + - path: "src/evaluator/operators/**" + instructions: "Review comparison and bitwise operators implementation. Ensure proper type coercion, overflow handling, and performance optimization. Focus on correctness, epsilon-aware float comparison, and edge case handling." + - path: "src/evaluator/strength.rs" + instructions: "Review strength calculation for rule ordering. Ensure all TypeKind and Operator variants are handled in match arms. Focus on correctness of strength modifiers." - path: "src/output/**" instructions: "Focus on output formatting for text and JSON formats. Ensure compatibility with GNU file command output, proper error handling, and structured metadata. Review performance for large result sets." - path: "src/output/text.rs" @@ -103,17 +113,13 @@ reviews: - path: "src/io/**" instructions: "Focus on file I/O operations, memory mapping, and resource management. Ensure proper bounds checking, error handling, and RAII patterns. Prioritize memory safety and performance." - path: "src/error.rs" - instructions: "Review error types and handling strategies. Ensure comprehensive error coverage, proper error propagation, and user-friendly error messages. Focus on error recovery and debugging support." + instructions: "Review error types and handling strategies. This file is shared with build.rs and cannot reference lib-only types. Ensure comprehensive error coverage, proper error propagation, and user-friendly error messages." + - path: "build.rs" + instructions: "Review build script that compiles magic rules at build time. Uses #[path] includes to share code with the library. Ensure rerun-if-changed directives are correct and generated code is valid." - path: "tests/**" - instructions: "Review test coverage, test organization, and test quality. Ensure comprehensive unit tests, integration tests, and property-based tests. Focus on test maintainability and edge case coverage." - - path: "tests/integration/**" - instructions: "Review integration test design and coverage. Ensure end-to-end workflow testing, compatibility testing, and performance regression testing. Focus on real-world usage scenarios." - - path: "tests/fixtures/**" - instructions: "Review test fixtures and sample files. Ensure comprehensive test data coverage, proper file organization, and test data maintenance. Focus on realistic test scenarios." + instructions: "Review test coverage, test organization, and test quality. Prefer table-driven tests over one-assertion-per-function. Ensure comprehensive unit tests, integration tests, and property-based tests with proptest." - path: "benches/**" instructions: "Review benchmark design and performance testing. Ensure meaningful benchmarks, proper statistical analysis, and performance regression detection. Focus on critical path optimization." - - path: "magic/**" - instructions: "Review magic file databases and rule sets. Ensure proper magic file syntax, comprehensive rule coverage, and compatibility with libmagic. Focus on rule accuracy and performance." - path: "docs/**" instructions: "Review documentation quality, completeness, and accuracy. Ensure comprehensive API documentation, usage examples, and migration guides. Focus on user experience and developer onboarding." - path: "Cargo.toml" @@ -126,9 +132,9 @@ reviews: enabled: true auto_incremental_review: true ignore_title_keywords: ["WIP", "draft", "do not merge"] - labels: ["libmagic-rs", "rust", "file-type-detection"] + labels: ["rust", "testing", "compatibility"] drafts: false - base_branches: ["main", "develop"] + base_branches: ["main"] ignore_usernames: [] finishing_touches: docstrings: @@ -141,7 +147,7 @@ reviews: threshold: 85 title: mode: warning - requirements: "Must follow Conventional Commits specification: type(scope): description. Types: feat, fix, docs, style, refactor, perf, test, build, ci, chore. Scopes: auth, api, cli, models, detection, alerting, etc. Breaking changes indicated with ! in header or BREAKING CHANGE: in footer." + requirements: "Must follow Conventional Commits specification: type(scope): description. Types: feat, fix, docs, style, refactor, perf, test, build, ci, chore. Scopes: parser, evaluator, io, output, cli, ast, grammar, types, operators, offsets, strength, codegen, loader, engine. Breaking changes indicated with ! in header or BREAKING CHANGE: in footer." description: mode: warning issue_assessment: @@ -165,8 +171,6 @@ reviews: packages: [] shellcheck: enabled: true - ruff: - enabled: false markdownlint: enabled: true github-checks: @@ -180,81 +184,21 @@ reviews: disabled_categories: [] enabled_only: false level: default - biome: - enabled: false - hadolint: - enabled: false - swiftlint: - enabled: false - phpstan: - enabled: false - level: default - phpmd: - enabled: false - phpcs: - enabled: false - golangci-lint: - enabled: false yamllint: enabled: true gitleaks: enabled: true - checkov: - enabled: false - detekt: - enabled: false - eslint: - enabled: false - flake8: - enabled: false - rubocop: - enabled: false - buf: - enabled: false - regal: - enabled: false actionlint: enabled: true - pmd: - enabled: false - cppcheck: - enabled: false semgrep: enabled: true - circleci: - enabled: false clippy: enabled: true - sqlfluff: - enabled: false - prismaLint: - enabled: false - pylint: - enabled: false - oxc: - enabled: false - shopifyThemeCheck: - enabled: false - luacheck: - enabled: false - brakeman: - enabled: false - dotenvLint: - enabled: false - htmlhint: - enabled: false - checkmake: - enabled: false osvScanner: enabled: true chat: art: false auto_reply: true - integrations: - jira: - usage: auto - linear: - usage: auto knowledge_base: opt_out: false web_search: @@ -266,12 +210,6 @@ knowledge_base: scope: auto issues: scope: local - jira: - usage: auto - project_keys: [] - linear: - usage: auto - team_keys: [] pull_requests: scope: local mcp: @@ -301,20 +239,12 @@ code_generation: instructions: "Generate API integration tests with real magic files, error condition testing, and memory safety validation. Include compatibility tests against libmagic." - path: "src/main.rs" instructions: "Test CLI interface with various arguments, error conditions, and cross-platform behavior. Include help text validation and exit code testing." - - path: "src/parser/ast.rs" - instructions: "Test AST serialization/deserialization, memory layout validation, and type safety. Include property tests for AST invariants and edge cases." - - path: "src/parser/grammar.rs" + - path: "src/parser/**/*.rs" instructions: "Test parser with malformed magic files, boundary conditions, escape sequences, and performance. Include property tests for parser correctness and fuzzing with proptest." - - path: "src/evaluator/offset.rs" - instructions: "Test offset resolution with various file types, boundary conditions, and endianness scenarios. Include memory safety tests and performance benchmarks." - - path: "src/evaluator/types.rs" - instructions: "Test type interpretation with edge cases, overflow conditions, and endianness handling. Include property tests for type conversion invariants." - - path: "src/evaluator/operators.rs" - instructions: "Test operators with various data types, edge cases, and performance scenarios. Include correctness tests and type coercion validation." - - path: "src/output/text.rs" - instructions: "Test text output with various file types, encoding scenarios, and GNU file compatibility. Include performance tests for large result sets." - - path: "src/output/json.rs" - instructions: "Test JSON output with schema validation, metadata accuracy, and serialization performance. Include API versioning and backward compatibility tests." + - path: "src/evaluator/**/*.rs" + instructions: "Test type interpretation, offset resolution, and operators with edge cases, overflow conditions, and endianness handling. Include property tests for invariants." + - path: "src/output/**/*.rs" + instructions: "Test output formatting with schema validation, metadata accuracy, GNU file compatibility, and serialization performance." - path: "src/io/**/*.rs" instructions: "Test I/O operations with truncated files, permission errors, memory mapping edge cases, and resource cleanup. Include memory safety tests and performance benchmarks." - path: "src/error.rs" diff --git a/.gitignore b/.gitignore index ef35e0cb..a3b3e3e6 100644 --- a/.gitignore +++ b/.gitignore @@ -146,3 +146,6 @@ docs/plans/ .full-review/ SECURITY_AUDIT.md todos/ +**/tessl__* +.tessl/tiles/ +.tessl/RULES.md diff --git a/.tessl/.gitignore b/.tessl/.gitignore new file mode 100644 index 00000000..7bbb3941 --- /dev/null +++ b/.tessl/.gitignore @@ -0,0 +1,2 @@ +tiles/ +RULES.md diff --git a/AGENTS.md b/AGENTS.md index 08a6d932..964af34f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,8 @@ This document provides comprehensive guidelines for AI assistants working on the libmagic-rs project, ensuring consistent, high-quality development practices and project understanding. +@GOTCHAS.md + ## Project Overview **libmagic-rs** is a pure-Rust implementation of libmagic, designed to replace the C-based library with a memory-safe, efficient alternative for file type detection. @@ -154,15 +156,10 @@ pub fn evaluate_magic_rules( ### Architecture Constraints -- `src/error.rs` is shared with `build.rs` -- cannot reference lib-only types like `crate::io::IoError` -- `FileError(String)` wraps structured I/O errors as strings to work around the build.rs constraint -- Serialization functions live in `src/parser/codegen.rs`, shared by both `build.rs` (via `#[path]` include) and `src/build_helpers.rs` (via `crate::parser::codegen`); `format_parse_error` remains duplicated in both because `ParseError` has different import paths -- Use `ParseError::IoError` for I/O errors in parser code, not `ParseError::invalid_syntax` -- Use `LibmagicError::ConfigError` for config validation, not `ParseError::invalid_syntax` - Clippy pedantic lints are active (e.g., prefer `trailing_zeros()` over bitwise masks) -- All public enum variants need `# Examples` rustdoc sections - Comparison operators share a `compare_values() -> Option` helper in `operators/comparison.rs` -- new comparison logic goes there, not in individual `apply_*` functions -- libmagic types are signed by default (`byte`, `short`, `long`, `quad`); unsigned variants use `u` prefix (`ubyte`, `ushort`, `ulong`, `uquad`, etc.) + +> See **[GOTCHAS.md](GOTCHAS.md)** for build script boundaries (S1), enum variant update checklists (S2), parser architecture (S3), numeric type pitfalls (S5), string/pstring encoding (S6), and other non-obvious behaviors. ### Naming Conventions @@ -208,7 +205,7 @@ cargo test --doc # Test documentation examples ### Currently Implemented (v0.1.0) - **Offsets**: Absolute and from-end specifications (indirect and relative are parsed but not yet evaluated) -- **Types**: `byte`, `short`, `long`, `quad`, `float`, `double`, `string`, `pstring` with endianness support; unsigned variants `ubyte`, `ushort`/`ubeshort`/`uleshort`, `ulong`/`ubelong`/`ulelong`, `uquad`/`ubequad`/`ulequad`; float/double endian variants `befloat`/`lefloat`, `bedouble`/`ledouble`; 32-bit date/timestamp types `date`/`ldate`/`bedate`/`beldate`/`ledate`/`leldate`; 64-bit date/timestamp types `qdate`/`qldate`/`beqdate`/`beqldate`/`leqdate`/`leqldate`; `pstring` is a Pascal string (length-prefixed byte followed by string data); date values formatted as `"Www Mmm DD HH:MM:SS YYYY"` matching GNU `file` output; types are signed by default (libmagic-compatible) +- **Types**: `byte`, `short`, `long`, `quad`, `float`, `double`, `string`, `pstring` with endianness support; unsigned variants `ubyte`, `ushort`/`ubeshort`/`uleshort`, `ulong`/`ubelong`/`ulelong`, `uquad`/`ubequad`/`ulequad`; float/double endian variants `befloat`/`lefloat`, `bedouble`/`ledouble`; 32-bit date/timestamp types `date`/`ldate`/`bedate`/`beldate`/`ledate`/`leldate`; 64-bit date/timestamp types `qdate`/`qldate`/`beqdate`/`beqldate`/`leqdate`/`leqldate`; `pstring` is a Pascal string (length-prefixed) with support for 1/2/4-byte length prefixes via `/B`, `/H` (2-byte BE), `/h` (2-byte LE), `/L` (4-byte BE), `/l` (4-byte LE) suffixes, and the `/J` flag (stored length includes prefix width, JPEG convention) which is combinable with width suffixes (e.g., `pstring/HJ`); date values formatted as "Www Mmm DD HH:MM:SS YYYY" matching GNU `file` output; types are signed by default (libmagic-compatible) - **Operators**: `=` (equal), `!=` (not equal), `<` (less than), `>` (greater than), `<=` (less equal), `>=` (greater equal), `&` (bitwise AND with optional mask), `^` (bitwise XOR), `~` (bitwise NOT), `x` (any value) - **Nested Rules**: Hierarchical rule evaluation with proper indentation - **String Matching**: Exact string matching with null-termination and Pascal string (length-prefixed) support @@ -240,7 +237,7 @@ impl BinaryRegex for regex::bytes::Regex { - No regex/search pattern matching - 64-bit integer types: `quad`/`uquad`, `bequad`/`ubequad`, `lequad`/`ulequad` are implemented; `qquad` (128-bit) is not yet supported - String evaluation reads until first NUL or end-of-buffer by default; `pstring` reads a length-prefixed Pascal string; `max_length: Some(_)` is supported internally but no dedicated fixed-length string parser syntax exists yet -- `pstring` only supports the default 1-byte length prefix (`/B`); multi-byte length prefix variants (`pstring/H` for 2-byte, `pstring/L` for 4-byte) are not yet implemented +- `pstring` supports 1-byte (`/B`), 2-byte big-endian (`/H`), 2-byte little-endian (`/h`), 4-byte big-endian (`/L`), and 4-byte little-endian (`/l`) length prefixes, plus the `/J` flag (stored length includes prefix width). All flags are combinable (e.g., `pstring/HJ`) and fully implemented. ### Operators diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 00b89d8d..d14d0cd2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -361,10 +361,22 @@ libmagic-rs uses a **maintainer-driven** governance model. Decisions are made by As the project grows, active contributors who demonstrate sustained, high-quality contributions and alignment with project goals may be invited to become maintainers. +## Known Gotchas + +Before diving into the codebase, read **[GOTCHAS.md](GOTCHAS.md)** -- it documents non-obvious behaviors, common pitfalls, and architectural quirks that will save you debugging time. Key topics include: + +- Build script boundary constraints and shared code between `build.rs` and the library (S1) +- Enum variant update checklists for `TypeKind`, `Operator`, and `Value` (S2) +- Parser architecture split between `types.rs` and `grammar/mod.rs` (S3) +- Numeric type conversion and checked arithmetic requirements (S5) +- PString multi-byte length prefix endianness (S6) +- Clippy lint surprises and `unsafe_code = "forbid"` enforcement (S8) + ## Getting Help - **Issues**: For bug reports and feature requests - **Discussions**: For questions and ideas - **Documentation**: Check [docs/](docs/) for detailed guides +- **Gotchas**: Check [GOTCHAS.md](GOTCHAS.md) for known pitfalls Thank you for contributing to libmagic-rs! diff --git a/GOTCHAS.md b/GOTCHAS.md new file mode 100644 index 00000000..dfcf651c --- /dev/null +++ b/GOTCHAS.md @@ -0,0 +1,189 @@ +# Development Gotchas & Pitfalls + +This document tracks non-obvious behaviors, common pitfalls, and architectural "gotchas" in the libmagic-rs codebase to assist future maintainers and contributors. + +## 1. Build Script Boundary + +### 1.1 `error.rs` is Shared with `build.rs` + +`src/error.rs` is compiled by both the library and `build.rs`. It cannot reference lib-only types like `crate::io::IoError`. + +- **Workaround:** `FileError(String)` wraps structured I/O errors as strings. +- **Rule:** Use `ParseError::IoError` for I/O errors in parser code, not `ParseError::invalid_syntax`. + +### 1.2 Serialization Code Sharing + +Serialization functions live in `src/parser/codegen.rs`, shared by both `build.rs` (via `#[path]` include) and `src/build_helpers.rs` (via `crate::parser::codegen`). Only `format_parse_error` remains duplicated because `ParseError` has different import paths in each context. + +### 1.3 Generated Imports Must Stay in Sync + +`generate_builtin_rules()` emits a `use crate::parser::ast::{...}` import line in generated code. When adding new AST types (e.g., `PStringLengthWidth`), this import must be updated or the build pipeline breaks on any rule that uses the new type. + +- **Symptom:** Build fails with `E0433` ("cannot find type") in the generated `builtin_rules.rs`. +- **Fix:** Update the import in `generate_builtin_rules()` in `src/parser/codegen.rs`. + +## 2. Adding Enum Variants + +### 2.1 `TypeKind` Exhaustive Matches + +Adding a variant to `TypeKind` requires updating exhaustive matches in 10+ files: `ast`, `grammar`, `types`, `codegen`, `strength`, `property_tests`, `evaluator/types/mod.rs` (`read_typed_value`, `coerce_value_to_type`), `output/mod.rs` (2 length matches), `output/json.rs` (`format_value_as_hex`), and `grammar/tests.rs` (stale assertions). Note: `coerce_value_to_type` and output matches use catch-all `_ =>` so they compile without changes but may need semantic updates. + +### 2.2 `Operator` Exhaustive Matches + +Adding a variant to `Operator` requires updating: `ast`, `grammar`, `codegen`, `strength`, `property_tests`, `evaluator/operators/`. + +### 2.3 `Value` Exhaustive Matches + +Adding a variant to `Value` requires updating: `ast`, `codegen`, `strength`, `property_tests`, `output/mod.rs` (2 length matches), `output/json.rs` (`format_value_as_hex`), `evaluator/operators/comparison.rs` (`compare_values`). Bitwise/equality operators use catch-all `_ =>` so they are safe. + +- **Note:** `Value` no longer derives `Eq` (removed when `Value::Float(f64)` was added) -- no production code depends on `Value: Eq`. + +## 3. Parser Architecture + +### 3.1 Type Keyword Parsing Split + +Type keyword parsing lives in `src/parser/types.rs` (`parse_type_keyword` + `type_keyword_to_kind`); the grammar module in `src/parser/grammar/mod.rs` orchestrates and handles type-specific suffixes (e.g., pstring `/B`, `/H`, `/L`). + +- **Gotcha:** `parse_type_keyword` only consumes the base keyword (e.g., `"pstring"`), leaving suffixes for the grammar layer. +- **Gotcha:** `parse_type_and_operator` consumes trailing whitespace after parsing type+suffix -- test expectations for remaining input must account for this. + +### 3.2 `parse_number` Return Type + +`parse_number` returns `i64` and is shared by offset + value parsing. Never widen its return type. Use the separate `parse_unsigned_number` (`u64`) path only in `parse_numeric_value`. + +### 3.3 nom `tuple` Combinator + +The nom `tuple` combinator is deprecated. Use bare tuple syntax `(a, b, c)` directly as `Parser` is implemented for tuples. + +### 3.4 `type_keyword_to_kind` Line Length + +`type_keyword_to_kind` has `#[allow(clippy::too_many_lines)]` because it exceeds 100 lines with all date keywords. + +## 4. Module Visibility & Re-exports + +### 4.1 Private Engine Module + +`evaluator::engine` is private. Integration tests must import `libmagic_rs::evaluator::evaluate_rules` (re-exported), not `evaluator::engine::evaluate_rules`. + +### 4.2 `MatchResult` Name Collision + +`evaluator::MatchResult` was renamed to `evaluator::RuleMatch` (issue #60). `output::MatchResult` is the public-facing type. Do not confuse the two. + +### 4.3 Glob Re-exports + +Prefer `pub mod` over `pub use module::*` for submodules -- glob re-exports expand public API surface unintentionally. + +## 5. Numeric Types & Arithmetic + +### 5.1 `usize::from(u32)` Does Not Compile + +There is no `From for usize` (fails on 32-bit targets). Use `as usize` for widening conversions from `u32`. + +### 5.2 Checked Arithmetic for Offsets + +Buffer offset calculations must use `checked_add` to prevent overflow panics from malicious/malformed input. Always test with `usize::MAX` -- otherwise regressions go undetected. + +### 5.3 Float Epsilon Equality + +`inf - inf = NaN`, so `(a - b).abs() <= epsilon` fails for infinities. Handle `is_infinite()` before the epsilon check. Use `#[allow(clippy::float_cmp)]` on the exact infinity comparison. + +- **Note:** `apply_equal`/`apply_not_equal` use epsilon-aware comparison for `Value::Float`; ordering operators (`<`, `>`, `<=`, `>=`) use direct `partial_cmp` -- these are deliberately different semantics. + +## 6. String & PString Types + +### 6.1 Multi-Byte PString Length Prefixes + +Uppercase pstring suffix letters indicate **big-endian** byte order, lowercase indicate **little-endian**: `/H` (2-byte BE), `/h` (2-byte LE), `/L` (4-byte BE), `/l` (4-byte LE). The `/J` flag means the stored length includes the length field itself (JPEG convention). Flags are combinable (e.g., `pstring/HJ`). Test data must use the correct byte order for the variant (e.g., `\x00\x05` for `TwoByteBE` length 5, `\x05\x00` for `TwoByteLE` length 5). + +### 6.2 `medate`/`meldate` Not Supported + +Middle-endian date keywords are NOT supported. They were removed until real middle-endian decoding is implemented end-to-end. + +### 6.3 Signed-by-Default Types + +libmagic types are signed by default (`byte`, `short`, `long`, `quad`). Unsigned variants use `u` prefix (`ubyte`, `ushort`, `ulong`, `uquad`, etc.). + +## 7. Testing + +### 7.1 Doctest Import Paths + +Doctests compile as external crates. Use `libmagic_rs::` imports, never `crate::`. + +### 7.2 Heterogeneous Byte-Array Test Tables + +Table-driven tests with byte-array buffers of different sizes need `&[(&[u8], ...)]` slice type annotation. Bare arrays can't hold heterogeneous-length byte literals in a `[T; N]`. + +### 7.3 Doc Examples and Pattern Matching + +`cargo test --doc` verifies doc examples. Ensure example strings don't accidentally match multiple patterns. + +### 7.4 `clap` Args in Tests + +Adding fields to the clap `Args` struct requires updating ALL manual `Args { ... }` constructions in unit tests (search for `Args {` in `src/main.rs`). `process_file` and `run_analysis` are called from unit tests with mocked stdin -- signature changes require updating all test call sites. + +### 7.5 Reserved-for-Future Enum Variants + +Reserved-for-future enum variants (e.g., `TypeReadError::UnsupportedType`) need explicit doc comments explaining intent -- otherwise every reviewer flags them as dead code. + +## 8. Clippy & Formatting + +### 8.1 Enum Variant Naming + +Enum variants with a shared suffix (e.g., `OneByte`, `TwoByte`, `FourByte`) trigger `clippy::enum_variant_names`. Add `#[allow(clippy::enum_variant_names)]` when renaming would hurt readability. + +### 8.2 `unsafe_code = "forbid"` is Absolute + +`unsafe_code = "forbid"` in `Cargo.toml` workspace lints cannot be overridden with `#[allow(unsafe_code)]`. Use vetted crates (e.g., `chrono` for timezone) instead of libc FFI or subprocess calls. + +### 8.3 Pre-commit Hook Reformats + +Pre-commit hook runs `cargo fmt`. First commit attempt may fail if fmt modifies files -- just re-stage and retry. + +## 9. Error Handling Patterns + +### 9.1 Error Return Path Cleanup + +On error return paths, use `let _ = cleanup()` not `cleanup()?` -- the `?` masks the original error with cleanup failures. Bind original errors with `e @` pattern instead of reconstructing after state changes (avoids data loss). + +### 9.2 Use Correct Error Constructors + +- Use `ParseError::IoError` for I/O errors in parser code, not `ParseError::invalid_syntax` +- Use `LibmagicError::ConfigError` for config validation, not `ParseError::invalid_syntax` + +## 10. Documentation + +### 10.1 Extracting Public Functions + +When extracting public functions to new modules, verify `# Examples`, `# Arguments`, `# Returns` doc sections are preserved -- they silently disappear. + +### 10.2 `# Returns` and `# Security` Accuracy + +When refactoring docs, verify `# Returns` and `# Security` sections match actual behavior. For example, `from_utf8_lossy` never errors but docs may claim it does. + +### 10.3 Public Enum Variant Docs + +All public enum variants need `# Examples` rustdoc sections. Clippy enforces this. + +## 11. Date/Timestamp Types + +### 11.1 Date Value Representation + +`read_date`/`read_qdate` return `Value::Uint(raw_secs)` for correct numeric comparisons. Use `format_timestamp_value(secs, utc)` at output time for GNU `file`-compatible display. There is no special date coercion in `coerce_value_to_type`. + +### 11.2 Timezone Handling + +`chrono` crate (no default features, `std` + `clock`) is a vetted dependency for timezone offset computation in `evaluator/types/date.rs`. Do not use libc FFI or subprocess calls (see S8.2). + +## 12. Git & Release + +### 12.1 Repository Identity + +Repository is **EvilBit-Labs/libmagic-rs**. NEVER guess repo/org names -- always run `git remote -v` to verify. + +### 12.2 Tag Signing + +All tags and commits MUST be signed -- use `git tag -s` and `git commit -s -S`. If signing fails, STOP and troubleshoot. Never push unsigned tags/commits. Never delete or force-push tags without explicit user permission. + +### 12.3 Auto-Generated Release Workflow + +`.github/workflows/release.yml` is auto-generated by cargo-dist -- never modify manually. diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md index 427c549c..b6b42e25 100644 --- a/docs/API_REFERENCE.md +++ b/docs/API_REFERENCE.md @@ -296,17 +296,17 @@ Data type specifications. use libmagic_rs::TypeKind; ``` -| Variant | Description | -| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Byte { signed }` | Single byte with explicit signedness (changed in v0.2.0) | -| `Short { endian, signed }` | 16-bit integer | -| `Long { endian, signed }` | 32-bit integer | -| `Quad { endian, signed }` | 64-bit integer | -| `Float { endian }` | 32-bit IEEE 754 floating-point | -| `Double { endian }` | 64-bit IEEE 754 floating-point | -| `Date { endian, utc }` | 32-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. Date values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | +| Variant | Description | +| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Byte { signed }` | Single byte with explicit signedness (changed in v0.2.0) | +| `Short { endian, signed }` | 16-bit integer | +| `Long { endian, signed }` | 32-bit integer | +| `Quad { endian, signed }` | 64-bit integer | +| `Float { endian }` | 32-bit IEEE 754 floating-point | +| `Double { endian }` | 64-bit IEEE 754 floating-point | +| `Date { endian, utc }` | 32-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. Date values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | | `QDate { endian, utc }` | 64-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. QDate values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | -| `String { max_length }` | String data | +| `String { max_length }` | String data | ##### 64-bit Integer Types @@ -381,12 +381,12 @@ Value types for matching. use libmagic_rs::Value; ``` -| Variant | Description | -| ---------------- | --------------------------------------------------------------------------------- | -| `Uint(u64)` | Unsigned integer | -| `Int(i64)` | Signed integer | -| `Float(f64)` | 64-bit floating-point value | -| `Bytes(Vec)` | Byte sequence | +| Variant | Description | +| ---------------- | -------------------------------------------------------------------------------------- | +| `Uint(u64)` | Unsigned integer | +| `Int(i64)` | Signed integer | +| `Float(f64)` | 64-bit floating-point value | +| `Bytes(Vec)` | Byte sequence | | `String(String)` | String value (also used for date/timestamp values formatted as human-readable strings) | **Note:** `Value` implements `PartialEq` but not `Eq` due to IEEE 754 NaN semantics (NaN is not equal to itself). diff --git a/docs/MAGIC_FORMAT.md b/docs/MAGIC_FORMAT.md index ef2e2eb1..6c1e2929 100644 --- a/docs/MAGIC_FORMAT.md +++ b/docs/MAGIC_FORMAT.md @@ -188,16 +188,38 @@ String escape sequences: **Pascal String (pstring)** -Length-prefixed string type where the first byte contains the string length (0-255), followed by that many bytes of string data. Unlike C strings, Pascal strings are not null-terminated. +Length-prefixed string type where a length prefix (1, 2, or 4 bytes) specifies the number of bytes of string data that follow. Unlike C strings, Pascal strings are not null-terminated. + +The length prefix width is controlled by suffix flags: + +| Suffix | Length Prefix Width | Byte Order | +| ------ | ------------------- | ------------- | +| `/B` | 1 byte (default) | N/A | +| `/H` | 2 bytes | big-endian | +| `/h` | 2 bytes | little-endian | +| `/L` | 4 bytes | big-endian | +| `/l` | 4 bytes | little-endian | + +The `/J` flag indicates JPEG-style self-inclusive length where the stored length value includes the size of the length prefix itself. This flag can be combined with any width suffix (`/HJ`, `/lJ`, etc.) or used alone (`/J` defaults to 1-byte width). + +Examples: ```text -0 pstring =JPEG JPEG image (Pascal string) +0 pstring =JPEG JPEG image (1-byte prefix, default) +0 pstring/B =JPEG JPEG image (1-byte prefix, explicit) +0 pstring/H =JPEG JPEG image (2-byte big-endian prefix) +0 pstring/h =JPEG JPEG image (2-byte little-endian prefix) +0 pstring/L =JPEG JPEG image (4-byte big-endian prefix) +0 pstring/l =JPEG JPEG image (4-byte little-endian prefix) +0 pstring/HJ =JPEG JPEG image (2-byte BE, self-inclusive length) ``` -The length byte value determines how many bytes to read for the string data. If `max_length` is specified in the magic file (not shown in the basic syntax), it caps the length byte value to prevent reading excessive data. +If `max_length` is specified in the magic file (not shown in the basic syntax), it caps the length value to prevent reading excessive data. ### String Flags +Flags for `string` type: + | Flag | Description | | ---- | ---------------------- | | `/c` | Case-insensitive match | @@ -210,6 +232,8 @@ Example: 0 string/c `), comparison (`<`, `>`, `<=`, `>=`), bitwise (`&`, `^`, `~`), and any-value (`x`) operators - ✅ **Value parsing**: Strings, numbers, and hex byte sequences with escape sequences +- ✅ **PString suffixes**: `/B`, `/H`, `/h`, `/L`, `/l` (length prefix width), `/J` (self-inclusive length) - ✅ **Error handling**: Comprehensive nom error handling with meaningful messages - ✅ **Rule parsing**: Complete rule parsing via `parse_magic_rule()` - ✅ **File parsing**: Complete magic file parsing with `parse_text_magic_file()` @@ -95,7 +96,11 @@ pub enum TypeKind { Long { endian: Endianness, signed: bool }, Quad { endian: Endianness, signed: bool }, String { max_length: Option }, - PString { max_length: Option }, // Pascal string (length-prefixed) + PString { + max_length: Option, + length_width: PStringLengthWidth, + length_includes_itself: bool, + }, // Pascal string (length-prefixed) } pub enum Operator { @@ -111,6 +116,14 @@ pub enum Operator { BitwiseNot, // ~ AnyValue, // x (always matches) } + +pub enum PStringLengthWidth { + OneByte, // 1-byte prefix (default, /B) + TwoByteBE, // 2-byte big-endian prefix (/H) + TwoByteLE, // 2-byte little-endian prefix (/h) + FourByteBE, // 4-byte big-endian prefix (/L) + FourByteLE, // 4-byte little-endian prefix (/l) +} ``` **Design Principles:** @@ -121,6 +134,18 @@ pub enum Operator { - **Type-safe**: Rust's type system prevents invalid rule combinations - **Explicit signedness**: `TypeKind::Byte` and integer types (Short, Long, Quad) distinguish signed from unsigned interpretations +**PString Length Prefix Support:** + +The `PString` type supports multiple length prefix formats through the `length_width` field: + +- **OneByte** (`/B`): Default 1-byte length prefix (0-255 range) +- **TwoByteBE** (`/H`): 2-byte big-endian prefix +- **TwoByteLE** (`/h`): 2-byte little-endian prefix +- **FourByteBE** (`/L`): 4-byte big-endian prefix +- **FourByteLE** (`/l`): 4-byte little-endian prefix + +The `length_includes_itself` field (controlled by the `/J` suffix) indicates JPEG-style self-inclusive length, where the stored length value includes the length field itself. This can be combined with any width variant (e.g., `/HJ` for 2-byte big-endian with self-inclusive length). + ### 3. Evaluator Module (`src/evaluator/`) The evaluator executes magic rules against file buffers to identify file types. (✅ Complete) @@ -134,7 +159,7 @@ The evaluator executes magic rules against file buffers to identify file types. - `types/`: Type interpretation submodule - `mod.rs`: Public API surface with `read_typed_value`, `coerce_value_to_type`, and type re-exports - `numeric.rs`: Numeric type handling (`read_byte`, `read_short`, `read_long`, `read_quad`) with endianness and signedness support - - `string.rs`: String type handling (`read_string`) with null-termination and UTF-8 conversion + - `string.rs`: String type handling (`read_string`, `read_pstring`) with null-termination, UTF-8 conversion, and multi-byte length prefix support - `tests.rs`: Module tests - `offset/`: Offset resolution submodule - `mod.rs`: Dispatcher (`resolve_offset`) and re-exports diff --git a/docs/src/ast-structures.md b/docs/src/ast-structures.md index b126e56d..0910425f 100644 --- a/docs/src/ast-structures.md +++ b/docs/src/ast-structures.md @@ -186,7 +186,11 @@ pub enum TypeKind { String { max_length: Option }, /// Pascal string (length-prefixed) - PString { max_length: Option }, + PString { + max_length: Option, + length_width: PStringLengthWidth, + length_includes_itself: bool, + }, } ``` @@ -231,34 +235,91 @@ let string_type = TypeKind::String { ### PString (Pascal String) -Pascal-style length-prefixed strings where the first byte contains the string length. +Pascal-style length-prefixed strings where the length prefix can be 1, 2, or 4 bytes depending on the `length_width` field. **Structure:** -- Length byte: 1 byte indicating string length (0-255) -- String data: The number of bytes specified by the length byte +- Length prefix: 1, 2, or 4 bytes indicating string length, with configurable endianness +- String data: The number of bytes specified by the length prefix **Example:** ``` 0 pstring JPEG +0 pstring/H JPEG ``` -Reads one byte as length, then reads that many bytes as a string. +The first line reads a 1-byte length prefix (default), then reads that many bytes as a string. The second line reads a 2-byte big-endian length prefix. **Behavior:** - Returns `Value::String` containing the string data (without the length prefix) -- Performs bounds checking on both the length byte and the string data +- Performs bounds checking on both the length prefix and the string data - Supports all string comparison operators +- Length prefix width controlled by `PStringLengthWidth` enum +- Optional `/J` flag indicates JPEG-style self-inclusive length (stored length includes the prefix itself) + +### PStringLengthWidth Enum + +The `PStringLengthWidth` enum specifies the width and endianness of the length prefix: + +```rust +pub enum PStringLengthWidth { + /// 1-byte length prefix (default, `/B` suffix) + OneByte, + /// 2-byte big-endian length prefix (`/H` suffix) + TwoByteBE, + /// 2-byte little-endian length prefix (`/h` suffix) + TwoByteLE, + /// 4-byte big-endian length prefix (`/L` suffix) + FourByteBE, + /// 4-byte little-endian length prefix (`/l` suffix) + FourByteLE, +} +``` + +**Suffix conventions:** +- `/B` - 1-byte length prefix (default if no suffix specified) +- `/H` - 2-byte big-endian length prefix +- `/h` - 2-byte little-endian length prefix +- `/L` - 4-byte big-endian length prefix +- `/l` - 4-byte little-endian length prefix +- `/J` - Length includes the prefix width itself (combinable: `/HJ`, `/lJ`, etc.) -**Usage:** +**Examples:** ```rust -// Pascal string with no length limit -let pstring_type = TypeKind::PString { - max_length: None +use libmagic_rs::parser::ast::{TypeKind, PStringLengthWidth}; + +// 1-byte length prefix (default) +let pstring_default = TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, +}; + +// 2-byte big-endian length prefix +let pstring_be = TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::TwoByteBE, + length_includes_itself: false, +}; + +// 4-byte little-endian length prefix +let pstring_le = TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::FourByteLE, + length_includes_itself: false, +}; + +// 2-byte big-endian with /J flag (JPEG-style self-inclusive length) +let pstring_jpeg = TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::TwoByteBE, + length_includes_itself: true, }; -// Pascal string with maximum 64-byte limit +// Maximum 64-byte limit with 1-byte prefix let limited_pstring = TypeKind::PString { - max_length: Some(64) + max_length: Some(64), + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, }; ``` diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index 2437c6de..f6f70183 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -34,7 +34,7 @@ The evaluator module separates public interface from implementation: - **`types/numeric.rs`** - Numeric type handling: `read_byte`, `read_short`, `read_long`, `read_quad` with endianness and signedness support - **`types/float.rs`** - Floating-point type handling: `read_float` (32-bit IEEE 754), `read_double` (64-bit IEEE 754) with endianness support - **`types/date.rs`** - Date and timestamp type handling: `read_date` (32-bit Unix timestamps), `read_qdate` (64-bit Unix timestamps) with endianness and UTC/local time support - - **`types/string.rs`** - String type handling: `read_string` with null-termination and UTF-8 conversion + - **`types/string.rs`** - String type handling: `read_string` with null-termination and UTF-8 conversion, `read_pstring` with configurable length-prefix widths (1, 2, or 4 bytes) - **`types/tests.rs`** - Module tests - **`evaluator/strength.rs`** - Rule strength calculation @@ -118,6 +118,7 @@ Interprets bytes according to type specifications. The types module is organized - **Date**: 32-bit Unix timestamps (signed seconds since epoch) with configurable endianness and UTC/local time formatting - **QDate**: 64-bit Unix timestamps (signed seconds since epoch) with configurable endianness and UTC/local time formatting - **String**: Byte sequences with length limits +- **PString**: Pascal-style length-prefixed strings with 1-byte (`/B`), 2-byte (`/H` or `/h`), or 4-byte (`/L` or `/l`) length prefixes, supporting big-endian and little-endian byte order - **Bounds checking**: Prevents buffer overruns ```rust @@ -175,6 +176,34 @@ pub fn read_qdate( - The evaluator reads raw integer timestamps from the buffer and converts them to formatted date strings for comparison - Example: A 32-bit value `1234567890` at offset 0 with type `ldate` would be evaluated as `"Fri Feb 13 23:31:30 2009"` +**Pascal String Type Reading (`evaluator/types/string.rs`):** + +```rust +pub fn read_pstring( + buffer: &[u8], + offset: usize, + max_length: Option, + length_width: PStringLengthWidth, + length_includes_itself: bool, +) -> Result +``` + +- `read_pstring()` reads a length-prefixed Pascal string with configurable prefix width +- **Length prefix width** (`length_width`): + - `PStringLengthWidth::OneByte` - 1-byte length prefix (`/B` suffix, default) + - `PStringLengthWidth::TwoByteBE` - 2-byte big-endian length prefix (`/H` suffix) + - `PStringLengthWidth::TwoByteLE` - 2-byte little-endian length prefix (`/h` suffix) + - `PStringLengthWidth::FourByteBE` - 4-byte big-endian length prefix (`/L` suffix) + - `PStringLengthWidth::FourByteLE` - 4-byte little-endian length prefix (`/l` suffix) +- **Length interpretation**: + - Reads 1, 2, or 4 bytes from buffer using `from_be_bytes` or `from_le_bytes` depending on variant + - The length value specifies how many bytes of string data follow the prefix +- **`/J` flag** (`length_includes_itself`): + - When `true`, the stored length value includes the prefix width itself (JPEG-style) + - The evaluator subtracts the prefix width (1, 2, or 4 bytes) from the length to get effective content length + - Example: A 2-byte big-endian prefix with value `7` and `/J` flag yields `7 - 2 = 5` bytes of string content +- Returns `Value::String` with UTF-8 conversion (using lossy conversion for invalid UTF-8) + ### Operator Application (`evaluator/operators.rs`) Applies comparison operations: @@ -486,11 +515,42 @@ let matches = evaluate_rules(&rules, &buffer)?; assert_eq!(matches[0].message, "Pi constant detected"); ``` +**Example with pstring types:** + +```rust +use libmagic_rs::{evaluate_rules, EvaluationConfig}; +use libmagic_rs::parser::parse_text_magic_file; + +// Parse magic rules with pstring variants +let magic_content = r#" +0 pstring/B MAGIC Pascal string (1-byte prefix) +0 pstring/H =\x00\x05MAGIC Pascal string (2-byte BE prefix) +0 pstring/h =\x05\x00MAGIC Pascal string (2-byte LE prefix) +0 pstring/L =\x00\x00\x00\x05MAGIC Pascal string (4-byte BE prefix) +0 pstring/l =\x05\x00\x00\x00MAGIC Pascal string (4-byte LE prefix) +"#; +let rules = parse_text_magic_file(magic_content)?; + +// 1-byte prefix: length=5, then "MAGIC" +let buffer = b"\x05MAGIC"; +let matches = evaluate_rules(&rules, &buffer)?; +assert_eq!(matches[0].message, "Pascal string (1-byte prefix)"); + +// 2-byte big-endian prefix with /J flag: stored length 7 (includes 2-byte prefix), effective content 5 bytes +let magic_content_j = r#" +0 pstring/HJ =MAGIC JPEG-style pstring with self-inclusive length +"#; +let rules_j = parse_text_magic_file(magic_content_j)?; +let buffer_j = b"\x00\x07MAGIC"; // 2-byte BE prefix: value 7, minus 2 = 5 bytes of content +let matches_j = evaluate_rules(&rules_j, &buffer_j)?; +assert_eq!(matches_j[0].message, "JPEG-style pstring with self-inclusive length"); +``` + ## Implementation Status - [x] Basic evaluation engine structure - [x] Offset resolution (absolute, relative, from-end) -- [x] Type reading with endianness support (Byte, Short, Long, Quad, Float, Double, Date, QDate, String) +- [x] Type reading with endianness support (Byte, Short, Long, Quad, Float, Double, Date, QDate, String, PString with 1/2/4-byte prefixes) - [x] Operator application (Equal, NotEqual, LessThan, GreaterThan, LessEqual, GreaterEqual, BitwiseAnd, BitwiseAndMask) - [x] Hierarchical rule processing with child evaluation - [x] Error handling with graceful degradation diff --git a/docs/src/magic-format.md b/docs/src/magic-format.md index 354dac18..8a372f52 100644 --- a/docs/src/magic-format.md +++ b/docs/src/magic-format.md @@ -227,13 +227,57 @@ String escape sequences: ### Pascal String Type -Pascal string (pstring) is a length-prefixed string type. The first byte contains the string length (0-255), followed by that many bytes of string data. Unlike C strings, Pascal strings are not null-terminated. +Pascal string (pstring) is a length-prefixed string type. The length prefix can be 1, 2, or 4 bytes depending on the suffix flag. Unlike C strings, Pascal strings are not null-terminated. + +#### Length Prefix Width + +The default pstring type uses a 1-byte length prefix (0-255 range). Use suffix flags to specify different prefix widths: + +| Suffix | Width | Endianness | Range | +| ------ | ------- | ------------- | --------------- | +| `/B` | 1 byte | N/A | 0-255 (default) | +| `/H` | 2 bytes | big-endian | 0-65535 | +| `/h` | 2 bytes | little-endian | 0-65535 | +| `/L` | 4 bytes | big-endian | 0-4294967295 | +| `/l` | 4 bytes | little-endian | 0-4294967295 | + +#### Self-Inclusive Length (`/J` Flag) + +The `/J` flag indicates that the stored length value includes the size of the length prefix itself (JPEG-style). This flag can be combined with any width variant. + +#### Examples + +Basic pstring with default 1-byte prefix: ```text 0 pstring =JPEG JPEG image (Pascal string) ``` -The evaluator reads the length byte, then reads that many bytes as string data. The optional max_length parameter caps the length byte value: +2-byte big-endian length prefix: + +```text +0 pstring/H =JPEG JPEG image (2-byte BE prefix) +``` + +4-byte little-endian length prefix: + +```text +0 pstring/l x \b, name: %s +``` + +Self-inclusive length with 2-byte big-endian prefix: + +```text +0 pstring/HJ x \b, JPEG-style length +``` + +Self-inclusive length with default 1-byte prefix: + +```text +0 pstring/J x \b, self-inclusive length +``` + +The optional max_length parameter caps the length value: ```text 0 pstring x \b, name: %s diff --git a/docs/src/parser.md b/docs/src/parser.md index a2e69b02..3a253860 100644 --- a/docs/src/parser.md +++ b/docs/src/parser.md @@ -186,40 +186,77 @@ Parsed literals are stored as `Value::Float(f64)` in the AST, regardless of whet ### Pascal String (pstring) Type -The parser supports Pascal-style length-prefixed strings through the `pstring` keyword: +The parser supports Pascal-style length-prefixed strings through the `pstring` keyword with multiple length prefix width variants: **Type Keyword:** -- `pstring` - Length-prefixed string (1-byte length + string data) → `TypeKind::PString { max_length: None }` +- `pstring` - Length-prefixed string → `TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false }` + +**Length Prefix Width Variants:** + +Pascal strings support multiple length prefix widths via suffix modifiers: + +- `/B` - 1-byte length prefix (default) → `PStringLengthWidth::OneByte` +- `/H` - 2-byte big-endian length prefix → `PStringLengthWidth::TwoByteBE` +- `/h` - 2-byte little-endian length prefix → `PStringLengthWidth::TwoByteLE` +- `/L` - 4-byte big-endian length prefix → `PStringLengthWidth::FourByteBE` +- `/l` - 4-byte little-endian length prefix → `PStringLengthWidth::FourByteLE` + +**Self-Inclusive Length Flag (`/J`):** + +The `/J` flag indicates JPEG-style self-inclusive length, where the stored length value includes the length prefix bytes themselves. The evaluator subtracts the prefix width from the stored length to determine the actual string data length. + +The `/J` flag can be combined with any width variant: + +- `/J` - 1-byte self-inclusive (default width) +- `/BJ` - 1-byte self-inclusive (explicit) +- `/HJ` - 2-byte big-endian self-inclusive +- `/hJ` - 2-byte little-endian self-inclusive +- `/LJ` - 4-byte big-endian self-inclusive +- `/lJ` - 4-byte little-endian self-inclusive **Format:** -Pascal strings store the length as the first byte (0-255), followed by that many bytes of string data. Unlike C strings, they are not null-terminated. +Pascal strings store the length as a prefix (1, 2, or 4 bytes depending on the variant), followed by that many bytes of string data. Unlike C strings, they are not null-terminated. When the `/J` flag is used, the length value includes the prefix size itself. **Parser Implementation:** - Recognized by `parse_type_keyword()` in `src/parser/types.rs` -- Maps to `TypeKind::PString` in the AST -- Evaluator reads length prefix byte then that many bytes as string data +- Suffix parsing handled by `parse_pstring_suffix()` in `src/parser/grammar/mod.rs` +- Maps to `TypeKind::PString` in the AST with `length_width` and `length_includes_itself` fields +- Evaluator reads length prefix using appropriate byte order (`from_be_bytes` or `from_le_bytes`) - Stored as `Value::String` for comparison with string operators -- Supports optional `max_length` field to cap the length byte value +- Supports optional `max_length` field to cap the length value **Usage in Magic Rules:** ```rust -// Basic pstring matching +// Basic pstring matching (1-byte length prefix) 0 pstring =Hello // Match if pstring equals "Hello" 0 pstring x // Match any pstring value -// With max_length constraint (parsed separately) -0 pstring/64 x // Limit string read to 64 bytes +// Multi-byte length prefix variants +0 pstring/H =Test // 2-byte big-endian length prefix +0 pstring/h =Test // 2-byte little-endian length prefix +0 pstring/L =Test // 4-byte big-endian length prefix +0 pstring/l =Test // 4-byte little-endian length prefix + +// JPEG-style self-inclusive length +0 pstring/J x // 1-byte self-inclusive length +0 pstring/HJ =Data // 2-byte big-endian self-inclusive length +0 pstring/lJ =Data // 4-byte little-endian self-inclusive length + +// With max_length constraint +0 pstring/H/64 x // 2-byte prefix, limit read to 64 bytes ``` **Features:** -- ✅ Single type keyword `pstring` -- ✅ Length-prefixed format (1 byte length, 0-255 bytes data) -- ✅ Bounds checking for both length byte and string data +- ✅ Five length prefix width variants (1-byte, 2-byte BE/LE, 4-byte BE/LE) +- ✅ Self-inclusive length flag (`/J`) for JPEG-style length encoding +- ✅ Combinable suffix syntax (`/HJ`, `/lJ`, etc.) +- ✅ Bounds checking for both length prefix and string data +- ✅ Proper endianness handling via `from_be_bytes` / `from_le_bytes` - ✅ UTF-8 validation with replacement character for invalid sequences - ✅ Optional `max_length` parameter to limit string reads - ✅ String comparison operators work with pstring values diff --git a/src/build_helpers.rs b/src/build_helpers.rs index 98cac75d..6e7e0936 100644 --- a/src/build_helpers.rs +++ b/src/build_helpers.rs @@ -316,15 +316,28 @@ mod tests { #[test] fn test_serialize_type_kind_pstring() { - let typ1 = TypeKind::PString { max_length: None }; + use crate::parser::ast::PStringLengthWidth; + let typ1 = TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }; let serialized1 = serialize_type_kind(&typ1); - assert_eq!(serialized1, "TypeKind::PString { max_length: None }"); + assert_eq!( + serialized1, + "TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false }" + ); let typ2 = TypeKind::PString { max_length: Some(128), + length_width: PStringLengthWidth::FourByteLE, + length_includes_itself: false, }; let serialized2 = serialize_type_kind(&typ2); - assert_eq!(serialized2, "TypeKind::PString { max_length: Some(128) }"); + assert_eq!( + serialized2, + "TypeKind::PString { max_length: Some(128), length_width: PStringLengthWidth::FourByteLE, length_includes_itself: false }" + ); } #[test] diff --git a/src/evaluator/engine/tests.rs b/src/evaluator/engine/tests.rs index afcef3b4..b3352dac 100644 --- a/src/evaluator/engine/tests.rs +++ b/src/evaluator/engine/tests.rs @@ -1890,7 +1890,11 @@ fn test_evaluate_single_rule_pstring_match() { // Pascal string: length byte (5) followed by "Hello" let rule = MagicRule { offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { max_length: None }, + typ: TypeKind::PString { + max_length: None, + length_width: crate::parser::ast::PStringLengthWidth::OneByte, + length_includes_itself: false, + }, op: Operator::Equal, value: Value::String("Hello".to_string()), message: "Pascal string detected".to_string(), @@ -1912,7 +1916,11 @@ fn test_evaluate_single_rule_pstring_no_match() { // Pascal string in buffer is "Hello", rule expects "World" let rule = MagicRule { offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { max_length: None }, + typ: TypeKind::PString { + max_length: None, + length_width: crate::parser::ast::PStringLengthWidth::OneByte, + length_includes_itself: false, + }, op: Operator::Equal, value: Value::String("World".to_string()), message: "Should not match".to_string(), @@ -1935,7 +1943,11 @@ fn test_evaluate_single_rule_pstring_with_child_rule() { // Child: byte at offset 4 equals 0x02 let rule = MagicRule { offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { max_length: None }, + typ: TypeKind::PString { + max_length: None, + length_width: crate::parser::ast::PStringLengthWidth::OneByte, + length_includes_itself: false, + }, op: Operator::Equal, value: Value::String("ELF".to_string()), message: "Pascal ELF".to_string(), diff --git a/src/evaluator/strength.rs b/src/evaluator/strength.rs index 197fcb8e..11211239 100644 --- a/src/evaluator/strength.rs +++ b/src/evaluator/strength.rs @@ -71,7 +71,7 @@ pub fn calculate_default_strength(rule: &MagicRule) -> i32 { // Type contribution: more specific types get higher strength strength += match &rule.typ { // Strings are most specific (they match exact byte sequences) - TypeKind::String { max_length } | TypeKind::PString { max_length } => { + TypeKind::String { max_length } | TypeKind::PString { max_length, .. } => { // Base string strength let base = 20; // Add bonus for limited-length strings (more constrained match) diff --git a/src/evaluator/types/mod.rs b/src/evaluator/types/mod.rs index 4616983c..fa3e4783 100644 --- a/src/evaluator/types/mod.rs +++ b/src/evaluator/types/mod.rs @@ -40,6 +40,31 @@ pub enum TypeReadError { /// The name of the unsupported type. type_name: String, }, + /// Invalid pstring length prefix value (e.g., `/J` flag with stored length + /// smaller than the prefix width). + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::evaluator::types::TypeReadError; + /// let err = TypeReadError::InvalidPStringLength { + /// stored_length: 1, + /// prefix_width: 2, + /// }; + /// assert_eq!( + /// err.to_string(), + /// "Invalid pstring length prefix: stored length 1 is less than prefix width 2" + /// ); + /// ``` + #[error( + "Invalid pstring length prefix: stored length {stored_length} is less than prefix width {prefix_width}" + )] + InvalidPStringLength { + /// The length value stored in the pstring prefix. + stored_length: usize, + /// The byte width of the length prefix field. + prefix_width: usize, + }, } /// Reads bytes according to the specified `TypeKind`. @@ -81,7 +106,17 @@ pub fn read_typed_value( TypeKind::Date { endian, utc } => read_date(buffer, offset, *endian, *utc), TypeKind::QDate { endian, utc } => read_qdate(buffer, offset, *endian, *utc), TypeKind::String { max_length } => read_string(buffer, offset, *max_length), - TypeKind::PString { max_length } => read_pstring(buffer, offset, *max_length), + TypeKind::PString { + max_length, + length_width, + length_includes_itself, + } => read_pstring( + buffer, + offset, + *max_length, + *length_width, + *length_includes_itself, + ), } } diff --git a/src/evaluator/types/string.rs b/src/evaluator/types/string.rs index c273aed0..b239fbe6 100644 --- a/src/evaluator/types/string.rs +++ b/src/evaluator/types/string.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use super::TypeReadError; -use crate::parser::ast::Value; +use crate::parser::ast::{PStringLengthWidth, Value}; /// Safely reads a null-terminated string from the buffer at the specified offset. /// @@ -85,8 +85,9 @@ pub fn read_string( /// Reads a Pascal-style length-prefixed string from the buffer. /// -/// Pascal strings store the length as the first byte (0-255), followed by -/// that many bytes of string data. Unlike C strings, they are not null-terminated. +/// Pascal strings store the length prefix (1, 2, or 4 bytes depending on +/// `length_width`), followed by that many bytes of string data. Unlike C +/// strings, they are not null-terminated. /// /// # Arguments /// @@ -110,52 +111,116 @@ pub fn read_string( /// /// ``` /// use libmagic_rs::evaluator::types::read_pstring; -/// use libmagic_rs::parser::ast::Value; +/// use libmagic_rs::parser::ast::{Value, PStringLengthWidth}; /// /// let buffer = b"\x05Hello"; -/// let result = read_pstring(buffer, 0, None).unwrap(); +/// let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); +/// assert_eq!(result, Value::String("Hello".to_string())); +/// +/// let buffer = b"\x00\x05Hello"; +/// let result = read_pstring(buffer, 0, None, PStringLengthWidth::TwoByteBE, false).unwrap(); /// assert_eq!(result, Value::String("Hello".to_string())); /// -/// let buffer = b"\x0aHelloWorld"; -/// let result = read_pstring(buffer, 0, Some(5)).unwrap(); +/// let buffer = b"\x05\x00\x00\x00Hello"; +/// let result = read_pstring(buffer, 0, None, PStringLengthWidth::FourByteLE, false).unwrap(); /// assert_eq!(result, Value::String("Hello".to_string())); /// ``` /// /// # Errors /// /// Returns `TypeReadError::BufferOverrun` if: -/// - The offset is beyond buffer bounds (cannot read the length byte) -/// - The string data (length byte value) extends beyond the buffer +/// - The offset is beyond buffer bounds (cannot read the length prefix) +/// - The string data (length prefix value) extends beyond the buffer pub fn read_pstring( buffer: &[u8], offset: usize, max_length: Option, + length_width: PStringLengthWidth, + length_includes_itself: bool, ) -> Result { - // Check if we can read the length byte - let length_byte = *buffer.get(offset).ok_or(TypeReadError::BufferOverrun { - offset, - buffer_len: buffer.len(), - })?; - - let string_length = usize::from(length_byte); - - // Apply max_length limit if specified. - // NOTE: We intentionally validate bounds against actual_length (after capping), - // not against the raw length byte. This matches GNU file's behavior: if the - // length byte claims 10 bytes but max_length caps to 3 and 3+ bytes exist, - // the read succeeds. Validating against the raw length byte would reject - // valid magic rules where max_length is used precisely to handle truncated data. + let width = length_width.byte_count(); + // Check if we can read the length prefix (checked arithmetic to prevent overflow) + let prefix_end = offset + .checked_add(width) + .ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + let len_bytes = buffer + .get(offset..prefix_end) + .ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + let string_length = match length_width { + PStringLengthWidth::OneByte => { + usize::from(*len_bytes.first().ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?) + } + PStringLengthWidth::TwoByteBE => { + let arr: [u8; 2] = len_bytes + .try_into() + .map_err(|_| TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + usize::from(u16::from_be_bytes(arr)) + } + PStringLengthWidth::TwoByteLE => { + let arr: [u8; 2] = len_bytes + .try_into() + .map_err(|_| TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + usize::from(u16::from_le_bytes(arr)) + } + PStringLengthWidth::FourByteBE => { + let arr: [u8; 4] = len_bytes + .try_into() + .map_err(|_| TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + u32::from_be_bytes(arr) as usize + } + PStringLengthWidth::FourByteLE => { + let arr: [u8; 4] = len_bytes + .try_into() + .map_err(|_| TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + u32::from_le_bytes(arr) as usize + } + }; + + // /J flag: the stored length includes the prefix width itself + let string_length = if length_includes_itself { + string_length + .checked_sub(width) + .ok_or(TypeReadError::InvalidPStringLength { + stored_length: string_length, + prefix_width: width, + })? + } else { + string_length + }; + let actual_length = if let Some(max_len) = max_length { std::cmp::min(string_length, max_len) } else { string_length }; - // Check if we have enough bytes for the (possibly capped) string data - let string_start = offset.checked_add(1).ok_or(TypeReadError::BufferOverrun { - offset, - buffer_len: buffer.len(), - })?; + let string_start = offset + .checked_add(width) + .ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; let string_end = string_start .checked_add(actual_length) @@ -163,22 +228,110 @@ pub fn read_pstring( offset: usize::MAX, buffer_len: buffer.len(), })?; - if string_end > buffer.len() { return Err(TypeReadError::BufferOverrun { offset: string_end, buffer_len: buffer.len(), }); } - let string_bytes = &buffer[string_start..string_end]; let string_value = String::from_utf8_lossy(string_bytes).into_owned(); - Ok(Value::String(string_value)) } #[cfg(test)] mod tests { + use crate::parser::ast::PStringLengthWidth; + #[test] + fn test_read_pstring_one_byte_width() { + let result = read_pstring(b"\x03abc", 0, None, PStringLengthWidth::OneByte, false).unwrap(); + assert_eq!(result, Value::String("abc".to_string())); + + let result = + read_pstring(b"\x05Hello", 0, Some(3), PStringLengthWidth::OneByte, false).unwrap(); + assert_eq!(result, Value::String("Hel".to_string())); + } + + #[test] + fn test_read_pstring_two_byte_width() { + let result = read_pstring( + b"\x03\x00abc", + 0, + None, + PStringLengthWidth::TwoByteLE, + false, + ) + .unwrap(); + assert_eq!(result, Value::String("abc".to_string())); + + let result = read_pstring( + b"\x05\x00Hello", + 0, + Some(3), + PStringLengthWidth::TwoByteLE, + false, + ) + .unwrap(); + assert_eq!(result, Value::String("Hel".to_string())); + } + + #[test] + fn test_read_pstring_four_byte_width() { + let result = read_pstring( + b"\x03\x00\x00\x00abc", + 0, + None, + PStringLengthWidth::FourByteLE, + false, + ) + .unwrap(); + assert_eq!(result, Value::String("abc".to_string())); + + let result = read_pstring( + b"\x05\x00\x00\x00Hello", + 0, + Some(3), + PStringLengthWidth::FourByteLE, + false, + ) + .unwrap(); + assert_eq!(result, Value::String("Hel".to_string())); + } + + #[test] + fn test_read_pstring_buffer_overrun_widths() { + // Not enough bytes for length prefix + let cases: &[(&[u8], usize, PStringLengthWidth)] = &[ + (b"", 0, PStringLengthWidth::OneByte), + (b"\x01", 1, PStringLengthWidth::OneByte), + (b"\x01", 0, PStringLengthWidth::TwoByteLE), + (b"\x01\x00", 0, PStringLengthWidth::FourByteLE), + ]; + for &(buffer, offset, width) in cases { + let result = read_pstring(buffer, offset, None, width, false); + assert!( + matches!(result, Err(TypeReadError::BufferOverrun { .. })), + "Expected buffer overrun for buffer {buffer:?}, offset {offset}, width {width:?}" + ); + } + } + + #[test] + fn test_read_pstring_data_overrun_widths() { + // Enough for prefix, not enough for data + let cases: &[(&[u8], usize, PStringLengthWidth)] = &[ + (b"\x05ab", 0, PStringLengthWidth::OneByte), + (b"\x05\x00ab", 0, PStringLengthWidth::TwoByteLE), + (b"\x05\x00\x00\x00ab", 0, PStringLengthWidth::FourByteLE), + ]; + for &(buffer, offset, width) in cases { + let result = read_pstring(buffer, offset, None, width, false); + assert!( + matches!(result, Err(TypeReadError::BufferOverrun { .. })), + "Expected buffer overrun for buffer {buffer:?}, offset {offset}, width {width:?}" + ); + } + } use super::*; use crate::evaluator::types::read_typed_value; use crate::parser::ast::TypeKind; @@ -424,56 +577,57 @@ mod tests { #[test] fn test_read_pstring_basic() { let buffer = b"\x05Hello"; - let result = read_pstring(buffer, 0, None).unwrap(); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("Hello".to_string())); } #[test] fn test_read_pstring_at_offset() { let buffer = b"PREFIX\x03Foo"; - let result = read_pstring(buffer, 6, None).unwrap(); + let result = read_pstring(buffer, 6, None, PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("Foo".to_string())); } #[test] fn test_read_pstring_empty_string() { let buffer = b"\x00trailing"; - let result = read_pstring(buffer, 0, None).unwrap(); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String(String::new())); } #[test] fn test_read_pstring_single_char() { let buffer = b"\x01A"; - let result = read_pstring(buffer, 0, None).unwrap(); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("A".to_string())); } #[test] fn test_read_pstring_max_length_limits() { let buffer = b"\x0aHelloWorld"; - let result = read_pstring(buffer, 0, Some(5)).unwrap(); + let result = read_pstring(buffer, 0, Some(5), PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("Hello".to_string())); } #[test] fn test_read_pstring_max_length_larger_than_prefix() { let buffer = b"\x03Foo"; - let result = read_pstring(buffer, 0, Some(100)).unwrap(); + let result = + read_pstring(buffer, 0, Some(100), PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("Foo".to_string())); } #[test] fn test_read_pstring_max_length_zero() { let buffer = b"\x05Hello"; - let result = read_pstring(buffer, 0, Some(0)).unwrap(); + let result = read_pstring(buffer, 0, Some(0), PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String(String::new())); } #[test] fn test_read_pstring_buffer_overrun_offset_past_end() { let buffer = b"Hello"; - let result = read_pstring(buffer, 10, None); + let result = read_pstring(buffer, 10, None, PStringLengthWidth::OneByte, false); assert_eq!( result.unwrap_err(), TypeReadError::BufferOverrun { @@ -486,7 +640,7 @@ mod tests { #[test] fn test_read_pstring_buffer_overrun_empty_buffer() { let buffer = b""; - let result = read_pstring(buffer, 0, None); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false); assert_eq!( result.unwrap_err(), TypeReadError::BufferOverrun { @@ -500,7 +654,7 @@ mod tests { fn test_read_pstring_buffer_overrun_length_exceeds_data() { // Length byte says 10 but only 3 bytes follow let buffer = b"\x0aFoo"; - let result = read_pstring(buffer, 0, None); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false); assert_eq!( result.unwrap_err(), TypeReadError::BufferOverrun { @@ -514,7 +668,7 @@ mod tests { fn test_read_pstring_length_byte_only() { // Buffer has length byte but no string data, and length > 0 let buffer = b"\x05"; - let result = read_pstring(buffer, 0, None); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false); assert_eq!( result.unwrap_err(), TypeReadError::BufferOverrun { @@ -527,7 +681,7 @@ mod tests { #[test] fn test_read_pstring_offset_overflow() { let buffer = b"\x05Hello"; - let result = read_pstring(buffer, usize::MAX, None); + let result = read_pstring(buffer, usize::MAX, None, PStringLengthWidth::OneByte, false); assert_eq!( result.unwrap_err(), TypeReadError::BufferOverrun { @@ -541,7 +695,7 @@ mod tests { fn test_read_pstring_max_length_caps_when_buffer_short() { // Length byte says 10, only 5 data bytes follow, but max_length=5 caps the read let buffer = b"\x0aHello"; - let result = read_pstring(buffer, 0, Some(5)).unwrap(); + let result = read_pstring(buffer, 0, Some(5), PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("Hello".to_string())); } @@ -549,14 +703,14 @@ mod tests { fn test_read_pstring_utf8_valid() { // "Café" in UTF-8 is 5 bytes: 43 61 66 c3 a9 let buffer = b"\x05Caf\xc3\xa9"; - let result = read_pstring(buffer, 0, None).unwrap(); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("Café".to_string())); } #[test] fn test_read_pstring_utf8_invalid() { let buffer = b"\x03\xff\xfe\xfd"; - let result = read_pstring(buffer, 0, None).unwrap(); + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); if let Value::String(s) = result { assert!(s.contains('\u{FFFD}')); } else { @@ -569,16 +723,21 @@ mod tests { // Length byte = 255, with exactly 255 bytes of data let mut buffer = vec![0xFF]; buffer.extend(std::iter::repeat_n(b'A', 255)); - let result = read_pstring(&buffer, 0, None).unwrap(); + let result = read_pstring(&buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); assert_eq!(result, Value::String("A".repeat(255))); } #[test] fn test_read_pstring_consistency_with_typed_value() { let buffer = b"\x04Test"; - let direct_result = read_pstring(buffer, 0, None).unwrap(); + let direct_result = + read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, false).unwrap(); - let type_kind = TypeKind::PString { max_length: None }; + let type_kind = TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }; let typed_result = read_typed_value(buffer, 0, &type_kind).unwrap(); assert_eq!(direct_result, typed_result); @@ -588,14 +747,133 @@ mod tests { #[test] fn test_read_pstring_consistency_with_max_length() { let buffer = b"\x0aLongString"; - let direct_result = read_pstring(buffer, 0, Some(4)).unwrap(); + let direct_result = + read_pstring(buffer, 0, Some(4), PStringLengthWidth::OneByte, false).unwrap(); let type_kind = TypeKind::PString { max_length: Some(4), + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, }; let typed_result = read_typed_value(buffer, 0, &type_kind).unwrap(); assert_eq!(direct_result, typed_result); assert_eq!(typed_result, Value::String("Long".to_string())); } + + #[test] + fn test_read_pstring_big_endian() { + let cases: &[(&[u8], PStringLengthWidth, &str)] = &[ + // 2-byte BE: length 3 in big-endian = [0x00, 0x03] + (b"\x00\x03abc", PStringLengthWidth::TwoByteBE, "abc"), + // 4-byte BE: length 5 in big-endian = [0x00, 0x00, 0x00, 0x05] + ( + b"\x00\x00\x00\x05Hello", + PStringLengthWidth::FourByteBE, + "Hello", + ), + ]; + for &(buffer, width, expected) in cases { + let result = read_pstring(buffer, 0, None, width, false); + assert_eq!( + result.unwrap(), + Value::String(expected.to_string()), + "Failed for width {width:?}" + ); + } + } + + #[test] + fn test_read_pstring_length_includes_itself() { + let cases: &[(&[u8], PStringLengthWidth, bool, &str)] = &[ + // 1-byte /J: stored length=4, minus 1 byte prefix = 3 bytes of data + (b"\x04abc", PStringLengthWidth::OneByte, true, "abc"), + // 2-byte LE /J: stored length=5, minus 2 byte prefix = 3 bytes of data + (b"\x05\x00abc", PStringLengthWidth::TwoByteLE, true, "abc"), + // 2-byte BE /J: stored length=7, minus 2 byte prefix = 5 bytes of data + ( + b"\x00\x07Hello", + PStringLengthWidth::TwoByteBE, + true, + "Hello", + ), + // 4-byte LE /J: stored length=9, minus 4 byte prefix = 5 bytes of data + ( + b"\x09\x00\x00\x00Hello", + PStringLengthWidth::FourByteLE, + true, + "Hello", + ), + // 4-byte BE /J: stored length=7, minus 4 byte prefix = 3 bytes of data + ( + b"\x00\x00\x00\x07abc", + PStringLengthWidth::FourByteBE, + true, + "abc", + ), + ]; + for &(buffer, width, includes_itself, expected) in cases { + let result = read_pstring(buffer, 0, None, width, includes_itself); + assert_eq!( + result.unwrap(), + Value::String(expected.to_string()), + "Failed for width {width:?}, includes_itself={includes_itself}" + ); + } + } + + #[test] + fn test_read_pstring_j_flag_length_equals_prefix_width() { + // /J where length exactly equals prefix width -> empty string + let buffer = b"\x01"; + let result = read_pstring(buffer, 0, None, PStringLengthWidth::OneByte, true).unwrap(); + assert_eq!(result, Value::String(String::new())); + } + + #[test] + fn test_read_pstring_j_flag_length_less_than_prefix_width() { + // /J where length < prefix width -> InvalidPStringLength error + // LE bytes [0x01, 0x00] = stored length 1, but prefix width is 2 + let buffer = b"\x01\x00xx"; + let result = read_pstring(buffer, 0, None, PStringLengthWidth::TwoByteLE, true); + assert!( + matches!( + result, + Err(TypeReadError::InvalidPStringLength { + stored_length: 1, + prefix_width: 2 + }) + ), + "Expected InvalidPStringLength, got {result:?}" + ); + } + + #[test] + fn test_read_pstring_j_flag_with_max_length() { + // /J + max_length interaction: subtract prefix width first, then cap + // stored length=9, width=4, /J gives 5, max_length=3 caps to 3 + let buffer = b"\x09\x00\x00\x00Hello"; + let result = read_pstring(buffer, 0, Some(3), PStringLengthWidth::FourByteLE, true); + assert_eq!(result.unwrap(), Value::String("Hel".to_string())); + } + + #[test] + fn test_read_pstring_j_flag_zero_length_all_widths() { + // /J where stored length equals prefix width -> empty string + let cases: &[(&[u8], PStringLengthWidth)] = &[ + (b"\x01", PStringLengthWidth::OneByte), + (b"\x00\x02", PStringLengthWidth::TwoByteBE), + (b"\x02\x00", PStringLengthWidth::TwoByteLE), + (b"\x00\x00\x00\x04", PStringLengthWidth::FourByteBE), + (b"\x04\x00\x00\x00", PStringLengthWidth::FourByteLE), + ]; + for &(buffer, width) in cases { + let result = read_pstring(buffer, 0, None, width, true); + assert_eq!( + result.unwrap(), + Value::String(String::new()), + "Expected empty string for /J with width {width:?}" + ); + } + } } diff --git a/src/evaluator/types/tests.rs b/src/evaluator/types/tests.rs index 75cf06f4..1ffc5288 100644 --- a/src/evaluator/types/tests.rs +++ b/src/evaluator/types/tests.rs @@ -475,7 +475,7 @@ fn test_read_typed_value_empty_buffer() { assert_eq!(offset, 0); assert_eq!(buffer_len, 0); } - TypeReadError::UnsupportedType { .. } => panic!("Expected BufferOverrun error"), + other => panic!("Expected BufferOverrun error, got {other:?}"), } } } diff --git a/src/lib.rs b/src/lib.rs index dcdf3f67..60a62a89 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -124,7 +124,8 @@ pub mod build_helpers; // Re-export core AST types for convenience pub use parser::ast::{ - Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value, + Endianness, MagicRule, OffsetSpec, Operator, PStringLengthWidth, StrengthModifier, TypeKind, + Value, }; // Re-export evaluator types for convenience diff --git a/src/parser/ast.rs b/src/parser/ast.rs index 894321b9..9ea82403 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -8,6 +8,92 @@ use serde::{Deserialize, Serialize}; +/// The width of the length prefix for Pascal strings. +/// +/// Uppercase suffix letters (`/H`, `/L`) indicate big-endian byte order. +/// Lowercase suffix letters (`/h`, `/l`) indicate little-endian byte order. +/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::parser::ast::PStringLengthWidth; +/// let width = PStringLengthWidth::OneByte; +/// assert_eq!(width.byte_count(), 1); +/// +/// let width = PStringLengthWidth::TwoByteBE; +/// assert_eq!(width.byte_count(), 2); +/// +/// let width = PStringLengthWidth::FourByteLE; +/// assert_eq!(width.byte_count(), 4); +/// ``` +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[allow(clippy::enum_variant_names)] +#[non_exhaustive] +pub enum PStringLengthWidth { + /// 1-byte length prefix (default, `/B` suffix) + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::PStringLengthWidth; + /// let width = PStringLengthWidth::OneByte; + /// assert_eq!(width.byte_count(), 1); + /// ``` + OneByte, + /// 2-byte big-endian length prefix (`/H` suffix) + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::PStringLengthWidth; + /// let width = PStringLengthWidth::TwoByteBE; + /// assert_eq!(width.byte_count(), 2); + /// ``` + TwoByteBE, + /// 2-byte little-endian length prefix (`/h` suffix) + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::PStringLengthWidth; + /// let width = PStringLengthWidth::TwoByteLE; + /// assert_eq!(width.byte_count(), 2); + /// ``` + TwoByteLE, + /// 4-byte big-endian length prefix (`/L` suffix) + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::PStringLengthWidth; + /// let width = PStringLengthWidth::FourByteBE; + /// assert_eq!(width.byte_count(), 4); + /// ``` + FourByteBE, + /// 4-byte little-endian length prefix (`/l` suffix) + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::PStringLengthWidth; + /// let width = PStringLengthWidth::FourByteLE; + /// assert_eq!(width.byte_count(), 4); + /// ``` + FourByteLE, +} + +impl PStringLengthWidth { + /// Returns the number of bytes used for the length prefix. + #[must_use] + pub fn byte_count(&self) -> usize { + match self { + Self::OneByte => 1, + Self::TwoByteBE | Self::TwoByteLE => 2, + Self::FourByteBE | Self::FourByteLE => 4, + } + } +} + /// Offset specification for locating data in files #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum OffsetSpec { @@ -180,26 +266,33 @@ pub enum TypeKind { /// Maximum length to read max_length: Option, }, - /// Pascal string (length-prefixed byte followed by string data) + /// Pascal string (length-prefixed, supports 1/2/4-byte prefix, with optional max length) /// - /// Pascal strings store the length as the first byte (0-255), followed by - /// that many bytes of string data. Unlike C strings, they are not - /// null-terminated. + /// Pascal strings store the length as a prefix (1, 2, or 4 bytes, with configurable endianness), followed by + /// that many bytes of string data. Unlike C strings, they are not null-terminated. /// /// # Examples /// /// ``` - /// use libmagic_rs::parser::ast::TypeKind; + /// use libmagic_rs::parser::ast::{TypeKind, PStringLengthWidth}; + /// + /// let pstring = TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false }; + /// assert_eq!(pstring, TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false }); /// - /// let pstring = TypeKind::PString { max_length: None }; - /// assert_eq!(pstring, TypeKind::PString { max_length: None }); + /// let limited = TypeKind::PString { max_length: Some(64), length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: false }; + /// assert_eq!(limited, TypeKind::PString { max_length: Some(64), length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: false }); /// - /// let limited = TypeKind::PString { max_length: Some(64) }; - /// assert_eq!(limited, TypeKind::PString { max_length: Some(64) }); + /// // /J flag: stored length includes the length field itself + /// let jpeg = TypeKind::PString { max_length: None, length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: true }; + /// assert_eq!(jpeg, TypeKind::PString { max_length: None, length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: true }); /// ``` PString { - /// Maximum length to read (caps the length byte value) + /// Maximum length to read (caps the length value) max_length: Option, + /// Width of the length prefix + length_width: PStringLengthWidth, + /// Whether the stored length includes the length field itself (`/J` flag) + length_includes_itself: bool, }, } @@ -885,9 +978,25 @@ mod tests { TypeKind::String { max_length: Some(128), }, - TypeKind::PString { max_length: None }, + TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }, TypeKind::PString { max_length: Some(64), + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }, + TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::TwoByteBE, + length_includes_itself: true, + }, + TypeKind::PString { + max_length: Some(128), + length_width: PStringLengthWidth::FourByteLE, + length_includes_itself: false, }, ]; diff --git a/src/parser/codegen.rs b/src/parser/codegen.rs index 922af9ae..bf36be8f 100644 --- a/src/parser/codegen.rs +++ b/src/parser/codegen.rs @@ -11,7 +11,10 @@ //! The generated code creates `MagicRule` struct literals that are compiled into the //! binary as built-in rules. -use super::ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value}; +use super::ast::{ + Endianness, MagicRule, OffsetSpec, Operator, PStringLengthWidth, StrengthModifier, TypeKind, + Value, +}; const INDENT_WIDTH: usize = 4; @@ -26,7 +29,7 @@ pub fn generate_builtin_rules(rules: &[MagicRule]) -> String { push_line(&mut output, "#[allow(unused_imports)]"); push_line( &mut output, - "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier};", + "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier, PStringLengthWidth};", ); push_line(&mut output, "use std::sync::LazyLock;"); push_line(&mut output, ""); @@ -211,15 +214,38 @@ pub fn serialize_type_kind(typ: &TypeKind) -> String { } None => "TypeKind::String { max_length: None }".to_string(), }, - TypeKind::PString { max_length } => match max_length { + TypeKind::PString { + max_length, + length_width, + length_includes_itself, + } => match max_length { Some(value) => { - format!("TypeKind::PString {{ max_length: Some({value}) }}") + format!( + "TypeKind::PString {{ max_length: Some({value}), length_width: {}, length_includes_itself: {} }}", + serialize_pstring_length_width(*length_width), + length_includes_itself + ) } - None => "TypeKind::PString { max_length: None }".to_string(), + None => format!( + "TypeKind::PString {{ max_length: None, length_width: {}, length_includes_itself: {} }}", + serialize_pstring_length_width(*length_width), + length_includes_itself + ), }, } } +/// Serialize a `PStringLengthWidth` as a Rust path +pub fn serialize_pstring_length_width(width: PStringLengthWidth) -> &'static str { + match width { + PStringLengthWidth::OneByte => "PStringLengthWidth::OneByte", + PStringLengthWidth::TwoByteBE => "PStringLengthWidth::TwoByteBE", + PStringLengthWidth::TwoByteLE => "PStringLengthWidth::TwoByteLE", + PStringLengthWidth::FourByteBE => "PStringLengthWidth::FourByteBE", + PStringLengthWidth::FourByteLE => "PStringLengthWidth::FourByteLE", + } +} + /// Serialize an operator as a Rust expression pub fn serialize_operator(op: &Operator) -> String { match op { diff --git a/src/parser/grammar/mod.rs b/src/parser/grammar/mod.rs index e6e187c5..ca8074a8 100644 --- a/src/parser/grammar/mod.rs +++ b/src/parser/grammar/mod.rs @@ -657,6 +657,51 @@ pub fn parse_value(input: &str) -> IResult<&str, Value> { Ok((input, value)) } +/// Parse pstring suffix flags after the `/` character. +/// +/// Recognizes width characters (`B`, `H`, `h`, `L`, `l`) and the optional `J` +/// modifier that indicates the stored length includes the length field itself. +/// +/// Returns `Ok((remaining_input, width, length_includes_itself))` on success, +/// or `Err` if an unrecognized suffix character is found. +fn parse_pstring_suffix( + input: &str, +) -> Result<(&str, crate::parser::ast::PStringLengthWidth, bool), nom::Err>> +{ + use crate::parser::ast::PStringLengthWidth; + + // Parse width character + let (rest, width) = if let Some(rest) = input.strip_prefix('B') { + (rest, PStringLengthWidth::OneByte) + } else if let Some(rest) = input.strip_prefix('H') { + (rest, PStringLengthWidth::TwoByteBE) + } else if let Some(rest) = input.strip_prefix('h') { + (rest, PStringLengthWidth::TwoByteLE) + } else if let Some(rest) = input.strip_prefix('L') { + (rest, PStringLengthWidth::FourByteBE) + } else if let Some(rest) = input.strip_prefix('l') { + (rest, PStringLengthWidth::FourByteLE) + } else if let Some(rest) = input.strip_prefix('J') { + // Bare /J with no width = default OneByte + self-inclusive + return Ok((rest, PStringLengthWidth::OneByte, true)); + } else { + // Unrecognized suffix character after '/' + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::OneOf, + ))); + }; + + // Parse optional J flag after width character + let (rest, includes_j) = if let Some(rest) = rest.strip_prefix('J') { + (rest, true) + } else { + (rest, false) + }; + + Ok((rest, width, includes_j)) +} + /// Parse a type specification with an optional attached bitwise-AND mask operator /// (e.g., `lelong&0xf0000000`). /// @@ -681,9 +726,23 @@ pub fn parse_value(input: &str) -> IResult<&str, Value> { /// # Errors /// Returns a nom parsing error if the input doesn't match the expected format pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option)> { + use crate::parser::ast::PStringLengthWidth; + let (input, _) = multispace0(input)?; - let (input, type_name) = crate::parser::types::parse_type_keyword(input)?; + let (mut input, type_name) = crate::parser::types::parse_type_keyword(input)?; + + // Handle pstring suffixes: /B, /H, /h, /L, /l, and optional /J modifier + let mut pstring_length_width = PStringLengthWidth::OneByte; + let mut pstring_length_includes_itself = false; + if type_name == "pstring" + && let Some(suffix_rest) = input.strip_prefix('/') + { + let (rest, width, includes_j) = parse_pstring_suffix(suffix_rest)?; + input = rest; + pstring_length_width = width; + pstring_length_includes_itself = includes_j; + } // Check for attached operator with mask (like &0xf0000000) // Uses unsigned parsing so full u64 masks (e.g. 0xffffffffffffffff) are supported. @@ -712,7 +771,15 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option IResult<&str, (TypeKind, Option { + assert_eq!(max_length, None, "max_length for input: {input}"); + assert_eq!( + length_width, expected_width, + "length_width for input: {input}" + ); + assert_eq!( + length_includes_itself, expected_j, + "length_includes_itself for input: {input}" + ); + } + _ => panic!("Expected PString for input: {input}, got {kind:?}"), + } + } +} diff --git a/src/parser/types.rs b/src/parser/types.rs index e24f23ed..fd3b64ab 100644 --- a/src/parser/types.rs +++ b/src/parser/types.rs @@ -11,7 +11,7 @@ use nom::{IResult, Parser, branch::alt, bytes::complete::tag}; -use crate::parser::ast::{Endianness, TypeKind}; +use crate::parser::ast::{Endianness, PStringLengthWidth, TypeKind}; /// Parse a type keyword from magic file input /// @@ -294,10 +294,12 @@ pub fn type_keyword_to_kind(type_name: &str) -> TypeKind { // STRING types "string" => TypeKind::String { max_length: None }, - // NOTE: GNU libmagic also supports pstring/B (1-byte, default), pstring/H - // (2-byte LE), and pstring/L (4-byte LE) length-prefix variants. Only the - // default 1-byte prefix is supported here; /B, /H, /L suffixes are not yet parsed. - "pstring" => TypeKind::PString { max_length: None }, + // Default to 1-byte prefix; suffix parsing handled in grammar/mod.rs + "pstring" => TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }, _ => unreachable!("type_keyword_to_kind called with unknown type: {type_name}"), } @@ -475,10 +477,66 @@ mod tests { fn test_type_keyword_to_kind_pstring() { assert_eq!( type_keyword_to_kind("pstring"), - TypeKind::PString { max_length: None } + TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false + } + ); + } + + #[test] + fn test_pstring_keyword_defaults_to_one_byte_width() { + // pstring keyword alone should produce OneByte length_width + // (suffix parsing is handled by grammar/mod.rs, not types.rs) + let kind = type_keyword_to_kind("pstring"); + match kind { + TypeKind::PString { + max_length, + length_width, + length_includes_itself: _, + } => { + assert_eq!( + max_length, None, + "pstring default should have no max_length" + ); + assert_eq!( + length_width, + PStringLengthWidth::OneByte, + "pstring default should be OneByte" + ); + } + _ => panic!("Expected TypeKind::PString, got {kind:?}"), + } + } + + #[test] + fn test_pstring_keyword_does_not_consume_suffix() { + // parse_type_keyword should only consume "pstring", leaving suffix for grammar + let (rest, keyword) = parse_type_keyword("pstring/H =value").unwrap(); + assert_eq!(keyword, "pstring"); + assert_eq!( + rest, "/H =value", + "Suffix should remain unconsumed by type keyword parser" ); } + #[test] + fn test_pstring_keyword_boundary() { + // pstring at exact boundary (no trailing input) + let (rest, keyword) = parse_type_keyword("pstring").unwrap(); + assert_eq!(keyword, "pstring"); + assert_eq!(rest, ""); + } + + #[test] + fn test_pstring_before_operator() { + // pstring followed by whitespace then operator + let (rest, keyword) = parse_type_keyword("pstring =hello").unwrap(); + assert_eq!(keyword, "pstring"); + assert_eq!(rest, " =hello"); + } + #[test] fn test_roundtrip_all_keywords() { // Verify that every keyword parsed by parse_type_keyword can be diff --git a/tests/evaluator_tests.rs b/tests/evaluator_tests.rs index 443fec6f..649373de 100644 --- a/tests/evaluator_tests.rs +++ b/tests/evaluator_tests.rs @@ -8,6 +8,7 @@ //! function for type-specific evaluation scenarios. use libmagic_rs::evaluator::evaluate_rules; +use libmagic_rs::parser::ast::PStringLengthWidth; use libmagic_rs::{ Endianness, EvaluationConfig, EvaluationContext, MagicDatabase, MagicRule, OffsetSpec, Operator, TypeKind, Value, @@ -360,7 +361,11 @@ fn test_evaluate_pstring_rule_match() { // Pascal string: length byte (3) followed by "PDF" let rule = MagicRule { offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { max_length: None }, + typ: TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }, op: Operator::Equal, value: Value::String("PDF".to_string()), message: "Pascal PDF marker".to_string(), @@ -381,7 +386,11 @@ fn test_evaluate_pstring_rule_match() { fn test_evaluate_pstring_rule_no_match() { let rule = MagicRule { offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { max_length: None }, + typ: TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, + }, op: Operator::Equal, value: Value::String("ZIP".to_string()), message: "Should not match".to_string(), @@ -407,6 +416,8 @@ fn test_evaluate_pstring_with_max_length() { offset: OffsetSpec::Absolute(0), typ: TypeKind::PString { max_length: Some(3), + length_width: PStringLengthWidth::OneByte, + length_includes_itself: false, }, op: Operator::Equal, value: Value::String("Hel".to_string()), @@ -429,6 +440,36 @@ fn test_evaluate_pstring_with_max_length() { ); } +#[test] +fn test_evaluate_pstring_two_byte_be_with_j_flag() { + // 2-byte big-endian prefix with /J: stored length=7 includes the 2-byte prefix, so string is 5 bytes + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::PString { + max_length: None, + length_width: PStringLengthWidth::TwoByteBE, + length_includes_itself: true, + }, + op: Operator::Equal, + value: Value::String("Hello".to_string()), + message: "BE pstring with /J flag".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + // Big-endian length 7 = [0x00, 0x07], minus 2-byte prefix = 5 bytes of "Hello" + let buffer: &[u8] = &[0x00, 0x07, b'H', b'e', b'l', b'l', b'o']; + let config = EvaluationConfig::default(); + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[rule], buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "PString/HJ rule should match 2-byte BE prefix with self-inclusive length" + ); +} + #[test] fn test_evaluate_float_rule_no_match() { // Buffer contains 1.0f32 LE, rule expects == 2.0 -- should NOT match diff --git a/tests/property_tests.rs b/tests/property_tests.rs index dfa9d23d..55f26e3a 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -11,6 +11,7 @@ use proptest::prelude::*; +use libmagic_rs::parser::ast::PStringLengthWidth; use libmagic_rs::{ Endianness, EvaluationConfig, MagicDatabase, MagicRule, OffsetSpec, Operator, TypeKind, Value, }; @@ -48,9 +49,22 @@ fn arb_type_kind() -> impl Strategy { (0usize..256usize).prop_map(|len| TypeKind::String { max_length: Some(len), }), - (0usize..256usize).prop_map(|len| TypeKind::PString { - max_length: Some(len), - }), + ( + 0usize..256usize, + prop_oneof![ + Just(PStringLengthWidth::OneByte), + Just(PStringLengthWidth::TwoByteBE), + Just(PStringLengthWidth::TwoByteLE), + Just(PStringLengthWidth::FourByteBE), + Just(PStringLengthWidth::FourByteLE), + ], + any::(), + ) + .prop_map(|(len, width, includes_self)| TypeKind::PString { + max_length: Some(len), + length_width: width, + length_includes_itself: includes_self, + }), ] }