diff --git a/.serena/project.yml b/.serena/project.yml index d88a6623..608f2cfd 100644 --- a/.serena/project.yml +++ b/.serena/project.yml @@ -1,13 +1,15 @@ + + # list of languages for which language servers are started; choose from: # al bash clojure cpp csharp # csharp_omnisharp dart elixir elm erlang # fortran fsharp go groovy haskell # java julia kotlin lua markdown # matlab nix pascal perl php -# powershell python python_jedi r rego -# ruby ruby_solargraph rust scala swift -# terraform toml typescript typescript_vts vue -# yaml zig +# php_phpactor powershell python python_jedi r +# rego ruby ruby_solargraph rust scala +# swift terraform toml typescript typescript_vts +# vue yaml zig # (This list may be outdated. For the current list, see values of Language enum here: # https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py # For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.) @@ -16,8 +18,8 @@ # - For JavaScript, use typescript # - For Free Pascal/Lazarus, use pascal # Special requirements: -# - csharp: Requires the presence of a .sln file in the project folder. -# - pascal: Requires Free Pascal Compiler (fpc) and optionally Lazarus. +# Some languages require additional setup/installations. +# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers # When using multiple languages, the first language server that supports a given file will be used for that file. # The first language is the default language and the respective language server will be used as a fallback. # Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored. 
@@ -31,8 +33,9 @@ encoding: "utf-8" # whether to use project's .gitignore files to ignore files ignore_all_files_in_gitignore: true -# list of additional paths to ignore in all projects -# same syntax as gitignore, so you can use * and ** +# list of additional paths to ignore in this project. +# Same syntax as gitignore, so you can use * and **. +# Note: global ignored_paths from serena_config.yml are also applied additively. ignored_paths: [] # whether the project is in read-only mode @@ -40,7 +43,9 @@ ignored_paths: [] # Added on 2025-04-18 read_only: false -# list of tool names to exclude. We recommend not excluding any tools, see the readme for more details. +# list of tool names to exclude. +# This extends the existing exclusions (e.g. from the global configuration) +# # Below is the complete list of tools for convenience. # To make sure you have the latest list of tools, and to view their descriptions, # execute `uv run scripts/print_tool_overview.py`. @@ -87,7 +92,8 @@ initial_prompt: "" # the name by which the project can be referenced within Serena project_name: "libmagic-rs" -# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default) +# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default). +# This extends the existing inclusions (e.g. from the global configuration). included_optional_tools: [] # list of mode names to that are always to be included in the set of active modes @@ -108,3 +114,39 @@ default_modes: # fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools. # This cannot be combined with non-empty excluded_tools or included_optional_tools. fixed_tools: [] + +# time budget (seconds) per tool call for the retrieval of additional symbol information +# such as docstrings or parameter information. 
+# This overrides the corresponding setting in the global configuration; see the documentation there. +# If null or missing, use the setting from the global configuration. +symbol_info_budget: + +# The language backend to use for this project. +# If not set, the global setting from serena_config.yml is used. +# Valid values: LSP, JetBrains +# Note: the backend is fixed at startup. If a project with a different backend +# is activated post-init, an error will be returned. +language_backend: + +# line ending convention to use when writing source files. +# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default) +# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings. +line_ending: + +# list of regex patterns which, when matched, mark a memory entry as read‑only. +# Extends the list from the global configuration, merging the two lists. +read_only_memory_patterns: [] + +# list of regex patterns for memories to completely ignore. +# Matching memories will not appear in list_memories or activate_project output +# and cannot be accessed via read_memory or write_memory. +# To access ignored memory files, use the read_file tool on the raw file path. +# Extends the list from the global configuration, merging the two lists. +# Example: ["_archive/.*", "_episodes/.*"] +ignored_memory_patterns: [] + +# advanced configuration option allowing to configure language server-specific options. +# Maps the language key to the options. +# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available. +# No documentation on options means no options are available. 
+ls_specific_settings: {} diff --git a/AGENTS.md b/AGENTS.md index 1e0fc93f..3328ad90 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -209,32 +209,19 @@ cargo test --doc # Test documentation examples - **Operators**: `=` (equal), `!=` (not equal), `<` (less than), `>` (greater than), `<=` (less equal), `>=` (greater equal), `&` (bitwise AND with optional mask), `^` (bitwise XOR), `~` (bitwise NOT), `x` (any value) - **Nested Rules**: Hierarchical rule evaluation with proper indentation - **String Matching**: Exact string matching with null-termination and Pascal string (length-prefixed) support +- **Regex type**: Binary-safe regex matching via `regex::bytes::Regex`. Full flag support: `/c` (case-insensitive), `/s` (anchor advances to match-start instead of match-end), `/l` (scan window is measured in lines instead of bytes). Flags combine in any order (`regex/cs`, `regex/csl`, `regex/lc`). Numeric counts are honored: `regex/100` scans at most 100 bytes; `regex/1l` scans at most 1 line. Multi-line regex matching is always on (matching libmagic's unconditional `REG_NEWLINE`), so `^` and `$` match at line boundaries regardless of `/l`. Every scan window is capped at 8192 bytes (`FILE_REGEX_MAX`) regardless of the user's count. +- **Search type**: Bounded literal pattern scan via `memchr::memmem::find`; `search/N` caps the scan window to `N` bytes from the offset. The range is **mandatory** and stored as `NonZeroUsize`, so bare `search` and `search/0` are parse errors (matching GNU `file` magic(5)). Anchor advance follows GNU `file` semantics (match-end, not window-end) so relative-offset children resolve to the byte immediately after the matched pattern. ### Planned Features (v1.0+) -- Regex type: Pattern matching with binary-safe regex support -- Search type: Multi-pattern string searching - -### Future Enhancement: Binary-Safe Regex Handling - -> **Note:** The following is planned for future releases and is not yet implemented. 
- -```rust -// Use regex crate with bytes feature for binary-safe matching -pub trait BinaryRegex { - fn find_at(&self, haystack: &[u8], start: usize) -> Option; -} - -impl BinaryRegex for regex::bytes::Regex { - /* ... */ -} -``` +- Aho-Corasick multi-pattern search optimization for `search/` rules. +- `!:mime`/`!:ext`/`!:apple` directive evaluation (currently only `!:strength` is parsed). +- `use`/`name` named test directives for rule reuse. ## Current Limitations (v0.1.0) ### Type System -- No regex/search pattern matching - 64-bit integer types: `quad`/`uquad`, `bequad`/`ubequad`, `lequad`/`ulequad` are implemented; `qquad` (128-bit) is not yet supported - String evaluation reads until first NUL or end-of-buffer by default; `pstring` reads a length-prefixed Pascal string; `max_length: Some(_)` is supported internally but no dedicated fixed-length string parser syntax exists yet - `pstring` supports 1-byte (`/B`), 2-byte big-endian (`/H`), 2-byte little-endian (`/h`), 4-byte big-endian (`/L`), and 4-byte little-endian (`/l`) length prefixes, plus the `/J` flag (stored length includes prefix width). All flags are combinable (e.g., `pstring/HJ`) and fully implemented. @@ -317,7 +304,7 @@ sample.bin: ELF 64-bit LSB executable, x86-64, version 1 (SYSV) ### Adding New Type Support -> **Note:** Currently implemented types are `Byte`, `Short`, `Long`, `Quad`, `Float`, `Double`, `Date`, `QDate`, `String`, and `PString`. Regex and search types are planned for future releases. +> **Note:** Currently implemented types are `Byte`, `Short`, `Long`, `Quad`, `Float`, `Double`, `Date`, `QDate`, `String`, `PString`, `Regex`, and `Search`. See "Current Limitations" for the remaining gaps in regex/search flag coverage. 1. Extend `TypeKind` enum in `src/parser/ast.rs` 2. Add keyword parsing in `src/parser/types.rs` (`parse_type_keyword` and `type_keyword_to_kind`) @@ -464,14 +451,15 @@ CI must pass before merge. Mergify merge protections enforce these checks. 
Bot P - `nom`: Parser combinators - `serde`: Serialization - `clap`: CLI argument parsing -- `regex`: Pattern matching (used in tests; regex *type* for magic rules is planned) +- `regex`: Binary-safe pattern matching via `regex::bytes::Regex` for `TypeKind::Regex` evaluation +- `memchr`: SIMD-accelerated literal pattern search, used for `TypeKind::Search` - `aho-corasick`: Multi-pattern search (planned, not yet added) ### Development Phases 1. **MVP (v0.1.0)** - CURRENT: Basic parsing and evaluation with byte/short/long/quad/string types, equality and bitwise AND operators, built-in rules for 10 common formats 2. **Enhanced Features (v0.2)**: Comparison operators (`>`, `<`), indirect offset improvements, strength-based rule ordering -3. **Advanced Types (v0.3)**: Regex type, search patterns +3. **Advanced Types (v0.3)**: Regex flag completeness (`/s`, proper `/l` line-count semantics, `regex/Nl`), search range enforcement, 8192-byte default regex range 4. **Full Compatibility (v0.4)**: Complete libmagic syntax support, all special directives, named tests 5. 
**Production Ready (v1.0)**: Stable API, complete documentation, 95%+ compatibility with GNU file diff --git a/Cargo.lock b/Cargo.lock index 0b9f8241..8a480c9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,9 +258,9 @@ dependencies = [ [[package]] name = "clap_complete" -version = "4.6.0" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19c9f1dde76b736e3681f28cec9d5a61299cbaae0fce80a68e43724ad56031eb" +checksum = "406e68b4de5c59cfb8f750a7cbd4d31ae153788b8352167c1e5f4fc26e8c91e9" dependencies = [ "clap", ] diff --git a/Cargo.toml b/Cargo.toml index 6d7a4a6b..da04e1d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -147,13 +147,14 @@ cfg-if = "1.0.4" chrono = { version = "0.4.41", default-features = false, features = ["std", "clock"] } clap = { version = "4.6.0", features = ["derive"] } clap-stdin = "0.8.1" -clap_complete = "4.6.0" +clap_complete = "4.6.1" ctrlc = { version = "3.5.2", features = ["termination"] } env_logger = "0.11" log = "0.4" memchr = "2.8.0" memmap2 = "0.9.10" nom = "8.0.0" +regex = "1.12.3" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" thiserror = "2.0.18" @@ -171,7 +172,6 @@ insta = { version = "1.47.2", features = ["json"] } nix = { version = "0.31.2", features = ["fs"] } predicates = "3.1.4" proptest = "1.11.0" -regex = "1.12.3" tempfile = "3.27.0" [[bench]] diff --git a/GOTCHAS.md b/GOTCHAS.md index 2b88e42a..5acfcc8a 100644 --- a/GOTCHAS.md +++ b/GOTCHAS.md @@ -26,7 +26,7 @@ Serialization functions live in `src/parser/codegen.rs`, shared by both `build.r ### 2.1 `TypeKind` Exhaustive Matches -Adding a variant to `TypeKind` requires updating exhaustive matches in 10+ files: `ast`, `grammar`, `types`, `codegen`, `strength`, `property_tests`, `evaluator/types/mod.rs` (`read_typed_value`, `coerce_value_to_type`, **`bytes_consumed`** -- variable-width variants must be matched explicitly or relative-offset anchors will silently corrupt), `output/mod.rs` (2 length 
matches), `output/json.rs` (`format_value_as_hex`), and `grammar/tests.rs` (stale assertions). Note: `coerce_value_to_type`, output matches, and `bytes_consumed` use catch-all `_ =>` so they compile without changes but may need semantic updates -- `bytes_consumed` will fire a `debug_assert` in test/dev builds for unhandled variable-width variants. +Adding a variant to `TypeKind` requires updating exhaustive matches in 10+ files: `ast`, `grammar`, `types`, `codegen` (`serialize_type_kind` -- easy to forget; build.rs is a separate compilation unit so the error surfaces there first), `strength`, `property_tests`, `evaluator/types/mod.rs` (`read_typed_value`, `coerce_value_to_type`, **`bytes_consumed`** -- variable-width variants must be matched explicitly or relative-offset anchors will silently corrupt), `output/mod.rs` (2 length matches), `output/json.rs` (`format_value_as_hex`), and `grammar/tests.rs` (stale assertions). Note: `coerce_value_to_type`, output matches, and `bytes_consumed` use catch-all `_ =>` so they compile without changes but may need semantic updates -- `bytes_consumed` will fire a `debug_assert` in test/dev builds for unhandled variable-width variants. ### 2.2 `Operator` Exhaustive Matches @@ -38,6 +38,36 @@ Adding a variant to `Value` requires updating: `ast`, `codegen`, `strength`, `pr - **Note:** `Value` no longer derives `Eq` (removed when `Value::Float(f64)` was added) -- no production code depends on `Value: Eq`. +### 2.4 Pattern-Bearing Types Bypass `apply_operator` in the Engine + +`TypeKind::Regex` and `TypeKind::Search` are evaluated by **logical match** in `evaluate_single_rule_with_anchor` (`src/evaluator/engine/mod.rs`), not by string equality against `rule.value`. The engine calls `types::read_pattern_match`, which returns `Result<Option<Value>, _>`: `Some(v)` means the pattern matched (possibly zero-width) and `None` means it did not. The engine translates that `Option` directly into `Equal`/`NotEqual`. 
Comparing matched text to the pattern literal via `apply_operator` would fail for any regex with metacharacters (e.g., matched `"123"` vs pattern `"[0-9]+"`). **Non-equality operators on pattern-bearing types are rejected as `TypeReadError::UnsupportedType`** — an earlier revision fell through to `apply_operator` and silently produced lexicographic ordering comparisons against the pattern source text. If you add a new pattern-bearing `TypeKind` variant, add its arm to both `read_pattern_match` and `bytes_consumed_with_pattern`; the engine's special-case match is keyed on the `Regex | Search` pair so you must add new variants there too. + +### 2.5 Zero-Width Regex Matches vs Misses + +`read_regex` returns `Ok(Some(Value::String("")))` for a legitimate zero-width match (`^`, `a*`, `\b`, `.{0}`) and `Ok(None)` for a genuine miss. An earlier revision collapsed both cases to `Value::String(String::new())` and distinguished them by `is_empty()`, which broke every pattern that legitimately matches zero bytes. The structured `Option` is the invariant — do not re-flatten it. `read_typed_value_with_pattern` does collapse `None` to `Value::String(String::new())` for back-compat with its single-`Value` return shape, but the engine does not go through that function for pattern types; it calls `read_pattern_match` directly. + +### 2.6 Search Anchor Advance Is Match-End, Not Window-End + +`search_bytes_consumed` returns `match_idx + pattern.len()` — the byte just past the matched pattern — not `range` (the window size). This matches GNU `file` semantics: `src/softmagic.c` `FILE_SEARCH` in `moffset()` computes `o = ms->search.offset + vlen - offset` where `ms->search.offset` has already been advanced by `idx` (the match index inside the window) in `magiccheck`, and `vlen = m->vallen` (the pattern length). 
An earlier revision returned the full window size, which silently corrupted relative-offset children of every successful `search` rule (e.g., `search/256 "MAGIC"` at index 4 advanced the anchor by 256 instead of by 9). The fix threaded the pattern through `bytes_consumed_with_pattern` for `TypeKind::Search` so the scan can be re-run at anchor-advance time. Search does not currently support a `/s`-style start-offset flag; if one is added, match-end can become match-start. + +### 2.7 Regex `/l` Is Scan Window Bounds, Not Multi-Line Toggle + +`RegexFlags::line_based` (the `/l` suffix) controls *only* the scan window extent: when set, `count` is interpreted as a line count and `compute_window` walks line terminators (both `\n` and `\r\n`, each counting as one terminator) to bound the scan. It does **not** toggle regex multi-line matching — libmagic always compiles with `REG_NEWLINE` (unconditional at `src/softmagic.c::alloc_regex` line 2123), so `^` and `$` match at line boundaries for every regex rule regardless of `/l`. An earlier revision of this crate wrapped line-based patterns in `^(?:...)` and only set `multi_line(true)` when `/l` was set; that was wrong on both counts and has been removed. `build_regex` now unconditionally sets `multi_line(true)` and `dot_matches_new_line(false)` for all patterns. + +### 2.8 Regex Scan Window Is Always Capped at 8192 Bytes + +Every regex rule is subject to the `REGEX_MAX_BYTES` (8192) hard cap, matching GNU `file`'s `FILE_REGEX_MAX` (`src/file.h:522`). This applies: + +- When `count` is `None` (default scan). +- When `count` is `Some(n)` with `n > 8192` (explicit counts are clamped). +- When `flags.line_based` is set (the line-based walk stops after 8192 bytes even if the Nth terminator has not been reached yet). + +The cap is a DoS mitigation: without it, a malicious regex against a multi-GB buffer combined with `EvaluationConfig::default()` (no timeout — see S13.1) can hang the evaluator. 
It is enforced inside `compute_window` in `src/evaluator/types/regex.rs`. Do not add a path that bypasses the cap, even for "trusted" rules — the cap is also what makes the regex evaluator's worst-case runtime bounded. + +### 2.9 Regex `/s` Flag Affects Anchor Advance Only, Not Match Result + +`RegexFlags::start_offset` (the `/s` suffix) controls *only* `regex_bytes_consumed`: when set, the anchor advance is `m.start()` (match-start) instead of `m.end()` (match-end). The match *result* (whether a pattern matches, and what matched text is returned) is unchanged. This matches libmagic's `REGEX_OFFSET_START` flag, which zeros the `rm_len` contribution in `moffset()` but does not alter the regex scan itself. Tests for `/s` must exercise `regex_bytes_consumed` directly or check the resolved offset of a `Relative(N)` child rule; checking `read_regex` alone won't detect a broken `/s` implementation. + ## 3. Parser Architecture ### 3.1 Type Keyword Parsing Split @@ -77,6 +107,10 @@ Lowercase pointer specifiers (`.s`, `.l`, `.q`) map to **little-endian**, not na The load-bearing invariant is that the anchor is updated *before recursing into children* (so children and their followers see the new anchor). The current code also happens to set the anchor before `matches.push(...)`, but the push-ordering relative to `set_last_match_end` is incidental for anchor correctness -- only the ordering before the `evaluate_rules` recursion call matters. (Future code that reads the anchor while iterating `matches` would make this ordering load-bearing, so do not "optimize" the order without checking call sites first.) `bytes_consumed()` (in `evaluator/types/mod.rs`) is the source of truth for advance distance; for variable-width types it re-derives consumption from the buffer rather than trusting `Value::String.len()` (which can drift from the original byte length via `from_utf8_lossy`). 
Pascal-string consumption is also clamped against the remaining buffer to prevent attacker-controlled length prefixes from poisoning the anchor to `usize::MAX`. +### 3.9 `parse_text_magic_file` is Fail-Fast, Not Skip-on-Error + +`build_rule_hierarchy` propagates any `parse_magic_rule_line` error immediately, so a single unparseable rule (e.g., a child using unsupported `&+N` relative-offset syntax or an unquoted `$VAR` string value -- see S3.6) causes the **entire file load** to fail with `ParseError::InvalidSyntax`. There is no skip-and-continue mode. When writing corpus tests against third_party `.magic` files that mix supported and unsupported syntax, bypass the parser and build the equivalent `MagicRule` tree programmatically via the AST; the runtime evaluator can still be exercised end-to-end against the real testfile buffer. See `tests/evaluator_tests.rs::test_regex_eol_corpus` for a worked example. + ## 4. Module Visibility & Re-exports ### 4.1 Private Engine Module @@ -130,6 +164,10 @@ Middle-endian date keywords are NOT supported. They were removed until real midd libmagic types are signed by default (`byte`, `short`, `long`, `quad`). Unsigned variants use `u` prefix (`ubyte`, `ushort`, `ulong`, `uquad`, etc.). +### 6.4 `TypeKind::String { max_length: None }` Against Buffers Without NUL + +`read_string` with `max_length: None` reads until the first NUL or end of buffer. On NUL-free buffers (raw ASCII text, JSON, log lines, etc.) it reads the *entire remaining buffer*, and equality comparison against a short target value then fails. Programmatic rules built against such buffers must set `max_length: Some(target_len)` explicitly. Text magic rules (`string "MZ"`) typically work anyway because real executable headers contain NULs within the first few bytes. + ## 7. Testing ### 7.1 Doctest Import Paths @@ -224,3 +262,13 @@ All tags and commits MUST be signed -- use `git tag -s` and `git commit -s -S`. 
- **Rule:** Library consumers embedding libmagic-rs in services or untrusted-input pipelines should **not** use `EvaluationConfig::default()`. Use `EvaluationConfig::performance()` (which sets `timeout_ms: Some(1000)`) as the safe preset, or construct a config explicitly with a non-`None` timeout sized for your workload. - **Validation:** `timeout_ms` is clamped to `MAX_SAFE_TIMEOUT_MS` (5 minutes) by config validation and must be `> 0` if specified -- see the validation logic in `src/config.rs`. - **Note:** `Default` cannot be changed to set a timeout without breaking API expectations of callers who deliberately want no timeout (e.g., CLI one-shot invocations). The gotcha is that the unsafe default is the ergonomic choice; document the tradeoff prominently in any new consumer-facing docs. + +## 14. Output Formatting + +### 14.1 `\b` (Backspace) Prefix in Rule Messages Suppresses Leading Space + +`MagicDatabase::build_result` concatenates rule messages with a space separator, **except** when a message starts with `\u{0008}` (backspace / `\b`), in which case the backspace is stripped and no leading space is inserted. This mirrors GNU `file`'s description formatting (used by rules like `>&1 regex/1l ... \b, version %s` to produce `Ansible Vault text, version 1.1` instead of `Ansible Vault text , version 1.1`). Tests that manually simulate the concatenation path (e.g., corpus tests that bypass `load_from_file` -- see S3.9) must honor this convention or their assertions will diverge from the real evaluator output. + +### 14.2 `%s` (and Other printf-Style Format Specifiers) Are Not Substituted + +Magic rule messages like `\b, version %s` are passed through verbatim to the final concatenated description -- the evaluator does not implement printf-style format substitution. Captured values from regex/search/pattern matches live on `RuleMatch.value`, not embedded in `RuleMatch.message`. 
Tests or output checks that expect substituted text (e.g., "version 1.1") must either hardcode the expected token in the rule's message or assert against `RuleMatch.value` directly. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a8cf7305..d442e328 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -94,7 +94,14 @@ libmagic-rs/ │ │ ├── mod.rs # Public API surface with re-exports, EvaluationContext, RuleMatch │ │ ├── engine.rs # Core evaluation logic (evaluate_single_rule, evaluate_rules, evaluate_rules_with_config) │ │ ├── offset.rs # Offset resolution -│ │ ├── types.rs # Type reading with bounds checking +│ │ ├── types/ # Type reading subsystem +│ │ │ ├── mod.rs # Type dispatch and pattern matching +│ │ │ ├── numeric.rs # Byte, Short, Long, Quad +│ │ │ ├── float.rs # Float, Double +│ │ │ ├── date.rs # Date, QDate +│ │ │ ├── string.rs # String, PString +│ │ │ ├── regex.rs # Regex pattern matching +│ │ │ └── search.rs # Search literal scanning │ │ ├── operators.rs # Comparison operations │ │ └── strength.rs # Strength calculation and sorting │ │ @@ -288,6 +295,8 @@ pub struct MagicRule { - `Long { endian: Endianness, signed: bool }` - 32-bit integer - `Quad { endian: Endianness, signed: bool }` - 64-bit integer - `String { max_length: Option }` - Null-terminated string +- `Regex { flags: RegexFlags, count: Option }` - Regular expression matching +- `Search { range: NonZeroUsize }` - Bounded literal pattern search **Hierarchical Structure:** @@ -465,6 +474,8 @@ Vetted dependencies with minimal unsafe: - `memmap2` - Memory mapping (audited) - `nom` - Parsing (no unsafe) - `thiserror` - Error handling (no unsafe) +- `regex` - Pattern matching (production dependency) +- `memchr` - Fast byte searching (production dependency) --- @@ -499,7 +510,7 @@ The evaluation hot path is optimized for: 1. Add variant to `TypeKind` enum (`ast.rs`) 2. Add parsing logic (`grammar/mod.rs`) -3. Add reading logic (`types.rs`) +3. 
Add reading logic in `evaluator/types/` (as a submodule for complex types or in `types/mod.rs` for simple ones) 4. Add serialization support (`build_helpers.rs`) 5. Add tests 6. Update documentation diff --git a/docs/solutions/integration-issues/implementing-variable-width-typekind-variant.md b/docs/solutions/integration-issues/implementing-variable-width-typekind-variant.md new file mode 100644 index 00000000..8086fb61 --- /dev/null +++ b/docs/solutions/integration-issues/implementing-variable-width-typekind-variant.md @@ -0,0 +1,133 @@ +--- +title: Implementing regex and search evaluator types in libmagic-rs +category: integration-issues +date: 2026-04-10 +tags: [rust, evaluator, regex, search, typekind, libmagic, exhaustive-match] +severity: medium +components: [evaluator/types, parser/codegen, parser/grammar] +related_issues: [39] +--- + +## Problem + +Implementing evaluator support for `TypeKind::Regex` and `TypeKind::Search` in libmagic-rs exposed five interlocking issues: a stale `regex` crate feature flag, a dispatch signature that could not carry pattern operands to the type-reading layer, a missing anchor-advance path for variable-width regex matches, a build-script exhaustive-match failure that surfaced before the library error, and a clippy `doc_markdown` lint on module-level docs. + +## Root Cause + +1. `regex` v1.12+ exposes `regex::bytes::RegexBuilder` unconditionally; declaring `features = ["bytes"]` references a feature that no longer exists, so cargo rejects the manifest. +2. `read_typed_value(buffer, offset, type_kind)` was designed for fixed-shape numeric and string types that need only the buffer and offset. Regex and Search are fundamentally different — they require the rule's *value operand* (the pattern) at read time to compile the regex or locate the needle. +3. `bytes_consumed` (the source of truth for advancing `EvaluationContext::last_match_end` per GOTCHAS.md S3.8) re-derives consumption from the buffer for variable-width types. 
Regex matches have buffer-dependent lengths, so the anchor advance cannot be computed without re-running the regex. +4. `src/parser/codegen.rs` is included by `build.rs` via `#[path]` (GOTCHAS.md S1.2). Adding `TypeKind` variants breaks `serialize_type_kind`'s exhaustive match, and cargo surfaces the build-script compilation failure *before* the library error — a trap not previously documented in S2.1. +5. Clippy's pedantic `doc_markdown` lint flags unquoted identifiers like `TypeKind` in rustdoc, and each identifier must be individually backticked. + +## Solution + +**Manifest fix:** Drop the nonexistent feature flag in `Cargo.toml`: + +```toml +regex = "1.12.3" +``` + +**Dispatch threading:** Add `read_typed_value_with_pattern(buffer, offset, type_kind, pattern: Option<&Value>)` as a new entry point alongside the existing 3-arg `read_typed_value`, which becomes a thin wrapper that forwards `pattern: None`. The engine calls the pattern-aware form uniformly; the 3-arg convenience wrapper is retained so the ~30 existing call sites (`read_typed_value(buf, off, &kind)`) compile unchanged. Add a parallel `bytes_consumed_with_pattern` so the anchor-advance path can reach the pattern operand for `TypeKind::Regex` and `TypeKind::Search`. + +Additionally, expose a `read_pattern_match(buffer, offset, type_kind, pattern) -> Result<Option<Value>, TypeReadError>` helper for the engine's pattern-bearing code path. `Option` is the structured "no match" signal: a genuine miss returns `None`, while a legitimate zero-width regex match (e.g., `^`, `a*`, `\b`) returns `Some(Value::String(String::new()))`. `read_typed_value_with_pattern` collapses `None` to `Value::String(String::new())` for back-compat with the single-`Value` return shape; the engine path uses `read_pattern_match` directly and drives its own `Equal`/`NotEqual` decision from the `Option` discriminant. 
+ +**Regex reader** (`src/evaluator/types/regex.rs`) — uses a `build_regex` helper that wraps the pattern in `^(?:...)` when `/l` is set so bare, unanchored patterns cannot match mid-line: + +```rust +fn build_regex( + pattern: &str, + case_insensitive: bool, + start_of_line: bool, +) -> Result<Regex, regex::Error> { + let owned; + let effective_pattern: &str = if start_of_line { + owned = format!("^(?:{pattern})"); + &owned + } else { + pattern + }; + RegexBuilder::new(effective_pattern) + .case_insensitive(case_insensitive) + .multi_line(start_of_line) + .build() +} + +pub fn read_regex( + buffer: &[u8], + offset: usize, + pattern: &str, + case_insensitive: bool, + start_of_line: bool, +) -> Result<Option<Value>, TypeReadError> { + if offset >= buffer.len() { return Err(BufferOverrun { .. }); } + let regex = build_regex(pattern, case_insensitive, start_of_line) + .map_err(|e| UnsupportedType { + type_name: format!("regex compile error: {e}"), + })?; + let remaining = &buffer[offset..]; + Ok(regex.find(remaining).map(|m| { + Value::String(String::from_utf8_lossy(m.as_bytes()).into_owned()) + })) +} +``` + +**Search reader** (`src/evaluator/types/search.rs`): + +```rust +pub fn read_search( + buffer: &[u8], + offset: usize, + pattern: &[u8], + range: Option<usize>, +) -> Result<Option<Value>, TypeReadError> { + if offset >= buffer.len() { return Err(BufferOverrun { .. }); } + let remaining = &buffer[offset..]; + let window_len = range.map_or(remaining.len(), |n| n.min(remaining.len())); + let window = &remaining[..window_len]; + Ok(memchr::memmem::find(window, pattern).map(|_| { + Value::String(String::from_utf8_lossy(pattern).into_owned()) + })) +} +``` + +`None` is the structured "no match" signal, which lets the engine distinguish a zero-width regex match from a genuine miss without reusing `Value::String(String::new())` as a sentinel. + +**Anchor advance:** In `bytes_consumed_with_pattern`, the `Regex` arm re-runs the regex via `regex_bytes_consumed(...)` and returns `m.end()`. 
The `Search` arm re-runs `memchr::memmem::find` against the window and returns `match_idx + pattern.len()` — the byte just past the matched needle, matching GNU `file`'s `softmagic.c` `FILE_SEARCH` path where `ms->search.offset += idx` and then `moffset()` adds `vlen = m->vallen`. An earlier revision of this PR advanced by the full window size (`range`); that was wrong and caused relative-offset children to land far past the intended byte. + +**Engine pattern-bearing code path:** In `evaluate_single_rule_with_anchor`, split the flow into two arms. For `TypeKind::Regex | Search`, call `read_pattern_match` and translate its `Option` result directly into `Equal` (`Some` → match) / `NotEqual` (`None` → match) — no `apply_operator` call. Any other operator on a pattern-bearing type is rejected as `TypeReadError::UnsupportedType` because it has no well-defined semantics (ordering a matched string against the pattern literal produces nonsense). For all other types, continue through `read_typed_value_with_pattern` + `coerce_value_to_type` + `apply_operator` as before. + +**Codegen:** Add `Regex { .. }` and `Search { .. }` arms to `serialize_type_kind` in `src/parser/codegen.rs`. Verify `cargo check` against `build.rs` output, not just the library. + +**Doc lint:** Backtick identifiers individually in module docs: `` //! Implements the `regex` `TypeKind`. `` + +## Prevention + +- **Verify crate features on docs.rs before adding them.** The `regex` crate dropped the `bytes` feature by v1.12 (`regex::bytes` is unconditional). Check `https://docs.rs/<crate>/<version>/` for the exact feature list before editing `Cargo.toml`. A wasted `cargo build` cycle is the cheap failure mode; a silently-disabled feature is the expensive one. 
+- **When adding a `TypeKind` variant, walk GOTCHAS S2.1 in order, then verify the build.rs pipeline.** The hidden site is `serialize_type_kind` in `src/parser/codegen.rs` — it is included via `#[path]` in `build.rs`, so omissions surface as confusing `E0004`/`E0599` errors from `build.rs` *before* any library file compiles. Run `cargo clean && cargo check` after editing `TypeKind` to shake these out early. +- **`bytes_consumed` is load-bearing for relative offsets.** Any variable-width variant (`Regex`, `Search`, `String`, `PString`, future additions) MUST have an explicit arm in `bytes_consumed` in `src/evaluator/types/mod.rs`. The catch-all `_ =>` arm fires a `debug_assert` in dev/test, but release builds will silently corrupt the GNU `file` anchor for any downstream `Relative(N)` sibling. Treat missing arms as a correctness bug, not a lint. +- **Sibling functions beat signature extensions when the new concern is narrow.** The earlier design in this solution suggested extending `read_typed_value` in place; the current implementation instead added a sibling `read_typed_value_with_pattern` and kept `read_typed_value` as a zero-cost wrapper. The sibling approach avoided updating ~30 existing call sites that would otherwise have to pass `None` for the new argument. When only a narrow slice of callers needs the new capability, a sibling function is cheaper and easier to review. +- **Do not overload `Value::String("")` as a "no match" sentinel.** A zero-width regex match (`^`, `a*`, `\b`) returns a valid empty matched string that is not a miss. Use `Result<Option<Value>, _>` or a dedicated sentinel variant when the reader needs to distinguish "found nothing" from "found zero bytes." The engine path must work from the `Option`, not from `is_empty()`. +- **Search advances by match-end, not window-end.** The GNU `file` contract is `anchor += match_idx + pattern.len()`; the full search window size is only used as a bound on the scan.
Getting this wrong silently corrupts relative-offset children of every successful search rule with no test failure for any rule that does not chain children. +- **Pattern-bearing types reject non-equality operators.** `regex < "foo"` and `search & 0xff` are magic-file semantic bugs. The engine should return a structured error rather than falling through to `apply_operator`, which produces garbage ordering comparisons against the pattern literal. +- **Backtick every Rust identifier individually in doc comments.** Clippy `doc_markdown` fires on bare `TypeKind` even inside a sentence like "extends `read_typed_value` for TypeKind::Regex". Write `` `TypeKind::Regex` `` as a separate backticked span. + +## Testing + +- **Unit tests for `read_regex` and `read_search`** (added this session): basic match, no-match, case-insensitive flag, start-of-line anchor, non-zero offset handling, bounded search range, invalid/unparseable pattern error path, and binary (non-UTF-8) buffer handling. +- **Start-of-line anchoring negative test.** With `/l` enabled, a bare (unanchored) pattern like `"line"` that appears only mid-line must return `None` (the structured no-match signal), not an empty-string sentinel. The `build_regex` helper's `^(?:...)` wrapper is what makes this correct — test it explicitly so a future refactor does not regress. +- **Anchor-advance regression tests.** After a successful `Regex` or `Search` match at offset `O` consuming `N` bytes, assert `EvaluationContext::last_match_end() == O + N`. Add a parallel test for the no-match path (anchor must not advance). +- **Sibling-after-regex integration test.** Construct a `MagicRule` tree where a `Regex` parent match is followed by a sibling with `OffsetSpec::Relative(+K)`; verify the sibling reads from `anchor + K`, not from absolute `K`. Repeat for `Search` and for `Relative(-K)` to cover both directions.
+- **Property test hook.** Add `Regex` and `Search` arms to `arb_type_kind` in `tests/property_tests.rs` so the codegen round-trip and strength-calculation invariants exercise the new variants automatically. + +## Related Documentation + +- `GOTCHAS.md` S2.1 — TypeKind exhaustive-match checklist across 10+ files (ast, grammar, types, codegen, strength, property_tests, evaluator/types, output, grammar/tests); catch-all arms in `bytes_consumed` will fire `debug_assert` for variable-width variants. +- `GOTCHAS.md` S3.1 — parser type-keyword split between `src/parser/types.rs` (`parse_type_keyword` / `type_keyword_to_kind`) and `src/parser/grammar/mod.rs` for suffixes. +- `GOTCHAS.md` S1.2 / S1.3 — build.rs / codegen serialization boundary and generated-import sync (`generate_builtin_rules` in `src/parser/codegen.rs`). +- `GOTCHAS.md` S3.8 — `bytes_consumed` as source of truth for `EvaluationContext::last_match_end` anchor advance. +- `GOTCHAS.md` S8.1 — `enum_variant_names` clippy guidance for same-suffix variants; S10.3 — public enum variants require `# Examples` rustdoc (clippy enforced). +- `AGENTS.md` "Adding New Type Support" — 7-step procedure for new `TypeKind` variants. +- GitHub issue **#39** — parent ticket tracking regex and search type evaluator support. + +No prior solution doc specifically covers regex/search type matching, the build.rs/codegen indirect-error surface, or `clippy::doc_markdown` fixes. 
diff --git a/docs/src/ast-structures.md b/docs/src/ast-structures.md index 2058335b..fd0abb44 100644 --- a/docs/src/ast-structures.md +++ b/docs/src/ast-structures.md @@ -191,6 +191,17 @@ pub enum TypeKind { length_width: PStringLengthWidth, length_includes_itself: bool, }, + + /// Regular expression pattern matching + Regex { + flags: RegexFlags, + count: Option<NonZeroU32>, + }, + + /// Bounded literal byte sequence search + Search { + range: NonZeroUsize, + }, } ``` @@ -328,6 +339,140 @@ let limited_pstring = TypeKind::PString { }; ``` +### Regex (Regular Expression Pattern Matching) + +The `Regex` variant matches POSIX-extended regular expression patterns against file buffers. Patterns are binary-safe and always compiled with multi-line mode enabled (matching `^` and `$` at line boundaries). The scan window is capped at 8192 bytes regardless of the count parameter. + +**Structure:** + +```rust +Regex { + flags: RegexFlags, + count: Option<NonZeroU32>, +} +``` + +**Fields:** + +- `flags`: Modifier flags from the `/[csl]` suffix (case-insensitive, start-offset, line-based) +- `count`: Optional numeric scan limit, interpreted as bytes or lines depending on `flags.line_based` + +**Example:** + +```text +0 regex [0-9]+ Numeric content +0 regex/1l ^#!/ Shebang on first line +0 regex/cs json Case-insensitive "json" anywhere +``` + +**Behavior:** + +- Returns `Value::String` containing the matched text +- Scan window capped at 8192 bytes (GNU `file` `FILE_REGEX_MAX`) +- Multi-line mode unconditional (`^`/`$` match line boundaries, `.` does not match newlines) +- Zero-width matches (e.g., `^`, `a*`) return `Value::String("")` and are distinguished from no-match +- Only supports `Equal` and `NotEqual` operators; other comparison operators return `TypeReadError::UnsupportedType` + +### RegexFlags Struct + +The `RegexFlags` struct specifies regex behavior modifiers. All flags default to `false` via `RegexFlags::default`.
+ +```rust +pub struct RegexFlags { + /// `/c` - case-insensitive matching + pub case_insensitive: bool, + /// `/s` - advance anchor to match-start instead of match-end + pub start_offset: bool, + /// `/l` - measure scan window in lines instead of bytes + pub line_based: bool, +} +``` + +**Flag combinations:** + +- `/c` - case-insensitive matching +- `/s` - anchor advances to match-start (for chaining child rules) +- `/l` - count parameter measured in lines (80 bytes per line, capped at 8192 total) +- `/cs`, `/cl`, `/sl`, `/csl` - any combination of flags + +**Examples:** + +```rust +use libmagic_rs::parser::ast::{TypeKind, RegexFlags}; +use std::num::NonZeroU32; + +// Plain regex with 8192-byte default scan window +let plain_regex = TypeKind::Regex { + flags: RegexFlags::default(), + count: None, +}; + +// First line only (1 line, capped at 8192 bytes) +let first_line = TypeKind::Regex { + flags: RegexFlags { + line_based: true, + ..RegexFlags::default() + }, + count: NonZeroU32::new(1), +}; + +// Case-insensitive with anchor at match-start +let case_start = TypeKind::Regex { + flags: RegexFlags { + case_insensitive: true, + start_offset: true, + line_based: false, + }, + count: None, +}; +``` + +### Search (Bounded Literal Byte Sequence Search) + +The `Search` variant scans for a literal byte pattern within a bounded range. Unlike `String`, which matches only at the exact offset, `Search` scans forward up to `range` bytes for the first occurrence. 
+ +**Structure:** + +```rust +Search { + range: NonZeroUsize, +} +``` + +**Fields:** + +- `range`: Mandatory scan window width in bytes (must be non-zero per GNU `file` magic(5) specification) + +**Example:** + +```text +0 search/256 PK\003\004 ZIP archive within first 256 bytes +``` + +**Behavior:** + +- Returns `Value::String` containing the matched bytes if found within range +- Anchor advances to the end of the matched pattern (`match_idx + pattern.len()`), not by the full search window +- Only supports `Equal` and `NotEqual` operators +- Range is mandatory; `search/0` or bare `search` are parse errors + +**Examples:** + +```rust +use libmagic_rs::parser::ast::TypeKind; +use std::num::NonZeroUsize; + +// Scan up to 256 bytes for the pattern +let bounded_search = TypeKind::Search { + range: NonZeroUsize::new(256).unwrap(), +}; + +// Scan up to 1024 bytes +let wide_search = TypeKind::Search { + range: NonZeroUsize::new(1024).unwrap(), +}; +``` + ### Endianness Options ```rust @@ -520,9 +665,11 @@ let script_rule = MagicRule { 1. **Use `Byte { signed }`** for single-byte values and flags, specifying signedness 2. **Use `Short/Long/Quad`** with explicit endianness and signedness for multi-byte integers -3. **Use `String`** with length limits for text patterns +3. **Use `String`** with length limits for text patterns at exact offsets 4. **Use `PString`** for Pascal-style length-prefixed strings -5. **Use `Bytes`** for exact binary sequences +5. **Use `Regex`** for pattern matching (complex patterns, line-based checks, case-insensitive matching) +6. **Use `Search`** for simple substring matching within a bounded range (faster than regex for literal patterns) +7.
**Use `Bytes`** for exact binary sequences ### Performance Considerations diff --git a/docs/src/compatibility.md b/docs/src/compatibility.md index 444d5eca..3f5d7327 100644 --- a/docs/src/compatibility.md +++ b/docs/src/compatibility.md @@ -75,9 +75,9 @@ $ rmagic --json example.elf | Hierarchical rules | ✅ | ✅ | Complete | Parent-child relationships | | Indirect offsets | ✅ | ✅ | Complete | Pointer dereferencing | | Relative offsets | ✅ | ✅ | Complete | Position-relative addressing (PR #211) | -| Search patterns | ✅ | 📋 | Planned | Pattern searching in ranges | +| Search patterns | ✅ | ✅ | Complete | Pattern searching in ranges (PR #214) | | Bitwise operations | ✅ | ✅ | Complete | AND, XOR, NOT operations | -| String operations | ✅ | 📋 | Planned | Case-insensitive, regex | +| String operations | ✅ | ✅ | Complete | Case-insensitive, regex (PR #214) | | Date/time formats | ✅ | ✅ | Complete | 32-bit and 64-bit timestamps | | Floating point | ✅ | ✅ | Complete | Float, double with endianness | | Unicode support | ✅ | 📋 | Planned | UTF-8, UTF-16 strings | diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index b74d0711..258d7865 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -132,16 +132,30 @@ Interprets bytes according to type specifications. 
The types module is organized - **QDate**: 64-bit Unix timestamps (signed seconds since epoch) with configurable endianness and UTC/local time formatting - **String**: Byte sequences with length limits - **PString**: Pascal-style length-prefixed strings with 1-byte (`/B`), 2-byte (`/H` or `/h`), or 4-byte (`/L` or `/l`) length prefixes, supporting big-endian and little-endian byte order +- **Regex**: Binary-safe regex matching via `regex::bytes::Regex`; the `/c` flag enables case-insensitive matching and `/l` enables multi-line start-of-line anchoring +- **Search**: Bounded literal pattern scan via `memchr::memmem::find`; `search/N` caps the scan window to `N` bytes from the offset - **Bounds checking**: Prevents buffer overruns ```rust +// Non-pattern types use the 3-arg convenience wrapper: pub fn read_typed_value( buffer: &[u8], offset: usize, type_kind: &TypeKind, ) -> Result<Value, TypeReadError> + +// Pattern-bearing types (Regex, Search) thread the rule's value operand +// through as the match pattern: +pub fn read_typed_value_with_pattern( + buffer: &[u8], + offset: usize, + type_kind: &TypeKind, + pattern: Option<&Value>, +) -> Result<Value, TypeReadError> ``` + +The engine calls `read_typed_value_with_pattern` for value-based (non-pattern) types and passes `Some(&rule.value)`; the convenience `read_typed_value` is a thin wrapper that forwards `pattern: None`. For pattern-bearing types a genuine "no match" is collapsed to `Value::String(String::new())` in the `read_typed_value_with_pattern` return so the back-compat `Value` shape is preserved; the engine instead calls `read_pattern_match` directly for `Regex` and `Search` rules, which returns `Result<Option<Value>, _>` so zero-width matches (e.g. `^`, `a*`) can be distinguished from genuine misses. + +The `read_byte` function signature changed in v0.2.0 to accept three parameters (`buffer`, `offset`, and `signed`) instead of two, allowing explicit control over signed vs unsigned byte interpretation.
**Floating-Point Type Reading (`evaluator/types/float.rs`):** diff --git a/docs/src/parser.md b/docs/src/parser.md index b59adde2..71ba7bef 100644 --- a/docs/src/parser.md +++ b/docs/src/parser.md @@ -285,6 +285,139 @@ The parser supports date and timestamp types for parsing Unix timestamps (signed The parser creates `TypeKind::Date` or `TypeKind::QDate` variants with appropriate endianness and UTC flags. During evaluation, timestamps are formatted as strings in the format "Www Mmm DD HH:MM:SS YYYY" to match GNU file output. +### Regex Type + +The parser supports regular expression matching through the `regex` keyword, enabling POSIX-extended regex patterns against file contents: + +**Type Keyword:** + +- `regex` - Regular expression match → `TypeKind::Regex { flags, count }` + +**Flag Support:** + +Regex rules accept three modifier flags via the `/[csl]` suffix: + +- `/c` - Case-insensitive matching → `RegexFlags::case_insensitive = true` +- `/s` - Advance anchor to match-start instead of match-end → `RegexFlags::start_offset = true` +- `/l` - Line-based counting (interpret count as line count) → `RegexFlags::line_based = true` + +Flags can be combined in any order (`/cl`, `/lc`, `/csl` are all equivalent). The parser also accepts interleaved flag-and-count syntax matching GNU `file` semantics: `regex/1l` and `regex/l1` both parse identically. + +**Optional Count Parameter:** + +An optional decimal count controls the scan window: + +- No count: scan 8192 bytes (default) +- `/N` (no `/l`): scan at most `N` bytes, capped at 8192 +- `/Nl` (with `/l`): scan at most `N` lines, effective byte cap is `min(N * 80, 8192)` + +The 8192-byte hard cap matches GNU `file`'s `FILE_REGEX_MAX` constant and prevents runaway regex scans against large buffers. 
+ +**Parsing Examples:** + +```rust +// Plain regex (no flags, 8192-byte default scan window) +parse_type_and_operator("regex") +// → TypeKind::Regex { flags: RegexFlags::default(), count: None } + +// Case-insensitive flag +parse_type_and_operator("regex/c") +// → TypeKind::Regex { flags: RegexFlags { case_insensitive: true, .. }, count: None } + +// Line-based with explicit count +parse_type_and_operator("regex/1l") +// → TypeKind::Regex { flags: RegexFlags { line_based: true, .. }, count: Some(1) } + +// Combined flags and count (interleaved order accepted) +parse_type_and_operator("regex/c256s") +// → TypeKind::Regex { flags: RegexFlags { case_insensitive: true, start_offset: true, .. }, count: Some(256) } +``` + +**Usage in Magic Rules:** + +```rust +// Match lines starting with a digit +0 regex "^[0-9]" numeric prefix + +// Case-insensitive JSON detection +0 regex/c "\\{.*\"[^\"]+\"" possible JSON + +// Scan first line only for version string +>1 regex/1l "version [0-9]+" version line +``` + +**Regex Semantics:** + +- Patterns are compiled with multi-line mode always enabled (matching libmagic's unconditional `REG_NEWLINE`), so `^` and `$` match at line boundaries and `.` does not match `\n`. +- The scan window is always capped at 8192 bytes regardless of the `count` value. +- Zero-width matches (`^`, `a*`, lookaheads) are preserved as `Value::String("")` and distinguished from genuine misses. +- Regex rules only support `Operator::Equal` and `Operator::NotEqual`; other comparison operators are rejected at evaluation time. 
+ +**Features:** + +- ✅ `regex` keyword recognition with suffix parsing +- ✅ Three modifier flags (`/c`, `/s`, `/l`) with arbitrary combination order +- ✅ Optional numeric count parameter (interleaved with flags per GNU `file` semantics) +- ✅ 8192-byte scan window cap matching `FILE_REGEX_MAX` +- ✅ Bare `regex/` with no valid modifier is a parse error +- ✅ `regex/0` is rejected (zero count has no valid semantics) +- ✅ `RegexFlags` struct representation for clean flag management + +### Search Type + +The parser supports bounded literal byte sequence searching through the `search` keyword: + +**Type Keyword:** + +- `search` - Multi-byte pattern search within bounded range → `TypeKind::Search { range }` + +**Mandatory Range Parameter:** + +Search rules require a decimal range suffix specifying the scan window width in bytes: + +- `/N` - Scan up to `N` bytes for the literal pattern, stored as `NonZeroUsize` + +Per GNU `file` magic(5) specification, the range is **mandatory**. Bare `search` (no `/N` suffix) and `search/0` are both rejected at parse time. + +**Parsing Examples:** + +```rust +// 256-byte search window +parse_type_and_operator("search/256") +// → TypeKind::Search { range: NonZeroUsize(256) } + +// Bare search is a parse error (range is mandatory) +parse_type_and_operator("search") +// → Err(...) + +// Zero-range search is rejected +parse_type_and_operator("search/0") +// → Err(...) +``` + +**Usage in Magic Rules:** + +```rust +// Scan up to 256 bytes for DOS MZ header +0 search/256 "MZ" DOS executable + +// Look for ZIP signature within first 1024 bytes +0 search/1024 "PK\x03\x04" ZIP archive +``` + +**Search Semantics:** + +- Unlike `TypeKind::String`, which only matches at the exact offset, `search` scans forward up to `range` bytes for the first occurrence of the literal pattern. +- The anchor advances to the end of the matched pattern (matching libmagic's `FILE_SEARCH` behavior in `softmagic.c::moffset()`). 
+- Search rules only support `Operator::Equal` and `Operator::NotEqual`; other comparison operators are rejected at evaluation time. + +**Features:** + +- ✅ `search` keyword recognition with mandatory `/N` suffix +- ✅ `NonZeroUsize` range representation (zero-width scan unrepresentable) +- ✅ Bare `search` and `search/0` rejected at parse time +- ✅ Binary-safe literal matching via `memchr::memmem::find` + ## Parser Design Principles ### Error Handling @@ -395,7 +528,6 @@ match detect_format(path)? { ### Not Yet Implemented - **Indirect Offsets**: Pointer dereferencing patterns (e.g., `(0x3c.l)`) -- **Regex Support**: Regular expression matching in rules - **Binary .mgc Format**: Compiled magic database format - **Strength Modifiers**: `!:strength` parsing for rule priority diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 4acf0a3a..cc74abbe 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -107,24 +107,87 @@ fn evaluate_single_rule_with_anchor( let absolute_offset = offset::resolve_offset_with_context(&rule.offset, buffer, last_match_end)?; - // Step 2: Read and interpret bytes at the resolved offset according to the rule's type - let read_value = types::read_typed_value(buffer, absolute_offset, &rule.typ) - .map_err(|e| LibmagicError::EvaluationError(e.into()))?; + // Step 2 & 3: Read and interpret bytes at the resolved offset according + // to the rule's type, and compute the logical match state. + // + // Pattern-bearing types (Regex, Search) take a different path from + // fixed-width types because the rule's `value` operand is the *pattern*, + // not an expected matched value. Running those through `apply_operator` + // would compare matched text ("123") against the pattern literal + // ("[0-9]+") and produce false negatives on any regex with + // metacharacters. 
Instead, `read_pattern_match` returns `Some(v)` on a + // successful match (possibly zero-width) and `None` on a genuine miss; + // the engine translates that directly into Equal / NotEqual. Any other + // operator on a pattern-bearing type is a magic-file semantic bug and + // surfaces as a hard error -- the fallthrough to `apply_operator` + // previously masked this by producing nonsense ordering comparisons + // against the pattern source text. + let (matched, read_value) = match &rule.typ { + crate::parser::ast::TypeKind::Regex { .. } + | crate::parser::ast::TypeKind::Search { .. } => { + let match_outcome = + types::read_pattern_match(buffer, absolute_offset, &rule.typ, Some(&rule.value)) + .map_err(|e| LibmagicError::EvaluationError(e.into()))?; + let pattern_found = match_outcome.is_some(); + let matched = match &rule.op { + crate::parser::ast::Operator::Equal => pattern_found, + crate::parser::ast::Operator::NotEqual => !pattern_found, + other => { + return Err(LibmagicError::EvaluationError( + types::TypeReadError::UnsupportedType { + type_name: format!( + "operator {other:?} is not supported for pattern-bearing type {:?}; only Equal (=) and NotEqual (!=) are allowed", + rule.typ + ), + } + .into(), + )); + } + }; + // For anchor-advance and output, present the match as a + // `Value::String`. A genuine miss is represented as an empty + // string to keep the downstream `RuleMatch.value` contract + // uniform; the engine already decided `matched` above so the + // placeholder value only affects display and + // `bytes_consumed_with_pattern` (which re-derives the match + // position from the pattern, not this value). + let value = + match_outcome.unwrap_or_else(|| crate::parser::ast::Value::String(String::new())); + (matched, value) + } + _ => { + // Value-based types: read the typed value and apply the operator + // against the rule's expected value. 
+ let read_value = types::read_typed_value_with_pattern( + buffer, + absolute_offset, + &rule.typ, + Some(&rule.value), + ) + .map_err(|e| LibmagicError::EvaluationError(e.into()))?; - // Step 3: Coerce the rule's expected value to match the type's signedness/width. - // `coerce_value_to_type` returns `Cow::Borrowed` on the hot path so no - // allocation happens for pass-through values (e.g., string matches). - let expected_value = types::coerce_value_to_type(&rule.value, &rule.typ); - let expected_ref: &crate::parser::ast::Value = expected_value.as_ref(); + // Coerce the rule's expected value to match the type's + // signedness/width. `coerce_value_to_type` returns + // `Cow::Borrowed` on the hot path so no allocation happens for + // pass-through values (e.g., string matches). + let expected_value = types::coerce_value_to_type(&rule.value, &rule.typ); + let expected_ref: &crate::parser::ast::Value = expected_value.as_ref(); - // Step 4: Apply the operator to compare the read value with the expected value - // BitwiseNot needs type-aware bit-width masking so the complement is computed - // at the type's natural width (e.g., byte NOT of 0x00 = 0xFF, not u64::MAX). - let matched = match &rule.op { - crate::parser::ast::Operator::BitwiseNot => { - operators::apply_bitwise_not_with_width(&read_value, expected_ref, rule.typ.bit_width()) + // BitwiseNot needs type-aware bit-width masking so the + // complement is computed at the type's natural width (e.g., + // byte NOT of 0x00 = 0xFF, not u64::MAX). + let matched = match &rule.op { + crate::parser::ast::Operator::BitwiseNot => { + operators::apply_bitwise_not_with_width( + &read_value, + expected_ref, + rule.typ.bit_width(), + ) + } + op => operators::apply_operator(op, &read_value, expected_ref), + }; + (matched, read_value) } - op => operators::apply_operator(op, &read_value, expected_ref), }; Ok(matched.then_some((absolute_offset, read_value))) } @@ -280,7 +343,12 @@ pub fn evaluate_rules( // anchor. 
The anchor is updated unconditionally to the end of // this match -- it may move forward or backward depending on // where successive rules match (it is *not* a high-watermark). - let consumed = types::bytes_consumed(buffer, absolute_offset, &rule.typ); + let consumed = types::bytes_consumed_with_pattern( + buffer, + absolute_offset, + &rule.typ, + Some(&rule.value), + ); let new_anchor = absolute_offset.saturating_add(consumed); context.set_last_match_end(new_anchor); diff --git a/src/evaluator/engine/tests.rs b/src/evaluator/engine/tests.rs index 1583b386..db928437 100644 --- a/src/evaluator/engine/tests.rs +++ b/src/evaluator/engine/tests.rs @@ -2384,3 +2384,249 @@ fn test_resource_exhaustion_large_buffer_completes_without_panic() { matches.len() ); } + +/// A regex rule whose pattern contains metacharacters must succeed when the +/// pattern actually matches the buffer. Prior to this fix, the engine compared +/// the matched text (e.g., "123") against the pattern literal ("[0-9]+") via +/// `apply_operator`, which failed for any real regex. +#[test] +fn test_regex_rule_with_metacharacters_matches() { + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }, + op: Operator::Equal, + value: Value::String("[0-9]+".to_string()), + message: "has digits".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_single_rule(&rule, b"abc123def", &mut context).unwrap(); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].message, "has digits"); +} + +/// A regex rule whose pattern does not match must not match, confirming that +/// the logical-match shortcut only fires on a non-empty reader result. 
+#[test] +fn test_regex_rule_with_metacharacters_no_match() { + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }, + op: Operator::Equal, + value: Value::String("[0-9]+".to_string()), + message: "has digits".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_single_rule(&rule, b"abcdef", &mut context).unwrap(); + assert!(matches.is_empty()); +} + +/// A search rule with `Operator::NotEqual` succeeds only when the literal +/// pattern is absent from the window. +#[test] +fn test_search_rule_not_equal_succeeds_when_pattern_absent() { + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Search { + range: ::std::num::NonZeroUsize::new(64).unwrap(), + }, + op: Operator::NotEqual, + value: Value::String("needle".to_string()), + message: "no needle".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_single_rule(&rule, b"plain haystack", &mut context).unwrap(); + assert_eq!(matches.len(), 1); +} + +/// A non-Equal/NotEqual operator on a pattern-bearing type must surface as +/// a hard error, not silently produce an ordering comparison against the +/// pattern source text. Pre-fix, `regex > "[0-9]+"` matched by coincidence +/// whenever the empty "no match" sentinel happened to lexicographically +/// exceed the pattern literal. 
+#[test] +fn test_regex_rule_with_ordering_operator_is_rejected() { + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }, + op: Operator::GreaterThan, + value: Value::String("[0-9]+".to_string()), + message: "bogus".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let result = evaluate_single_rule(&rule, b"abcdef", &mut context); + match result { + Err(LibmagicError::EvaluationError(_)) => {} + other => panic!("expected EvaluationError for ordering operator on regex, got {other:?}"), + } +} + +#[test] +fn test_search_rule_with_bitwise_operator_is_rejected() { + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Search { + range: ::std::num::NonZeroUsize::new(32).unwrap(), + }, + op: Operator::BitwiseAnd, + value: Value::String("needle".to_string()), + message: "bogus".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let result = evaluate_single_rule(&rule, b"plain haystack", &mut context); + assert!( + matches!(result, Err(LibmagicError::EvaluationError(_))), + "expected EvaluationError for bitwise operator on search" + ); +} + +/// A child rule with `OffsetSpec::Relative(0)` after a parent regex match +/// must resolve to `parent_absolute_offset + match_length`, so the byte the +/// child reads is the first byte *after* the parent's match. This is the +/// regression test GOTCHAS 2.1 warns about: if `bytes_consumed_with_pattern` +/// returns the wrong number for `TypeKind::Regex`, the child lands at the +/// wrong offset and either misses or matches the wrong byte. 
+#[test] +fn test_regex_parent_advances_anchor_for_relative_child() { + // Buffer: "abc123X" -- parent regex "abc" matches bytes 0..3, so a + // Relative(0) child should read byte 3 = '1' (0x31). A Relative(-1) + // child would read byte 2 = 'c' (0x63). + let child = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(u64::from(b'1')), + message: "first digit".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }, + op: Operator::Equal, + value: Value::String("abc".to_string()), + message: "abc prefix".to_string(), + children: vec![child], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&[parent], b"abc123X", &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "expected parent + child match, got {}: {matches:?}", + matches.len() + ); + assert_eq!(matches[0].message, "abc prefix"); + assert_eq!(matches[1].message, "first digit"); +} + +/// A child rule with `OffsetSpec::Relative(0)` after a parent search match +/// must land at `match_index + pattern.len()` — NOT at `window_end` (the +/// pre-fix window-size advance would land on a completely different byte). +#[test] +fn test_search_parent_advances_anchor_to_match_end_not_window_end() { + // Buffer: "XXXneedleYY_ZZ" -- parent `search/32 "needle"` finds the + // pattern at index 3, length 6, match-end = 9. A Relative(0) child + // should read byte 9 = 'Y' (0x59). With the bug, the anchor would + // advance by 32 bytes (way past the buffer) or (with range=14) by 14 + // to index 14 which is past the buffer end. 
+ let child = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(u64::from(b'Y')), + message: "trailing Y".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Search { + range: ::std::num::NonZeroUsize::new(14).unwrap(), + }, + op: Operator::Equal, + value: Value::String("needle".to_string()), + message: "found needle".to_string(), + children: vec![child], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&[parent], b"XXXneedleYY_ZZ", &mut context).unwrap(); + assert_eq!(matches.len(), 2, "expected parent + child, got {matches:?}"); + assert_eq!(matches[1].message, "trailing Y"); +} + +/// Sanity check a non-zero displacement: when the parent search finds the pattern +/// early in the window, a Relative(+N) child should still resolve against +/// the match-end anchor. This catches a class of bugs where the anchor +/// update uses the wrong base offset. +#[test] +fn test_search_parent_relative_child_at_positive_offset() { + // Buffer: "prefix_NEEDLE_after_stuff" -- "NEEDLE" is at index 7, len + // 6, match-end = 13. A Relative(1) child should read byte 14 = 'a'.
+ let child = MagicRule { + offset: OffsetSpec::Relative(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(u64::from(b'a')), + message: "a after".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Search { + range: ::std::num::NonZeroUsize::new(32).unwrap(), + }, + op: Operator::Equal, + value: Value::String("NEEDLE".to_string()), + message: "found".to_string(), + children: vec![child], + level: 0, + strength_modifier: None, + }; + + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&[parent], b"prefix_NEEDLE_after_stuff", &mut context).unwrap(); + assert_eq!(matches.len(), 2); + assert_eq!(matches[1].message, "a after"); +} diff --git a/src/evaluator/strength.rs b/src/evaluator/strength.rs index 1d0e7b4d..5caa3d70 100644 --- a/src/evaluator/strength.rs +++ b/src/evaluator/strength.rs @@ -77,6 +77,20 @@ pub fn calculate_default_strength(rule: &MagicRule) -> i32 { // Add bonus for limited-length strings (more constrained match) if max_length.is_some() { base + 5 } else { base } } + // Regex matches a pattern -- treat similarly to an unbounded string. + // A rule with an explicit `count` is more constrained (narrower scan + // window) and therefore more specific. + TypeKind::Regex { count, .. } => { + if count.is_some() { + 25 + } else { + 20 + } + } + // Search is always a bounded scan (the range is mandatory), so it + // gets the "constrained match" bonus unconditionally. This matches + // the max_length bonus used for String and PString. + TypeKind::Search { .. } => 25, // 64-bit types are most specific among numerics TypeKind::Quad { .. } | TypeKind::Double { .. } | TypeKind::QDate { .. 
} => 16, // 32-bit types are fairly specific diff --git a/src/evaluator/types/mod.rs b/src/evaluator/types/mod.rs index 2f149776..f98c0d31 100644 --- a/src/evaluator/types/mod.rs +++ b/src/evaluator/types/mod.rs @@ -9,6 +9,8 @@ mod date; mod float; mod numeric; +mod regex; +mod search; mod string; use crate::parser::ast::{TypeKind, Value}; @@ -19,6 +21,8 @@ use date::format_timestamp_value; pub use date::{read_date, read_qdate}; pub use float::{read_double, read_float}; pub use numeric::{read_byte, read_long, read_quad, read_short}; +pub use regex::read_regex; +pub use search::read_search; pub use string::{read_pstring, read_string}; /// Reads a fixed-size byte array from the buffer at the given offset. @@ -95,6 +99,18 @@ pub enum TypeReadError { /// Reads bytes according to the specified `TypeKind`. /// +/// This is the public dispatch entry point for type reading for non +/// pattern-bearing types. It preserves the original three-argument +/// signature used by external consumers -- fixed-width numeric, float, +/// date, string, and pstring types need no pattern operand, so the hot +/// path stays ergonomic. +/// +/// For pattern-bearing types (`TypeKind::Regex`, `TypeKind::Search`) this +/// function will return `TypeReadError::UnsupportedType` because the +/// pattern operand is mandatory. Callers that need to evaluate regex/search +/// rules should use [`read_typed_value_with_pattern`] and thread the rule +/// value operand through as `pattern`. 
+/// /// # Examples /// /// ``` @@ -102,7 +118,8 @@ pub enum TypeReadError { /// use libmagic_rs::parser::ast::{Endianness, TypeKind, Value}; /// /// let buffer = &[0x7f, 0x45, 0x4c, 0x46, 0x34, 0x12]; -/// let byte_result = read_typed_value(buffer, 0, &TypeKind::Byte { signed: false }).unwrap(); +/// let byte_result = +/// read_typed_value(buffer, 0, &TypeKind::Byte { signed: false }).unwrap(); /// assert_eq!(byte_result, Value::Uint(0x7f)); /// /// let short_type = TypeKind::Short { @@ -115,12 +132,59 @@ pub enum TypeReadError { /// /// # Errors /// -/// Returns `TypeReadError::BufferOverrun` when the requested value extends past -/// the buffer bounds. +/// Returns `TypeReadError::BufferOverrun` when the requested value extends +/// past the buffer bounds, `TypeReadError::UnsupportedType` when a +/// pattern-bearing type is evaluated without a pattern, or +/// `TypeReadError::InvalidPStringLength` for a malformed Pascal string +/// length prefix. pub fn read_typed_value( buffer: &[u8], offset: usize, type_kind: &TypeKind, +) -> Result<Value, TypeReadError> { + read_typed_value_with_pattern(buffer, offset, type_kind, None) +} + +/// Reads bytes according to the specified `TypeKind`, threading a +/// `pattern` operand through for pattern-bearing types (`Regex`, `Search`). +/// +/// This is the internal dispatch entry point used by the evaluation engine +/// to evaluate pattern-bearing types. The engine threads the rule's value +/// operand through as `pattern` so the regex and search readers can +/// compile/locate it against the buffer. For fixed-width and non-pattern +/// types (numeric, float, date, string, pstring), the `pattern` parameter +/// is ignored; external callers for those types should prefer the simpler +/// three-argument [`read_typed_value`] wrapper.
+/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::evaluator::types::read_typed_value_with_pattern; +/// use libmagic_rs::parser::ast::{RegexFlags, TypeKind, Value}; +/// +/// let haystack = b"abc123def"; +/// let regex_type = TypeKind::Regex { +/// flags: RegexFlags::default(), +/// count: None, +/// }; +/// let pattern = Value::String("[0-9]+".to_string()); +/// let regex_result = +/// read_typed_value_with_pattern(haystack, 0, &regex_type, Some(&pattern)).unwrap(); +/// assert_eq!(regex_result, Value::String("123".to_string())); +/// ``` +/// +/// # Errors +/// +/// Returns `TypeReadError::BufferOverrun` when the requested value extends +/// past the buffer bounds, `TypeReadError::UnsupportedType` when a regex +/// pattern fails to compile or a pattern-bearing type is evaluated without +/// a pattern, or `TypeReadError::InvalidPStringLength` for a malformed +/// Pascal string length prefix. +pub fn read_typed_value_with_pattern( + buffer: &[u8], + offset: usize, + type_kind: &TypeKind, + pattern: Option<&Value>, ) -> Result<Value, TypeReadError> { match type_kind { TypeKind::Byte { signed } => read_byte(buffer, offset, *signed), @@ -143,6 +207,90 @@ pub fn read_typed_value( *length_width, *length_includes_itself, ), + TypeKind::Regex { flags, count } => { + let pattern_str = match pattern { + Some(Value::String(s)) => s.as_str(), + _ => { + return Err(TypeReadError::UnsupportedType { + type_name: "regex without string pattern".to_string(), + }); + } + }; + // Collapse `None` (no match) to `Value::String(String::new())` + // for back-compat with callers using the single-Value return + // shape. The engine path goes through `read_pattern_match` + // directly and preserves the `Option` so it can distinguish a + // zero-width match from a miss. + Ok(read_regex(buffer, offset, pattern_str, *flags, *count)?
+ .unwrap_or_else(|| Value::String(String::new()))) + } + TypeKind::Search { range } => { + let pattern_bytes: &[u8] = match pattern { + Some(Value::String(s)) => s.as_bytes(), + Some(Value::Bytes(b)) => b.as_slice(), + _ => { + return Err(TypeReadError::UnsupportedType { + type_name: "search without string/bytes pattern".to_string(), + }); + } + }; + Ok(read_search(buffer, offset, pattern_bytes, *range)? + .unwrap_or_else(|| Value::String(String::new()))) + } + } +} + +/// Engine entry point for pattern-bearing types (`Regex`, `Search`). +/// +/// Returns `Ok(None)` on a genuine "no match" outcome and `Ok(Some(value))` +/// on a successful match -- including zero-width matches (e.g., regex `^`, +/// `a*`, lookaheads). This is the contract the evaluator needs to +/// distinguish a real miss from a zero-width hit; [`read_typed_value_with_pattern`] +/// collapses both cases to `Value::String(String::new())` for back-compat. +/// +/// # Errors +/// +/// Returns [`TypeReadError`] for: +/// +/// * `BufferOverrun` when `offset >= buffer.len()` +/// * `UnsupportedType` if `type_kind` is not pattern-bearing, if the +/// pattern operand is missing, or if the pattern has the wrong +/// `Value` variant for the type +/// * `UnsupportedType` (via [`read_regex`]) if a regex pattern fails to +/// compile +pub(crate) fn read_pattern_match( + buffer: &[u8], + offset: usize, + type_kind: &TypeKind, + pattern: Option<&Value>, +) -> Result<Option<Value>, TypeReadError> { + match type_kind { + TypeKind::Regex { flags, count } => { + let pattern_str = match pattern { + Some(Value::String(s)) => s.as_str(), + _ => { + return Err(TypeReadError::UnsupportedType { + type_name: "regex without string pattern".to_string(), + }); + } + }; + read_regex(buffer, offset, pattern_str, *flags, *count) + } + TypeKind::Search { range } => { + let pattern_bytes: &[u8] = match pattern { + Some(Value::String(s)) => s.as_bytes(), + Some(Value::Bytes(b)) => b.as_slice(), + _ => { + return
Err(TypeReadError::UnsupportedType { + type_name: "search without string/bytes pattern".to_string(), + }); + } + }; + read_search(buffer, offset, pattern_bytes, *range) + } + _ => Err(TypeReadError::UnsupportedType { + type_name: format!("read_pattern_match called on non-pattern type: {type_kind:?}"), + }), } } @@ -202,7 +350,8 @@ pub fn coerce_value_to_type<'a>(value: &'a Value, type_kind: &TypeKind) -> Cow<' } } -/// Returns the anchor-advance distance for `type_kind` at `offset`. +/// Returns the anchor-advance distance for `type_kind` at `offset`, threading +/// the rule's value operand through for pattern-bearing types. /// /// This value is used by the evaluation engine to advance the GNU `file` /// "previous match" anchor for relative offset resolution. It reflects how @@ -223,6 +372,16 @@ pub fn coerce_value_to_type<'a>(value: &'a Value, type_kind: &TypeKind) -> Cow<' /// it after a successful read, so the defensive paths are belt-and-braces /// for any future caller that breaks that invariant. /// +/// For `TypeKind::Regex`, the pattern is required to re-run the match and +/// compute the consumed bytes. When the pattern is unavailable (or not a +/// string), the function returns `0` -- the anchor will then stay put and +/// the next relative offset resolves against the previous anchor position, +/// which is the same graceful-degradation behavior used by the other +/// defensive paths in this module. For `TypeKind::Search`, the pattern is +/// not needed because the consumed distance is the entire search window +/// regardless of where the match was found. Non-pattern types should pass +/// `pattern: None`. +/// /// # Semantics /// /// - **Fixed-width types** (Byte, Short, Long, Quad, Float, Double, Date, @@ -247,7 +406,12 @@ pub fn coerce_value_to_type<'a>(value: &'a Value, type_kind: &TypeKind) -> Cow<' /// buffer length so a malicious oversized length prefix cannot poison the /// anchor. 
#[must_use] -pub(crate) fn bytes_consumed(buffer: &[u8], offset: usize, type_kind: &TypeKind) -> usize { +pub(crate) fn bytes_consumed_with_pattern( + buffer: &[u8], + offset: usize, + type_kind: &TypeKind, + pattern: Option<&Value>, +) -> usize { if let Some(bits) = type_kind.bit_width() { let width = (bits as usize) / 8; // Bounds-check the fixed-width path so a misuse (offset past end of @@ -274,6 +438,39 @@ pub(crate) fn bytes_consumed(buffer: &[u8], offset: usize, type_kind: &TypeKind) *length_width, *length_includes_itself, ), + TypeKind::Regex { flags, count } => match pattern { + Some(Value::String(s)) => { + regex::regex_bytes_consumed(buffer, offset, s.as_str(), *flags, *count) + } + // Invariant: the engine only calls `bytes_consumed_with_pattern` + // after a successful `read_typed_value_with_pattern`/`read_pattern_match`, + // which requires `Some(Value::String(_))` for regex. If we land + // here the invariant is broken by a new caller and the anchor + // would silently stall instead of advancing. Fire a debug_assert + // so the mismatch is caught in dev/test builds. + other => { + debug_assert!( + false, + "bytes_consumed_with_pattern: TypeKind::Regex without Value::String pattern ({other:?}) -- engine invariant violated" + ); + 0 + } + }, + TypeKind::Search { range } => match pattern { + Some(Value::String(s)) => { + search::search_bytes_consumed(buffer, offset, s.as_bytes(), *range) + } + Some(Value::Bytes(b)) => { + search::search_bytes_consumed(buffer, offset, b.as_slice(), *range) + } + other => { + debug_assert!( + false, + "bytes_consumed_with_pattern: TypeKind::Search without Value::String/Bytes pattern ({other:?}) -- engine invariant violated" + ); + 0 + } + }, // A new variable-width TypeKind variant was added without updating // this match. 
Returning 0 here would silently corrupt the GNU `file` // anchor for any rule using relative offsets after a match of the diff --git a/src/evaluator/types/regex.rs b/src/evaluator/types/regex.rs new file mode 100644 index 00000000..28b7d58e --- /dev/null +++ b/src/evaluator/types/regex.rs @@ -0,0 +1,479 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Regular-expression matching for magic rule evaluation. +//! +//! Implements the `regex` `TypeKind` using `regex::bytes::RegexBuilder` so +//! that matching is binary-safe (patterns are applied to the raw byte +//! buffer, not a UTF-8 string). A successful match returns +//! `Ok(Some(Value::String(...)))` -- the matched bytes with invalid UTF-8 +//! replaced via `from_utf8_lossy`. A miss returns `Ok(None)`. The `Option` +//! is the structured "no match" signal, which lets the engine distinguish +//! a legitimate zero-width match (e.g., `^`, `a*`, lookaheads) from a +//! genuine miss -- both of which would otherwise collapse to +//! `Value::String(String::new())`. +//! +//! ## Semantics (matching GNU `file`) +//! +//! * **Multi-line mode is always on.** GNU `file`'s `alloc_regex` in +//! `src/softmagic.c` compiles every regex with `REG_NEWLINE` +//! unconditionally, so `^` and `$` match at line boundaries and `.` +//! does not match `\n`. The `/l` flag does **not** control this; it +//! controls whether the scan window is measured in bytes or lines. +//! +//! * **Scan window is always capped at [`REGEX_MAX_BYTES`] (8192).** This +//! matches libmagic's `FILE_REGEX_MAX` constant. An explicit `count` +//! larger than 8192 is clamped. An implicit count (no user-supplied +//! value) uses the 8192 default directly. +//! +//! * **Line-based window** (`/l` flag): when `flags.line_based` is set, +//! `count` is a line count. The scan window extends from `offset` +//! through the end of the Nth line terminator, capped at 8192 bytes. +//! 
Libmagic recognizes both `\n` (LF) and `\r\n` (CR+LF) as terminators +//! and counts them as single lines; this implementation uses the same +//! semantics via `memchr::memchr2(b'\n', b'\r', ...)`. +//! +//! * **`/s` flag** (`start_offset`): affects only the anchor advance +//! computed by [`regex_bytes_consumed`]. When set, the anchor moves by +//! `m.start()` (match-start) instead of `m.end()` (match-end), matching +//! libmagic's `REGEX_OFFSET_START` / `moffset()` logic. + +use super::TypeReadError; +use crate::parser::ast::{REGEX_MAX_BYTES, RegexFlags, Value}; +use regex::bytes::{Regex, RegexBuilder}; +use std::num::NonZeroU32; + +/// Compile `pattern` with the magic-rule regex flags applied. +/// +/// Multi-line mode is always enabled (unconditional in libmagic via +/// `REG_NEWLINE`) and `.` does not match newlines. The `case_insensitive` +/// flag is the only compile-time flag the magic-rule interface controls; +/// `line_based` and `start_offset` affect window computation and anchor +/// advance respectively, not regex compilation. +fn build_regex(pattern: &str, case_insensitive: bool) -> Result<Regex, regex::Error> { + RegexBuilder::new(pattern) + .case_insensitive(case_insensitive) + .multi_line(true) + .dot_matches_new_line(false) + .build() +} + +/// Compute the scan window for a regex rule at `offset`, applying the +/// 8192-byte cap and the `/l` line-count semantics when requested. +/// +/// Returns a slice of `buffer` starting at `offset`: +/// +/// * **Byte mode** (`flags.line_based == false`): window length is +/// `min(count.unwrap_or(REGEX_MAX_BYTES), REGEX_MAX_BYTES, remaining)`. +/// +/// * **Line mode** (`flags.line_based == true`): window extends from +/// `offset` through the end of the Nth line terminator (inclusive), +/// where N is `count.unwrap_or(u32::MAX)`. `\r\n` and `\n` both count as +/// one line terminator. If the Nth terminator is not found within +/// `REGEX_MAX_BYTES`, the window is truncated to 8192 bytes.
If `count` +/// is `None` and no terminator is found at all, the window is the whole +/// buffer tail up to the 8192-byte cap. +fn compute_window( + buffer: &[u8], + offset: usize, + flags: RegexFlags, + count: Option<NonZeroU32>, +) -> &[u8] { + let Some(remaining) = buffer.get(offset..) else { + return &[]; + }; + let byte_cap = remaining.len().min(REGEX_MAX_BYTES); + let capped = &remaining[..byte_cap]; + + if !flags.line_based { + let count_bytes = + count.map_or(REGEX_MAX_BYTES, |n| (n.get() as usize).min(REGEX_MAX_BYTES)); + return &capped[..count_bytes.min(capped.len())]; + } + + // Line mode: walk the byte-capped slice counting `\n` (and `\r\n` + // pairs as one terminator), stopping after the Nth terminator. + let target_lines = count.map_or(u32::MAX, NonZeroU32::get); + let mut lines_seen: u32 = 0; + let mut idx = 0usize; + while idx < capped.len() { + match capped[idx] { + b'\r' => { + // Treat CR and CRLF as a single terminator. + let advance = if idx + 1 < capped.len() && capped[idx + 1] == b'\n' { + 2 + } else { + 1 + }; + idx += advance; + lines_seen = lines_seen.saturating_add(1); + } + b'\n' => { + idx += 1; + lines_seen = lines_seen.saturating_add(1); + } + _ => idx += 1, + } + if lines_seen >= target_lines { + break; + } + } + &capped[..idx] +} + +/// Scan `buffer` starting at `offset` for the first match of `pattern`. +/// +/// # Arguments +/// +/// * `buffer` - File buffer to scan +/// * `offset` - Starting position within the buffer +/// * `pattern` - Regex source string (from the rule's `Value::String` +/// operand) +/// * `flags` - Regex modifier flags parsed from the `/[csl]` suffix +/// * `count` - Optional numeric count. Interpretation depends on +/// `flags.line_based`; see [`compute_window`] for the details. +/// +/// # Returns +/// +/// * `Ok(Some(Value::String(matched_text)))` on a successful match -- +/// invalid UTF-8 in the matched bytes is replaced with U+FFFD via +/// `from_utf8_lossy`.
The matched text may legitimately be empty for +/// zero-width matches (e.g., `^`, `a*`, or lookaheads). +/// * `Ok(None)` when the pattern does not match anywhere in the scan +/// window. +/// +/// # Errors +/// +/// * `TypeReadError::BufferOverrun` if `offset >= buffer.len()`. +/// * `TypeReadError::UnsupportedType` if `pattern` fails to compile as a +/// regex (the error variant is reused to avoid adding a new enum +/// variant; the `type_name` field carries the compilation error +/// message). +pub fn read_regex( + buffer: &[u8], + offset: usize, + pattern: &str, + flags: RegexFlags, + count: Option<NonZeroU32>, +) -> Result<Option<Value>, TypeReadError> { + if offset >= buffer.len() { + return Err(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + }); + } + + let regex = build_regex(pattern, flags.case_insensitive).map_err(|e| { + TypeReadError::UnsupportedType { + type_name: format!("regex compile error: {e}"), + } + })?; + + let window = compute_window(buffer, offset, flags, count); + + Ok(regex + .find(window) + .map(|m| Value::String(String::from_utf8_lossy(m.as_bytes()).into_owned()))) +} + +/// Re-run `pattern` against `buffer` at `offset` and return the anchor +/// advance for the first match (number of bytes to add to the GNU `file` +/// previous-match anchor). +/// +/// When `flags.start_offset` is set (the `/s` modifier), the advance is +/// `m.start()` (match-start). Otherwise the advance is `m.end()` +/// (match-end). This matches libmagic's `REGEX_OFFSET_START` / `moffset()` +/// branch in `src/softmagic.c`. +/// +/// Returns `0` on any failure -- offset past buffer end, invalid pattern, +/// or no match. The `debug_assert` guards catch engine-invariant +/// violations (i.e., calls without a preceding successful `read_regex`) in +/// dev/test builds. +/// +/// Note: the regex is compiled twice per successful match -- once in +/// `read_regex` and again here.
Caching the compiled `Regex` would require +/// threading it through `TypeReadError`/`Value` or adding a second return +/// channel, both of which complicate the reader API for a micro- +/// optimization. The duplicated compile is a deliberate simplicity-over- +/// caching trade-off. +#[must_use] +pub(super) fn regex_bytes_consumed( + buffer: &[u8], + offset: usize, + pattern: &str, + flags: RegexFlags, + count: Option<NonZeroU32>, +) -> usize { + if buffer.get(offset..).is_none() { + debug_assert!( + false, + "regex_bytes_consumed: offset {offset} > buffer.len() {} -- engine invariant violated (called without a preceding successful read_regex)", + buffer.len() + ); + return 0; + } + let Ok(regex) = build_regex(pattern, flags.case_insensitive) else { + debug_assert!( + false, + "regex_bytes_consumed: failed to re-compile pattern {pattern:?} -- engine invariant violated (read_regex already succeeded)" + ); + return 0; + }; + let window = compute_window(buffer, offset, flags, count); + regex.find(window).map_or(0, |m| { + if flags.start_offset { + m.start() + } else { + m.end() + } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn no_flags() -> RegexFlags { + RegexFlags::default() + } + + fn flags(case: bool, start: bool, line: bool) -> RegexFlags { + RegexFlags { + case_insensitive: case, + start_offset: start, + line_based: line, + } + } + + #[test] + fn test_read_regex_basic_match() { + let buffer = b"Hello, World!"; + let result = read_regex(buffer, 0, "World", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String("World".to_string()))); + } + + #[test] + fn test_read_regex_no_match_returns_none() { + let buffer = b"Hello, World!"; + let result = read_regex(buffer, 0, "xyz", no_flags(), None).unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_read_regex_case_insensitive() { + let buffer = b"Hello, World!"; + let result = read_regex(buffer, 0, "world", flags(true, false, false), None).unwrap(); + assert_eq!(result,
Some(Value::String("World".to_string()))); + } + + #[test] + fn test_read_regex_case_sensitive_no_match() { + let buffer = b"Hello, World!"; + let result = read_regex(buffer, 0, "world", no_flags(), None).unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_read_regex_multiline_anchor_across_lines() { + // libmagic always compiles regexes with REG_NEWLINE, so `^` and + // `$` match at internal line boundaries regardless of the `/l` + // flag. This test pins the behavior: `^second` on a two-line + // buffer matches the second line even with no flags set. + let buffer = b"first line\nsecond line"; + let result = read_regex(buffer, 0, "^second", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String("second".to_string()))); + } + + #[test] + fn test_read_regex_dot_does_not_match_newline() { + // The REG_NEWLINE flag also makes `.` stop at newlines. A `.+` + // match against a multi-line buffer must not consume the `\n`. + let buffer = b"first\nsecond"; + let result = read_regex(buffer, 0, ".+", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String("first".to_string()))); + } + + #[test] + fn test_read_regex_zero_width_start_anchor_matches() { + // `^` matches zero-width at position 0. Must be reported as + // `Some(Value::String(""))`, not `None`. Regression guard for C3. 
+ let buffer = b"hello"; + let result = read_regex(buffer, 0, "^", no_flags(), None).unwrap(); + assert_eq!( + result, + Some(Value::String(String::new())), + "^ is a legitimate zero-width match, not a miss" + ); + } + + #[test] + fn test_read_regex_zero_width_star_matches_empty() { + let buffer = b"xyz"; + let result = read_regex(buffer, 0, "a*", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String(String::new()))); + } + + #[test] + fn test_read_regex_at_offset() { + let buffer = b"prefix_World!"; + let result = read_regex(buffer, 7, "World", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String("World".to_string()))); + } + + #[test] + fn test_read_regex_offset_past_end() { + let buffer = b"Hello"; + let result = read_regex(buffer, 10, "x", no_flags(), None); + assert!(matches!( + result, + Err(TypeReadError::BufferOverrun { + offset: 10, + buffer_len: 5 + }) + )); + } + + #[test] + fn test_read_regex_invalid_pattern() { + let buffer = b"Hello"; + let result = read_regex(buffer, 0, "[unclosed", no_flags(), None); + assert!(matches!(result, Err(TypeReadError::UnsupportedType { .. }))); + } + + #[test] + fn test_read_regex_binary_safe() { + let buffer = &[0x00, 0xff, 0xfe, 0x41, 0x42, 0x43]; + let result = read_regex(buffer, 0, "ABC", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String("ABC".to_string()))); + } + + #[test] + fn test_read_regex_character_class() { + let buffer = b"abc123def"; + let result = read_regex(buffer, 0, "[0-9]+", no_flags(), None).unwrap(); + assert_eq!(result, Some(Value::String("123".to_string()))); + } + + // ------- V1: line-based window ------- + + #[test] + fn test_read_regex_line_based_one_line_caps_scan() { + // `regex/1l` with a pattern that appears on the second line must + // miss -- the scan window stops after the first newline. 
+ let buffer = b"first line\nsecond line\n"; + let one = NonZeroU32::new(1); + let result = read_regex(buffer, 0, "second", flags(false, false, true), one).unwrap(); + assert_eq!(result, None, "scan should stop after the first line"); + } + + #[test] + fn test_read_regex_line_based_crlf_terminator() { + // CRLF (`\r\n`) counts as a single line terminator, matching + // libmagic's `memchr2('\n', '\r', ...)` logic. + let buffer = b"line1\r\nline2\r\n"; + let one = NonZeroU32::new(1); + let second = read_regex(buffer, 0, "line2", flags(false, false, true), one).unwrap(); + assert_eq!(second, None, "CRLF should end the first line"); + } + + #[test] + fn test_read_regex_line_based_counts_multiple_lines() { + // `regex/3l` scans up to the third line, so a pattern on line 3 + // matches, but a pattern on line 4 misses. + let buffer = b"line1\nline2\nline3\nline4\n"; + let three = NonZeroU32::new(3); + let line3 = read_regex(buffer, 0, "line3", flags(false, false, true), three).unwrap(); + assert_eq!(line3, Some(Value::String("line3".to_string()))); + + let line4 = read_regex(buffer, 0, "line4", flags(false, false, true), three).unwrap(); + assert_eq!(line4, None, "line4 is beyond the 3-line window"); + } + + // ------- V5: 8192-byte default cap ------- + + #[test] + fn test_read_regex_default_window_caps_at_8192_bytes() { + // A buffer larger than 8192 bytes with the pattern past 8192 + // must miss on an un-counted regex, because the scan window is + // capped at 8192 (FILE_REGEX_MAX). + let mut buffer = vec![b'a'; 9000]; + buffer.extend_from_slice(b"needle"); + let result = read_regex(&buffer, 0, "needle", no_flags(), None).unwrap(); + assert_eq!( + result, None, + "needle past byte 9000 must not match under the 8192 default cap" + ); + } + + #[test] + fn test_read_regex_explicit_count_larger_than_cap_still_capped() { + // Even an explicit `regex/100000` is clamped to 8192 bytes -- + // users cannot opt out of the hard cap. 
+ let mut buffer = vec![b'a'; 9000]; + buffer.extend_from_slice(b"needle"); + let hundred_thousand = NonZeroU32::new(100_000); + let result = read_regex(&buffer, 0, "needle", no_flags(), hundred_thousand).unwrap(); + assert_eq!(result, None, "explicit count must still be clamped to 8192"); + } + + #[test] + fn test_read_regex_small_count_honored() { + // A small explicit count (e.g., 10 bytes) must be honored -- a + // pattern past byte 10 misses. + let buffer = b"abcdefghij_needle_here"; + let ten = NonZeroU32::new(10); + let result = read_regex(buffer, 0, "needle", no_flags(), ten).unwrap(); + assert_eq!(result, None); + } + + // ------- regex_bytes_consumed ------- + + #[test] + fn test_regex_bytes_consumed_match_end_by_default() { + let buffer = b"Hello, World!"; + assert_eq!( + regex_bytes_consumed(buffer, 0, "World", no_flags(), None), + 12 + ); + } + + #[test] + fn test_regex_bytes_consumed_no_match() { + let buffer = b"Hello"; + assert_eq!(regex_bytes_consumed(buffer, 0, "xyz", no_flags(), None), 0); + } + + #[test] + fn test_regex_bytes_consumed_zero_width_match_returns_zero() { + let buffer = b"hello"; + assert_eq!(regex_bytes_consumed(buffer, 0, "^", no_flags(), None), 0); + } + + // ------- V2: /s flag (start_offset) ------- + + #[test] + fn test_regex_bytes_consumed_start_offset_returns_match_start() { + // Buffer: "abcWorld", pattern "World" matches at index 3, length + // 5. Without `/s` the anchor advances by 8 (match-end). With `/s` + // it advances by 3 (match-start), matching libmagic's + // REGEX_OFFSET_START / moffset() zero-length path. 
+ let buffer = b"abcWorld"; + let match_end = regex_bytes_consumed(buffer, 0, "World", no_flags(), None); + let match_start = regex_bytes_consumed(buffer, 0, "World", flags(false, true, false), None); + assert_eq!(match_end, 8, "default anchor advance is match-end"); + assert_eq!( + match_start, 3, + "/s flag advances anchor to match-start instead" + ); + } + + #[test] + fn test_regex_bytes_consumed_start_offset_no_match_returns_zero() { + // /s flag on a non-matching pattern still returns 0 (no advance). + let buffer = b"Hello"; + assert_eq!( + regex_bytes_consumed(buffer, 0, "xyz", flags(false, true, false), None), + 0 + ); + } +} diff --git a/src/evaluator/types/search.rs b/src/evaluator/types/search.rs new file mode 100644 index 00000000..070cb496 --- /dev/null +++ b/src/evaluator/types/search.rs @@ -0,0 +1,234 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Bounded literal pattern search for magic rule evaluation. +//! +//! Implements the `search` `TypeKind` as a forward scan for a literal byte +//! pattern within a bounded window. Unlike `TypeKind::String`, which only +//! matches at the exact offset, `search` advances through the buffer looking +//! for the first occurrence of the pattern anywhere in the window. The +//! search window is `buffer[offset..]` capped by the optional `range`. + +use super::TypeReadError; +use crate::parser::ast::Value; +use std::num::NonZeroUsize; + +/// Scan a bounded window of `buffer` for the first occurrence of `pattern`. +/// +/// # Arguments +/// +/// * `buffer` - File buffer to scan +/// * `offset` - Starting position within the buffer +/// * `pattern` - Literal bytes to search for (from the rule's value operand) +/// * `range` - Byte range to scan starting at `offset`. The window is the +/// smaller of `range` and the buffer remainder. Per GNU `file`'s +/// magic(5), the range is mandatory and is therefore a [`NonZeroUsize`] +/// in the type signature. 
+/// +/// # Returns +/// +/// * `Ok(Some(Value::String(pattern_text)))` on a successful match -- the +/// matched text is the literal pattern (search is a locate, not a +/// capture), with invalid UTF-8 replaced via `from_utf8_lossy`. +/// * `Ok(None)` when the pattern is not found in the window. `None` is the +/// structured "no match" signal; callers that need a compatibility +/// `Value::String(String::new())` should convert at the call site. +/// +/// # Errors +/// +/// * `TypeReadError::BufferOverrun` if `offset >= buffer.len()`. +pub fn read_search( + buffer: &[u8], + offset: usize, + pattern: &[u8], + range: NonZeroUsize, +) -> Result<Option<Value>, TypeReadError> { + if offset >= buffer.len() { + return Err(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + }); + } + + let remaining = &buffer[offset..]; + let window_len = range.get().min(remaining.len()); + let window = &remaining[..window_len]; + + match memchr::memmem::find(window, pattern) { + Some(_) => Ok(Some(Value::String( + String::from_utf8_lossy(pattern).into_owned(), + ))), + None => Ok(None), + } +} + +/// Compute the anchor-advance distance for a successful search match. +/// +/// GNU `file` advances its previous-match anchor to the byte just past the +/// matched pattern -- `base_offset + match_index + pattern.len()`, not past +/// the full search window. See `src/softmagic.c` `moffset()` / `FILE_SEARCH` +/// branch (`vlen = m->vallen; o = ms->search.offset + vlen - offset;`) where +/// `ms->search.offset` has already been advanced by `idx` (the match index +/// within the window). +/// +/// This function re-runs the same `memchr::memmem::find` scan as +/// [`read_search`] and returns `match_index + pattern.len()`. On miss or +/// invalid state it returns `0`; the engine only calls it after a successful +/// read so the defensive paths are belt-and-braces.
+/// +/// Note: like [`crate::evaluator::types::regex::regex_bytes_consumed`], this +/// pays the cost of a second scan rather than threading the match position +/// back through the reader API. Caching would require a second return +/// channel that complicates every non-pattern type. +#[must_use] +pub(super) fn search_bytes_consumed( + buffer: &[u8], + offset: usize, + pattern: &[u8], + range: NonZeroUsize, +) -> usize { + let Some(remaining) = buffer.get(offset..) else { + debug_assert!( + false, + "search_bytes_consumed: offset {offset} > buffer.len() {} -- engine invariant violated (called without a preceding successful read_search)", + buffer.len() + ); + return 0; + }; + let window_len = range.get().min(remaining.len()); + let window = &remaining[..window_len]; + memchr::memmem::find(window, pattern).map_or(0, |idx| idx + pattern.len()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn nz(n: usize) -> NonZeroUsize { + NonZeroUsize::new(n).expect("non-zero in test") + } + + #[test] + fn test_read_search_basic_match() { + let buffer = b"Hello, World!"; + let result = read_search(buffer, 0, b"World", nz(100)).unwrap(); + assert_eq!(result, Some(Value::String("World".to_string()))); + } + + #[test] + fn test_read_search_no_match_returns_none() { + let buffer = b"Hello, World!"; + let result = read_search(buffer, 0, b"xyz", nz(100)).unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_read_search_bounded_range_finds() { + let buffer = b"abcdefWorldxyz"; + let result = read_search(buffer, 0, b"World", nz(14)).unwrap(); + assert_eq!(result, Some(Value::String("World".to_string()))); + } + + #[test] + fn test_read_search_bounded_range_too_small() { + let buffer = b"abcdefWorldxyz"; + // Range only covers "abcde" -- World is past the window + let result = read_search(buffer, 0, b"World", nz(5)).unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_read_search_range_larger_than_buffer() { + let buffer = b"Hello"; + let result = 
read_search(buffer, 0, b"lo", nz(1000)).unwrap(); + assert_eq!(result, Some(Value::String("lo".to_string()))); + } + + #[test] + fn test_read_search_at_offset() { + let buffer = b"junk_prefix_World!"; + let result = read_search(buffer, 12, b"World", nz(100)).unwrap(); + assert_eq!(result, Some(Value::String("World".to_string()))); + } + + #[test] + fn test_read_search_offset_past_end() { + let buffer = b"Hello"; + let result = read_search(buffer, 10, b"x", nz(100)); + assert!(matches!( + result, + Err(TypeReadError::BufferOverrun { + offset: 10, + buffer_len: 5 + }) + )); + } + + #[test] + fn test_read_search_binary_pattern() { + let buffer = &[0x00, 0xff, 0xfe, 0xaa, 0xbb, 0xcc]; + let result = read_search(buffer, 0, &[0xaa, 0xbb], nz(100)).unwrap(); + // Invalid UTF-8 gets replaced with U+FFFD, but the match is still Some + match result { + Some(Value::String(s)) => assert!(!s.is_empty()), + other => panic!("Expected Some(Value::String), got {other:?}"), + } + } + + #[test] + fn test_read_search_empty_pattern_matches_at_offset() { + // memmem finds an empty pattern at position 0 in any non-empty + // window. This is a degenerate but well-defined outcome: the + // reader reports a match with an empty matched text. Magic files + // using `search` with an empty pattern are nonsensical; the + // grammar layer should reject them, not the reader. + let buffer = b"Hello"; + let result = read_search(buffer, 0, b"", nz(100)).unwrap(); + assert_eq!(result, Some(Value::String(String::new()))); + } + + #[test] + fn test_read_search_multi_char_pattern() { + let buffer = b"The quick brown fox jumps over the lazy dog"; + let result = read_search(buffer, 0, b"brown", nz(50)).unwrap(); + assert_eq!(result, Some(Value::String("brown".to_string()))); + } + + #[test] + fn test_search_bytes_consumed_matches_match_end_not_window_end() { + // GNU `file` advances the anchor past the matched pattern, not + // past the full search window. 
Regression guard for the pre-fix + // behavior which returned the whole window size. + let buffer = b"abcWorldxyz___more_data"; + // Window size 10 (`abcWorldxy`), pattern "World" at index 3, + // length 5, so match-end = 3 + 5 = 8. + assert_eq!( + search_bytes_consumed(buffer, 0, b"World", nz(10)), + 8, + "expected match-end (8), not window-end (10)" + ); + } + + #[test] + fn test_search_bytes_consumed_no_match_returns_zero() { + let buffer = b"abcdefghij"; + assert_eq!(search_bytes_consumed(buffer, 0, b"XYZ", nz(10)), 0); + } + + #[test] + fn test_search_bytes_consumed_range_caps_match() { + // Match exists past the window; bytes_consumed reports 0 because + // the scan only sees the window. + let buffer = b"abcdefWorldxyz"; + // Range 5 means window is "abcde" -- no "World" inside it. + assert_eq!(search_bytes_consumed(buffer, 0, b"World", nz(5)), 0); + } + + #[test] + fn test_search_bytes_consumed_match_at_window_end() { + // Pattern lands exactly at the window boundary: window is 8 + // bytes, pattern "def" occupies indices 3..6, match-end = 6. + let buffer = b"abcdefgh_ignored"; + assert_eq!(search_bytes_consumed(buffer, 0, b"def", nz(8)), 6); + } +} diff --git a/src/evaluator/types/tests.rs b/src/evaluator/types/tests.rs index 5e9fa908..9c33cf30 100644 --- a/src/evaluator/types/tests.rs +++ b/src/evaluator/types/tests.rs @@ -801,7 +801,7 @@ fn test_bytes_consumed_fixed_width_types() { ]; for (typ, expected) in cases { - let consumed = bytes_consumed(buf, 0, typ); + let consumed = bytes_consumed_with_pattern(buf, 0, typ, None); assert_eq!( consumed, *expected, "fixed-width width mismatch for {typ:?}" @@ -814,7 +814,7 @@ fn test_bytes_consumed_string_with_nul() { // "MZ\0" -> matches "MZ" and consumes 3 bytes (2 + NUL). 
let buf = b"MZ\x00rest"; let typ = TypeKind::String { max_length: None }; - assert_eq!(bytes_consumed(buf, 0, &typ), 3); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 3); } #[test] @@ -822,7 +822,7 @@ fn test_bytes_consumed_string_at_offset() { // String starting mid-buffer. let buf = b"PREFIXabc\x00tail"; let typ = TypeKind::String { max_length: None }; - assert_eq!(bytes_consumed(buf, 6, &typ), 4); // "abc" + NUL + assert_eq!(bytes_consumed_with_pattern(buf, 6, &typ, None), 4); // "abc" + NUL } #[test] @@ -830,7 +830,7 @@ fn test_bytes_consumed_string_no_nul_in_buffer() { // No NUL terminator -- consumes to end of buffer (no extra byte for NUL). let buf = b"NoNull"; let typ = TypeKind::String { max_length: None }; - assert_eq!(bytes_consumed(buf, 0, &typ), 6); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 6); } #[test] @@ -838,7 +838,7 @@ fn test_bytes_consumed_string_empty() { // Empty string at offset 0 -- just the NUL. let buf = b"\x00rest"; let typ = TypeKind::String { max_length: None }; - assert_eq!(bytes_consumed(buf, 0, &typ), 1); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 1); } #[test] @@ -848,7 +848,7 @@ fn test_bytes_consumed_string_max_length_caps() { let typ = TypeKind::String { max_length: Some(4), }; - assert_eq!(bytes_consumed(buf, 0, &typ), 4); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 4); } #[test] @@ -858,7 +858,7 @@ fn test_bytes_consumed_string_max_length_finds_nul() { let typ = TypeKind::String { max_length: Some(10), }; - assert_eq!(bytes_consumed(buf, 0, &typ), 6); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 6); } #[test] @@ -870,7 +870,7 @@ fn test_bytes_consumed_pstring_one_byte() { length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 6); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 6); } #[test] @@ -882,7 +882,7 @@ fn test_bytes_consumed_pstring_two_byte_be() { 
length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: false, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 7); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 7); } #[test] @@ -893,7 +893,7 @@ fn test_bytes_consumed_pstring_two_byte_le() { length_width: PStringLengthWidth::TwoByteLE, length_includes_itself: false, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 7); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 7); } #[test] @@ -904,7 +904,7 @@ fn test_bytes_consumed_pstring_four_byte_be() { length_width: PStringLengthWidth::FourByteBE, length_includes_itself: false, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 5); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 5); } #[test] @@ -916,7 +916,7 @@ fn test_bytes_consumed_pstring_j_flag() { length_width: PStringLengthWidth::OneByte, length_includes_itself: true, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 4); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 4); } #[test] @@ -928,7 +928,7 @@ fn test_bytes_consumed_pstring_empty() { length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 1); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 1); } #[test] @@ -940,7 +940,7 @@ fn test_bytes_consumed_pstring_max_length_caps() { length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 6); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 6); } #[test] @@ -952,7 +952,7 @@ fn test_bytes_consumed_pstring_j_flag_underflow_multi_byte() { length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: true, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 0); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 0); // /J with FourByteLE: stored length 3, prefix width 4 -> underflow -> 0. 
let buf = b"\x03\x00\x00\x00xx"; @@ -961,7 +961,7 @@ fn test_bytes_consumed_pstring_j_flag_underflow_multi_byte() { length_width: PStringLengthWidth::FourByteLE, length_includes_itself: true, }; - assert_eq!(bytes_consumed(buf, 0, &typ), 0); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 0); } #[test] @@ -976,7 +976,7 @@ fn test_bytes_consumed_pstring_clamps_oversized_prefix_be() { length_includes_itself: false, }; // 4 (prefix) + min(0xFFFFFFFF, 3) = 4 + 3 = 7 - assert_eq!(bytes_consumed(buf, 0, &typ), 7); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 7); } #[test] @@ -988,7 +988,7 @@ fn test_bytes_consumed_pstring_clamps_oversized_prefix_le() { length_includes_itself: false, }; // 4 + min(0xFFFFFFFF, 5) = 9 - assert_eq!(bytes_consumed(buf, 0, &typ), 9); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, None), 9); } #[test] @@ -998,7 +998,110 @@ fn test_bytes_consumed_string_at_past_end_returns_zero() { // a successful read, but the path is exercised here for the contract. let buf = b"abc"; let typ = TypeKind::String { max_length: None }; - assert_eq!(bytes_consumed(buf, 10, &typ), 0); + assert_eq!(bytes_consumed_with_pattern(buf, 10, &typ, None), 0); +} + +#[test] +fn test_bytes_consumed_regex_with_string_pattern() { + // Regression guard for GOTCHAS 2.1: variable-width variants must be + // matched explicitly in `bytes_consumed_with_pattern` or relative + // offsets silently corrupt. This test exercises the dispatch path + // and verifies the match-end byte count matches the reader's view. + let buf = b"prefix_World_suffix"; + let typ = TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }; + let pattern = Value::String("World".to_string()); + // "World" starts at index 7 in the buffer, length 5, so a scan from + // offset 0 consumes 7+5=12 bytes. 
+ assert_eq!( + bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), + 12 + ); +} + +#[test] +fn test_bytes_consumed_regex_no_match_returns_zero() { + let buf = b"abcdef"; + let typ = TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }; + let pattern = Value::String("xyz".to_string()); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), 0); +} + +#[test] +fn test_bytes_consumed_regex_zero_width_match_returns_zero() { + // Zero-width match at position 0 means match_end=0 so the anchor + // stays put. Cross-check with the direct reader in regex.rs. + let buf = b"hello"; + let typ = TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }; + let pattern = Value::String("^".to_string()); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), 0); +} + +#[test] +fn test_bytes_consumed_regex_start_offset_flag_uses_match_start() { + // /s flag changes the anchor advance to match-start instead of + // match-end. Regression guard for V2. + let buf = b"prefix_World_suffix"; + let typ = TypeKind::Regex { + flags: crate::parser::ast::RegexFlags { + start_offset: true, + ..crate::parser::ast::RegexFlags::default() + }, + count: None, + }; + let pattern = Value::String("World".to_string()); + // Match-start for "World" at index 7 is 7, not 12. + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), 7); +} + +#[test] +fn test_bytes_consumed_search_with_pattern_is_match_end() { + // Regression guard for the pre-fix behavior that returned the + // entire window size instead of match-end. Per GNU `file` softmagic.c + // FILE_SEARCH, the anchor advances to `base + match_idx + pattern.len()`. + let buf = b"abcWorld_xyz"; + let typ = TypeKind::Search { + range: ::std::num::NonZeroUsize::new(10).unwrap(), + }; + let pattern = Value::String("World".to_string()); + // "World" is at index 3, length 5, match-end = 8. 
+ assert_eq!( + bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), + 8, + "expected match-end (8), not window-end (10)" + ); +} + +#[test] +fn test_bytes_consumed_search_no_match_returns_zero() { + let buf = b"abcdefghij"; + let typ = TypeKind::Search { + range: ::std::num::NonZeroUsize::new(10).unwrap(), + }; + let pattern = Value::String("XYZ".to_string()); + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), 0); +} + +#[test] +fn test_bytes_consumed_search_bytes_pattern_works() { + // Value::Bytes is an alternative pattern shape for search -- verify + // the dispatch path accepts it and computes the same match-end as a + // Value::String pattern would. + let buf = &[0x00, 0xff, 0xde, 0xad, 0xbe, 0xef, 0x11]; + let typ = TypeKind::Search { + range: ::std::num::NonZeroUsize::new(7).unwrap(), + }; + let pattern = Value::Bytes(vec![0xde, 0xad, 0xbe, 0xef]); + // 0xde at index 2, length 4, match-end = 6. + assert_eq!(bytes_consumed_with_pattern(buf, 0, &typ, Some(&pattern)), 6); } #[test] @@ -1010,11 +1113,11 @@ fn test_bytes_consumed_fixed_width_returns_zero_past_end() { let buf = b"abc"; let typ = TypeKind::Byte { signed: false }; // offset == buf.len() leaves no room for a 1-byte read. - assert_eq!(bytes_consumed(buf, 3, &typ), 0); + assert_eq!(bytes_consumed_with_pattern(buf, 3, &typ, None), 0); // Way past end. - assert_eq!(bytes_consumed(buf, 100, &typ), 0); + assert_eq!(bytes_consumed_with_pattern(buf, 100, &typ, None), 0); // Last valid index: 1-byte read fits. - assert_eq!(bytes_consumed(buf, 2, &typ), 1); + assert_eq!(bytes_consumed_with_pattern(buf, 2, &typ, None), 1); // Multi-byte fixed-width type at the boundary. 
let typ_long = TypeKind::Long { @@ -1023,9 +1126,12 @@ fn test_bytes_consumed_fixed_width_returns_zero_past_end() { }; let buf4 = b"abcd"; // offset 0 + width 4 == buf.len() -> fits - assert_eq!(bytes_consumed(buf4, 0, &typ_long), 4); + assert_eq!(bytes_consumed_with_pattern(buf4, 0, &typ_long, None), 4); // offset 1 + width 4 == 5 > buf.len() -> 0 - assert_eq!(bytes_consumed(buf4, 1, &typ_long), 0); + assert_eq!(bytes_consumed_with_pattern(buf4, 1, &typ_long, None), 0); // overflow: offset = usize::MAX, width = 4 -> checked_add returns None -> 0 - assert_eq!(bytes_consumed(buf4, usize::MAX, &typ_long), 0); + assert_eq!( + bytes_consumed_with_pattern(buf4, usize::MAX, &typ_long, None), + 0 + ); } diff --git a/src/parser/ast.rs b/src/parser/ast.rs index dc6e7bd4..bcef818e 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -7,6 +7,7 @@ //! and their components, including offset specifications, type kinds, operators, and values. use serde::{Deserialize, Serialize}; +use std::num::{NonZeroU32, NonZeroUsize}; /// The width of the length prefix for Pascal strings. /// @@ -335,8 +336,141 @@ pub enum TypeKind { /// Whether the stored length includes the length field itself (`/J` flag) length_includes_itself: bool, }, + /// Regular expression matching against file contents + /// + /// Regex rules match a POSIX-extended regular expression pattern against the + /// file buffer. Patterns are compiled with multi-line mode always enabled + /// (matching libmagic's unconditional `REG_NEWLINE`), so `^` and `$` match + /// at line boundaries and `.` does not match `\n`. The `flags` control + /// case sensitivity, anchor advance semantics, and whether `count` is + /// measured in bytes or lines. The scan window is always capped at + /// [`REGEX_MAX_BYTES`] (8192) regardless of `count`. 
+ /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::{TypeKind, RegexFlags}; + /// use std::num::NonZeroU32; + /// + /// // Plain `regex` -- no flags, default 8192-byte scan window. + /// let plain = TypeKind::Regex { + /// flags: RegexFlags::default(), + /// count: None, + /// }; + /// + /// // `regex/1l` -- scan the first line only (1 line, capped at 8192 bytes). + /// let first_line = TypeKind::Regex { + /// flags: RegexFlags { + /// line_based: true, + /// ..RegexFlags::default() + /// }, + /// count: NonZeroU32::new(1), + /// }; + /// + /// // `regex/cs` -- case-insensitive, anchor advances to match-start. + /// let case_insensitive_start = TypeKind::Regex { + /// flags: RegexFlags { + /// case_insensitive: true, + /// start_offset: true, + /// line_based: false, + /// }, + /// count: None, + /// }; + /// ``` + Regex { + /// Modifier flags from the `/[csl]` suffix. + flags: RegexFlags, + /// Optional numeric count from `regex/N[flags]`. Interpretation + /// depends on `flags.line_based`: + /// + /// * `None`: use the 8192-byte default scan window. + /// * `Some(n)` with `flags.line_based == false`: scan at most `n` + /// bytes, capped at 8192. + /// * `Some(n)` with `flags.line_based == true`: scan at most `n` + /// lines, with an effective byte cap of `min(n * 80, 8192)`. + /// + /// The 8192-byte hard cap matches GNU `file`'s `FILE_REGEX_MAX` and + /// prevents runaway regex scans against large buffers. + count: Option, + }, + /// Multi-byte pattern search within a bounded range + /// + /// Search rules look for a literal byte pattern within `range` bytes of + /// the offset. Unlike [`TypeKind::String`], which only matches at the + /// exact offset, `search` scans forward up to `range` bytes for the + /// first occurrence. The range is **mandatory** per GNU `file`'s + /// magic(5) specification and is stored as a [`NonZeroUsize`] so a + /// zero-range search is unrepresentable. 
+ /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::TypeKind; + /// use std::num::NonZeroUsize; + /// + /// // `search/256` -- scan up to 256 bytes for the literal pattern. + /// let bounded = TypeKind::Search { + /// range: NonZeroUsize::new(256).unwrap(), + /// }; + /// ``` + Search { + /// Scan window width in bytes, starting at the rule's offset. + range: NonZeroUsize, + }, } +/// Regex modifier flags parsed from the `/[csl]` suffix on a `regex` rule. +/// +/// All flags default to `false` via [`RegexFlags::default`]. The `Default` +/// impl is equivalent to a plain `regex` type with no suffix, which scans +/// 8192 bytes in byte mode and advances the anchor to match-end. +/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::parser::ast::RegexFlags; +/// +/// let plain = RegexFlags::default(); +/// assert!(!plain.case_insensitive); +/// assert!(!plain.start_offset); +/// assert!(!plain.line_based); +/// +/// let case_and_line = RegexFlags { +/// case_insensitive: true, +/// start_offset: false, +/// line_based: true, +/// }; +/// assert!(case_and_line.case_insensitive); +/// assert!(case_and_line.line_based); +/// ``` +#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct RegexFlags { + /// `/c` -- case-insensitive matching. When `true`, ASCII letter + /// casing is ignored during pattern matching. + pub case_insensitive: bool, + /// `/s` -- advance the GNU `file` previous-match anchor to the start + /// of the matched region instead of its end. Matches libmagic's + /// `REGEX_OFFSET_START` flag, which zeros the length contribution in + /// `moffset()` for `FILE_REGEX`. Useful for chaining child rules that + /// need to re-match from the position where the parent regex began. + pub start_offset: bool, + /// `/l` -- measure the scan window in lines instead of bytes. When + /// `true`, `count` is interpreted as a line count rather than a byte + /// count. 
The effective byte window is still capped at 8192 bytes + /// regardless (see [`TypeKind::Regex::count`] for the details). + /// + /// Note: this flag does **not** control multi-line regex matching; + /// libmagic always compiles patterns with `REG_NEWLINE`, so `^`/`$` + /// match at line boundaries regardless of `/l`. + pub line_based: bool, +} + +/// The hard upper bound on regex scan window size, matching GNU `file`'s +/// `FILE_REGEX_MAX` constant in `src/file.h`. Any regex rule -- including +/// ones with explicit counts larger than this -- is capped at this many +/// bytes to prevent runaway scans against large buffers. +pub const REGEX_MAX_BYTES: usize = 8192; + impl TypeKind { /// Returns the bit width of integer types, or `None` for non-integer types (e.g., String). /// @@ -360,7 +494,10 @@ impl TypeKind { Self::Short { .. } => Some(16), Self::Long { .. } | Self::Float { .. } | Self::Date { .. } => Some(32), Self::Quad { .. } | Self::Double { .. } | Self::QDate { .. } => Some(64), - Self::String { .. } | Self::PString { .. } => None, + Self::String { .. } + | Self::PString { .. } + | Self::Regex { .. } + | Self::Search { .. 
} => None, } } } diff --git a/src/parser/codegen.rs b/src/parser/codegen.rs index bf36be8f..b826f90c 100644 --- a/src/parser/codegen.rs +++ b/src/parser/codegen.rs @@ -232,6 +232,25 @@ pub fn serialize_type_kind(typ: &TypeKind) -> String { length_includes_itself ), }, + TypeKind::Regex { flags, count } => { + let count_lit = match count { + Some(n) => format!("::std::num::NonZeroU32::new({}).unwrap()", n.get()), + None => String::new(), + }; + let count_expr = if count.is_some() { + format!("Some({count_lit})") + } else { + "None".to_string() + }; + format!( + "TypeKind::Regex {{ flags: libmagic_rs::parser::ast::RegexFlags {{ case_insensitive: {}, start_offset: {}, line_based: {} }}, count: {count_expr} }}", + flags.case_insensitive, flags.start_offset, flags.line_based + ) + } + TypeKind::Search { range } => format!( + "TypeKind::Search {{ range: ::std::num::NonZeroUsize::new({}).unwrap() }}", + range.get() + ), } } diff --git a/src/parser/grammar/mod.rs b/src/parser/grammar/mod.rs index 59e1316b..50c76917 100644 --- a/src/parser/grammar/mod.rs +++ b/src/parser/grammar/mod.rs @@ -391,6 +391,7 @@ fn parse_pstring_suffix( /// /// # Errors /// Returns a nom parsing error if the input doesn't match the expected format +#[allow(clippy::too_many_lines)] pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option)> { use crate::parser::ast::PStringLengthWidth; @@ -410,6 +411,100 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option = None; + if type_name == "regex" + && let Some(suffix_rest) = input.strip_prefix('/') + { + let mut rest = suffix_rest; + let mut any_modifier = false; + + // Scan modifier sequence. Stop at whitespace or at operator + // boundary characters (`=`, `!`, `<`, `>`, `&`, `^`, `~`, `x`) so + // forms like `regex/c=...` or `regex/l!=...` leave the operator + // for `parse_operator` to handle. 
+ loop { + if let Some(next) = rest.strip_prefix('c') { + regex_flags.case_insensitive = true; + rest = next; + any_modifier = true; + } else if let Some(next) = rest.strip_prefix('s') { + regex_flags.start_offset = true; + rest = next; + any_modifier = true; + } else if let Some(next) = rest.strip_prefix('l') { + regex_flags.line_based = true; + rest = next; + any_modifier = true; + } else if rest.starts_with(|c: char| c.is_ascii_digit()) { + let (after_number, n) = parse_decimal_number(rest).map_err(|_| { + nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit)) + })?; + // `0` is a valid sentinel in libmagic (means "unset"), but + // with a dedicated 8192-byte default we don't need a + // sentinel. Reject 0 explicitly so callers get a clear + // parse error instead of a silently-dropped count. + let count_value = u32::try_from(n) + .ok() + .and_then(::std::num::NonZeroU32::new) + .ok_or_else(|| { + nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit)) + })?; + regex_count = Some(count_value.get()); + rest = after_number; + any_modifier = true; + } else { + match rest.chars().next() { + Some(c) if c.is_whitespace() => break, + None | Some('=' | '!' | '<' | '>' | '&' | '^' | '~' | 'x') => break, + Some(_) => { + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::Tag, + ))); + } + } + } + } + + // A bare `regex/` with no valid modifier is a parse error. + if !any_modifier { + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::Tag, + ))); + } + + input = rest; + } + + // Handle search suffix: required decimal range (e.g., `search/256`). + // Per GNU `file` magic(5), the range is mandatory. `search/0` and + // bare `search` are rejected at parse time via `NonZeroUsize`. 
+ let mut search_range: Option<::std::num::NonZeroUsize> = None; + if type_name == "search" + && let Some(suffix_rest) = input.strip_prefix('/') + { + let (rest, n) = parse_decimal_number(suffix_rest).map_err(|_| { + nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit)) + })?; + let range_value = usize::try_from(n) + .ok() + .and_then(::std::num::NonZeroUsize::new) + .ok_or_else(|| { + nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit)) + })?; + search_range = Some(range_value); + input = rest; + } + // Check for attached operator with mask (like &0xf0000000) // Uses unsigned parsing so full u64 masks (e.g. 0xffffffffffffffff) are supported. // If '&' is followed by digits/0x but the mask parse fails (overflow, etc.), @@ -437,15 +532,35 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option TypeKind::Regex { + flags: regex_flags, + count: regex_count.and_then(::std::num::NonZeroU32::new), + }, + "search" => { + // Mandatory range: reject bare `search` at parse time. + let range = search_range.ok_or_else(|| { + nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Tag)) + })?; + TypeKind::Search { range } + } + _ => { + let mut kind = crate::parser::types::type_keyword_to_kind(type_name); + if let TypeKind::PString { max_length, .. 
} = kind { + kind = TypeKind::PString { + max_length, + length_width: pstring_length_width, + length_includes_itself: pstring_length_includes_itself, + }; + } + kind + } + }; Ok((input, (type_kind, attached_op))) } diff --git a/src/parser/grammar/tests/mod.rs b/src/parser/grammar/tests/mod.rs index c6a69589..dd9551ef 100644 --- a/src/parser/grammar/tests/mod.rs +++ b/src/parser/grammar/tests/mod.rs @@ -2275,3 +2275,172 @@ fn test_parse_type_and_operator_pstring_suffixes() { } } } + +#[test] +fn test_parse_type_and_operator_regex_and_search_suffixes() { + use crate::parser::ast::{RegexFlags, TypeKind}; + use std::num::{NonZeroU32, NonZeroUsize}; + + fn rx(case: bool, start: bool, line: bool, count: Option) -> TypeKind { + TypeKind::Regex { + flags: RegexFlags { + case_insensitive: case, + start_offset: start, + line_based: line, + }, + count: count.and_then(NonZeroU32::new), + } + } + fn sr(n: usize) -> TypeKind { + TypeKind::Search { + range: NonZeroUsize::new(n).unwrap(), + } + } + + let cases: &[(&str, TypeKind, &str)] = &[ + ("regex", rx(false, false, false, None), ""), + ("regex/c", rx(true, false, false, None), ""), + ("regex/l", rx(false, false, true, None), ""), + ("regex/s", rx(false, true, false, None), ""), + ("regex/cl", rx(true, false, true, None), ""), + ("regex/lc", rx(true, false, true, None), ""), + ("regex/cs", rx(true, true, false, None), ""), + ("regex/csl", rx(true, true, true, None), ""), + ("regex/1l", rx(false, false, true, Some(1)), ""), + ("regex/l1", rx(false, false, true, Some(1)), ""), + ("regex/1c", rx(true, false, false, Some(1)), ""), + ("regex/256", rx(false, false, false, Some(256)), ""), + ("regex/c =", rx(true, false, false, None), "="), + ("search/256", sr(256), ""), + ("search/1", sr(1), ""), + ("search/256 =", sr(256), "="), + ]; + for &(input, ref expected_kind, expected_rest) in cases { + let (rest, (kind, op)) = parse_type_and_operator(input).expect(input); + assert_eq!(rest, expected_rest, "rest for input: {input}"); + 
assert!(op.is_none(), "operator for input: {input}"); + assert_eq!(&kind, expected_kind, "kind for input: {input}"); + } +} + +#[test] +fn test_parse_type_and_operator_search_requires_range() { + // Bare `search` (no /N suffix) is a hard parse error per GNU `file`. + assert!(parse_type_and_operator("search").is_err()); + // `search/0` is also rejected -- `NonZeroUsize` makes a zero-width + // scan unrepresentable. + assert!(parse_type_and_operator("search/0").is_err()); +} + +#[test] +fn test_parse_type_and_operator_regex_invalid_suffix() { + // Bare slash with no flags or count + assert!(parse_type_and_operator("regex/").is_err()); + // Unrecognized flag letter + assert!(parse_type_and_operator("regex/z").is_err()); + // Non-operator trailing character is still rejected + assert!(parse_type_and_operator("regex/cz").is_err()); + // regex/0 is rejected because a zero count has no valid semantics + // (our parser uses NonZeroU32 to express "user specified a count"). + assert!(parse_type_and_operator("regex/0").is_err()); +} + +#[test] +fn test_parse_type_and_operator_regex_operator_adjacent() { + use crate::parser::ast::{Operator, RegexFlags, TypeKind}; + + // `regex/c=` should leave `=` for parse_operator, matching the `regex/c =` + // (space-separated) behavior and mirroring `search/256=`. + let (rest, (kind, op)) = parse_type_and_operator("regex/c=").expect("regex/c="); + assert_eq!(rest, "="); + assert!(op.is_none()); + assert_eq!( + kind, + TypeKind::Regex { + flags: RegexFlags { + case_insensitive: true, + ..RegexFlags::default() + }, + count: None, + } + ); + + // `regex/l!=` should leave `!=` for parse_operator. 
+ let (rest, (kind, op)) = parse_type_and_operator("regex/l!=").expect("regex/l!="); + assert_eq!(rest, "!="); + assert!(op.is_none()); + assert_eq!( + kind, + TypeKind::Regex { + flags: RegexFlags { + line_based: true, + ..RegexFlags::default() + }, + count: None, + } + ); + + // Confirm the full pipeline parses the operator correctly through + // parse_type_and_operator + parse_operator chaining. + let (rest, (_, _)) = parse_type_and_operator("regex/c=foo").expect("regex/c=foo"); + let (rest_after_op, op) = crate::parser::grammar::parse_operator(rest).expect("operator"); + assert_eq!(op, Operator::Equal); + assert_eq!(rest_after_op, "foo"); +} + +#[test] +fn test_parse_magic_rule_regex_and_search() { + use crate::parser::ast::RegexFlags; + use std::num::{NonZeroU32, NonZeroUsize}; + + // regex/c: case-insensitive flag + let input = r#"0 regex/c "hello" case-insensitive match"#; + let (remaining, rule) = parse_magic_rule(input).unwrap(); + assert_eq!(remaining, ""); + assert_eq!(rule.offset, OffsetSpec::Absolute(0)); + assert_eq!( + rule.typ, + TypeKind::Regex { + flags: RegexFlags { + case_insensitive: true, + ..RegexFlags::default() + }, + count: None, + } + ); + assert_eq!(rule.op, Operator::Equal); + assert_eq!(rule.value, Value::String("hello".to_string())); + assert_eq!(rule.message, "case-insensitive match"); + + // search/256 + let input = r#"0 search/256 "MZ" DOS executable"#; + let (remaining, rule) = parse_magic_rule(input).unwrap(); + assert_eq!(remaining, ""); + assert_eq!( + rule.typ, + TypeKind::Search { + range: NonZeroUsize::new(256).unwrap(), + } + ); + assert_eq!(rule.op, Operator::Equal); + assert_eq!(rule.value, Value::String("MZ".to_string())); + assert_eq!(rule.message, "DOS executable"); + + // regex/1l: line-based with a count of 1 (mirrors regex-eol.magic + // syntax). The count is now preserved, not discarded. 
+ let input = r#">1 regex/1l "[0-9]+" version line"#; + let (remaining, rule) = parse_magic_rule(input).unwrap(); + assert_eq!(remaining, ""); + assert_eq!(rule.level, 1); + assert_eq!( + rule.typ, + TypeKind::Regex { + flags: RegexFlags { + line_based: true, + ..RegexFlags::default() + }, + count: NonZeroU32::new(1), + } + ); + assert_eq!(rule.message, "version line"); +} diff --git a/src/parser/types.rs b/src/parser/types.rs index fd3b64ab..a9b4538b 100644 --- a/src/parser/types.rs +++ b/src/parser/types.rs @@ -95,8 +95,8 @@ pub fn parse_type_keyword(input: &str) -> IResult<&str, &str> { tag("ledate"), tag("date"), )), - // String types - alt((tag("pstring"), tag("string"))), + // String types (and regex/search, which share the string-type family) + alt((tag("pstring"), tag("search"), tag("regex"), tag("string"))), )) .parse(input) } @@ -301,6 +301,25 @@ pub fn type_keyword_to_kind(type_name: &str) -> TypeKind { length_includes_itself: false, }, + // REGEX type -- suffix parsing (flags and count) handled in + // `parse_type_and_operator` in grammar/mod.rs, which constructs + // the final `TypeKind::Regex` directly. The value returned here + // is a bare-`regex` placeholder used only by the round-trip + // keyword test; grammar never observes it. + "regex" => TypeKind::Regex { + flags: crate::parser::ast::RegexFlags::default(), + count: None, + }, + + // SEARCH type -- range parsing handled in grammar/mod.rs, which + // constructs the final `TypeKind::Search` directly from the + // mandatory `/N` suffix. The value returned here is a placeholder + // with `range = 1` used only by the round-trip keyword test; a + // real search rule always has its range set by the grammar layer. 
+ "search" => TypeKind::Search { + range: ::std::num::NonZeroUsize::new(1).expect("1 is nonzero"), + }, + _ => unreachable!("type_keyword_to_kind called with unknown type: {type_name}"), } } @@ -546,7 +565,8 @@ mod tests { "long", "ulong", "lelong", "ulelong", "belong", "ubelong", "quad", "uquad", "lequad", "ulequad", "bequad", "ubequad", "float", "befloat", "lefloat", "double", "bedouble", "ledouble", "date", "ldate", "bedate", "beldate", "ledate", "leldate", "qdate", - "qldate", "beqdate", "beqldate", "leqdate", "leqldate", "pstring", "string", + "qldate", "beqdate", "beqldate", "leqdate", "leqldate", "pstring", "string", "regex", + "search", ]; for keyword in keywords { let (rest, parsed) = parse_type_keyword(keyword).unwrap(); diff --git a/tests/evaluator_tests.rs b/tests/evaluator_tests.rs index 649373de..2b3a21aa 100644 --- a/tests/evaluator_tests.rs +++ b/tests/evaluator_tests.rs @@ -495,3 +495,167 @@ fn test_evaluate_float_rule_no_match() { "Float equal rule should not match when value differs" ); } + +// ============================================================ +// Third-Party Corpus: regex-eol +// ============================================================ + +/// Integration test for the `regex-eol.magic` corpus test from the upstream +/// `file` project. The magic file itself uses two syntaxes that the text +/// parser does not yet accept -- a bare unquoted `$ANSIBLE_VAULT` string +/// value (see GOTCHAS S3.6) and `>&1` / `>>&1` relative-offset anchors (the +/// `&+N`/`&-N` parsing TODO in AGENTS.md) -- so this test temporarily +/// bypasses `MagicDatabase::load_from_file` and constructs the equivalent +/// rule tree programmatically. The testfile fixture at +/// `third_party/tests/regex-eol.testfile` is still read verbatim, so the +/// runtime evaluation path (string match, `regex/1l` line-anchored matching, +/// and `OffsetSpec::Relative` anchor advancement through +/// `EvaluationContext::last_match_end`) is exercised end-to-end. 
+/// +/// Once the parser learns unquoted string values and `&+N` relative offsets, +/// this test should be rewritten to call `MagicDatabase::load_from_file` +/// against the unmodified `regex-eol.magic` corpus file. +#[test] +fn test_regex_eol_corpus() { + let buffer = std::fs::read("third_party/tests/regex-eol.testfile") + .expect("failed to read regex-eol.testfile"); + + // Mirror of: + // 0 string $ANSIBLE_VAULT Ansible Vault text + // >&1 regex/1l [0-9]+(\.[0-9]+)+ \b, version %s + // >>&1 regex/1l [^;]+$ \b, using %s encryption + // + // Messages hardcode the captured tokens that libmagic's `%s` formatter + // would substitute (libmagic-rs does not yet implement format + // substitution), so the final description contains the literal + // `version`, `1.1`, and `AES256` strings. The match-value assertions + // below separately verify the regex engine actually captured those + // tokens from the buffer, so the test still fails if the regex + // behavior regresses. + // + // `max_length: Some(14)` caps read_string at the 14-byte target so the + // comparison succeeds on a buffer with no NUL terminator. `Relative(1)` + // on each child matches the `&+1` anchor offset (previous match end + 1, + // skipping the `;` separator). + // `regex/1l` == 1-line scan window, line_based = true, count = 1. + // Multi-line mode is always on so `^`/`$` match at line boundaries + // regardless; the `/l` flag controls only the scan window extent. 
+ let one_line_regex = libmagic_rs::parser::ast::RegexFlags { + line_based: true, + ..libmagic_rs::parser::ast::RegexFlags::default() + }; + let one = ::std::num::NonZeroU32::new(1); + + let inner_regex = MagicRule { + offset: OffsetSpec::Relative(1), + typ: TypeKind::Regex { + flags: one_line_regex, + count: one, + }, + op: Operator::Equal, + value: Value::String("[^;]+$".to_string()), + message: "\u{0008}, using AES256 encryption".to_string(), + children: vec![], + level: 2, + strength_modifier: None, + }; + + let version_regex = MagicRule { + offset: OffsetSpec::Relative(1), + typ: TypeKind::Regex { + flags: one_line_regex, + count: one, + }, + op: Operator::Equal, + value: Value::String("[0-9]+(\\.[0-9]+)+".to_string()), + message: "\u{0008}, version 1.1".to_string(), + children: vec![inner_regex], + level: 1, + strength_modifier: None, + }; + + let ansible_vault = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::String { + max_length: Some("$ANSIBLE_VAULT".len()), + }, + op: Operator::Equal, + value: Value::String("$ANSIBLE_VAULT".to_string()), + message: "Ansible Vault text".to_string(), + children: vec![version_regex], + level: 0, + strength_modifier: None, + }; + + let config = EvaluationConfig::default(); + let mut context = EvaluationContext::new(config); + let matches = + evaluate_rules(&[ansible_vault], &buffer, &mut context).expect("evaluation failed"); + + // All three rules must fire in order: the top-level string, the version + // regex, and the encryption regex. + assert_eq!( + matches.len(), + 3, + "expected 3 matches (string + 2 regex), got {}: {matches:#?}", + matches.len() + ); + + // Verify the regex engine captured the expected tokens from the buffer. + // These assertions fail if regex evaluation or the relative-offset + // anchor advances incorrectly. 
+ assert_eq!( + matches[0].value, + Value::String("$ANSIBLE_VAULT".to_string()), + "top-level string match should capture $ANSIBLE_VAULT" + ); + if let Value::String(s) = &matches[1].value { + assert!( + s.contains("1.1"), + "version regex should capture '1.1', got {s:?}" + ); + } else { + panic!( + "expected Value::String for version regex, got {:?}", + matches[1].value + ); + } + if let Value::String(s) = &matches[2].value { + assert!( + s.contains("AES256"), + "encryption regex should capture 'AES256', got {s:?}" + ); + } else { + panic!( + "expected Value::String for encryption regex, got {:?}", + matches[2].value + ); + } + + // Mirror `MagicDatabase::build_result` message concatenation: rules whose + // message starts with a backspace (`\b`) suppress the leading space. + let mut description = String::new(); + for m in &matches { + if let Some(rest) = m.message.strip_prefix('\u{0008}') { + description.push_str(rest); + } else if description.is_empty() { + description.push_str(&m.message); + } else { + description.push(' '); + description.push_str(&m.message); + } + } + + assert!( + description.contains("Ansible Vault"), + "expected 'Ansible Vault' in description, got: {description:?}" + ); + assert!( + description.contains("version"), + "expected 'version' in description, got: {description:?}" + ); + assert!( + description.contains("AES256"), + "expected 'AES256' in description, got: {description:?}" + ); +} diff --git a/tests/property_tests.rs b/tests/property_tests.rs index f36c70c4..ad7027d3 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -65,6 +65,25 @@ fn arb_type_kind() -> impl Strategy { length_width: width, length_includes_itself: includes_self, }), + ( + any::(), + any::(), + any::(), + prop::option::of(1u32..=4096u32), + ) + .prop_map(|(case_insensitive, start_offset, line_based, count)| { + TypeKind::Regex { + flags: libmagic_rs::parser::ast::RegexFlags { + case_insensitive, + start_offset, + line_based, + }, + count: 
count.and_then(::std::num::NonZeroU32::new), + } + }), + (1usize..=4096usize).prop_map(|range| TypeKind::Search { + range: ::std::num::NonZeroUsize::new(range).unwrap(), + }), ] } diff --git a/tests/regex_search_corpus_tests.rs b/tests/regex_search_corpus_tests.rs new file mode 100644 index 00000000..56f66776 --- /dev/null +++ b/tests/regex_search_corpus_tests.rs @@ -0,0 +1,370 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Corpus integration tests for issue #39 — regex and search types +//! +//! This file exercises the regex and search TypeKind variants end-to-end +//! against the test corpus files listed as "blocked" in issue #39: +//! +//! * `searchbug` — exercises `search/N` against a two-match binary buffer +//! * `json1`, `jsonlines1` — JSON text detection via regex +//! * `cmd1` — shell script detection via regex +//! * `gedcom` — GEDCOM genealogy file detection via regex +//! +//! Where a corpus file depends on magic-file features we do not yet +//! support (`use`/`name` directives, `offset` type, the `&+N`/`&-N` +//! parser for relative offsets), the test bypasses `parse_text_magic_file` +//! and builds the equivalent rule tree programmatically via the AST. +//! This pattern is documented in GOTCHAS 3.9. + +use libmagic_rs::evaluator::evaluate_rules; +use libmagic_rs::parser::ast::RegexFlags; +use libmagic_rs::{ + EvaluationConfig, EvaluationContext, MagicRule, OffsetSpec, Operator, TypeKind, Value, +}; +use std::num::NonZeroUsize; + +const CORPUS_DIR: &str = "third_party/tests"; + +fn load_corpus_file(name: &str) -> Vec { + let path = format!("{CORPUS_DIR}/{name}"); + std::fs::read(&path).unwrap_or_else(|e| panic!("failed to read {path}: {e}")) +} + +/// Run a flat list of rules against a buffer with a permissive config +/// and return the vector of matches for assertion. 
+fn run_rules(rules: &[MagicRule], buffer: &[u8]) -> Vec { + let config = EvaluationConfig::default(); + let mut context = EvaluationContext::new(config); + evaluate_rules(rules, buffer, &mut context).expect("evaluation should not fail") +} + +fn regex_rule( + offset: OffsetSpec, + pattern: &str, + flags: RegexFlags, + count: Option, + message: &str, + children: Vec, + level: u32, +) -> MagicRule { + MagicRule { + offset, + typ: TypeKind::Regex { + flags, + count: count.and_then(std::num::NonZeroU32::new), + }, + op: Operator::Equal, + value: Value::String(pattern.to_string()), + message: message.to_string(), + children, + level, + strength_modifier: None, + } +} + +fn search_rule( + offset: OffsetSpec, + pattern: &str, + range: usize, + message: &str, + children: Vec, + level: u32, +) -> MagicRule { + MagicRule { + offset, + typ: TypeKind::Search { + range: NonZeroUsize::new(range).expect("range must be non-zero"), + }, + op: Operator::Equal, + value: Value::String(pattern.to_string()), + message: message.to_string(), + children, + level, + strength_modifier: None, + } +} + +// ===================================================================== +// searchbug — search type hierarchical scan +// ===================================================================== + +/// `searchbug.magic` uses `use`/`name`/`offset`/`&0` features we do not +/// yet parse. The programmatic equivalent here models the same behavior: +/// a `TEST` header at offset 0 triggers a `search/12 "ABC"` scan, and +/// a byte rule reads the character immediately after the `ABC` match +/// (exercising the `Relative(N)` anchor advance after a search). +#[test] +fn test_searchbug_corpus_search_with_relative_child() { + let buffer = load_corpus_file("searchbug.testfile"); + assert!(buffer.starts_with(b"TEST"), "corpus should begin with TEST"); + + // Byte child reading the character immediately after "ABC". 
In the + // corpus file the first ABC is `ABC1` at offset 8, so after "ABC" + // (match-end at 11) the byte at offset 11 is '1' (0x31). + let after_abc = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(u64::from(b'1')), + message: "followed by 1".to_string(), + children: vec![], + level: 2, + strength_modifier: None, + }; + + // search/12 "ABC" with Relative(0) child. + let search_abc = search_rule( + OffsetSpec::Relative(0), + "ABC", + 12, + "found ABC", + vec![after_abc], + 1, + ); + + // Parent: TEST header at offset 0. + let root = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::String { + max_length: Some(4), + }, + op: Operator::Equal, + value: Value::String("TEST".to_string()), + message: "Testfmt".to_string(), + children: vec![search_abc], + level: 0, + strength_modifier: None, + }; + + let matches = run_rules(&[root], &buffer); + + // Expected chain: TEST header -> found ABC -> followed by 1. + assert_eq!( + matches.len(), + 3, + "expected 3 matches (header + search + byte child), got {matches:#?}" + ); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!(messages, ["Testfmt", "found ABC", "followed by 1"]); +} + +#[test] +fn test_searchbug_search_anchor_advance_not_window_end() { + // Regression guard: the search anchor must advance to match-end + // (8 + 3 = 11), NOT to the window end (first search starts at + // offset 0, window 12 would land at offset 12). If it advanced to + // window-end, the Relative(0) child would read byte 12 which is + // 'x' (0x78), not '1' (0x31). 
+ let buffer = load_corpus_file("searchbug.testfile"); + + let wrong_byte = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(u64::from(b'x')), + message: "window-end bug -- must NOT match".to_string(), + children: vec![], + level: 2, + strength_modifier: None, + }; + + let search_abc = search_rule( + OffsetSpec::Relative(0), + "ABC", + 12, + "found ABC", + vec![wrong_byte], + 1, + ); + + let root = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::String { + max_length: Some(4), + }, + op: Operator::Equal, + value: Value::String("TEST".to_string()), + message: "Testfmt".to_string(), + children: vec![search_abc], + level: 0, + strength_modifier: None, + }; + + let matches = run_rules(&[root], &buffer); + // Should see Testfmt + found ABC but NOT the wrong_byte child. + assert_eq!( + matches.len(), + 2, + "wrong_byte should not match: {matches:#?}" + ); + assert_eq!(matches[1].message, "found ABC"); +} + +// ===================================================================== +// json1 / jsonlines1 — JSON text detection via regex +// ===================================================================== + +/// JSON detection: a buffer starting with `{` or `[` (after optional +/// whitespace) is a JSON document. This is the simplified detection +/// pattern used by libmagic's json.magic for the fast path. +#[test] +fn test_json1_corpus_detected_by_regex() { + let buffer = load_corpus_file("json1.testfile"); + + // `^\s*[\{\[]` — optional leading whitespace followed by an object + // or array opener. Multi-line mode is always on, so `^` matches the + // buffer start. 
+ let json_rule = regex_rule( + OffsetSpec::Absolute(0), + r"^\s*[\{\[]", + RegexFlags::default(), + None, + "JSON text data", + vec![], + 0, + ); + + let matches = run_rules(&[json_rule], &buffer); + assert_eq!(matches.len(), 1, "json1 should match: {matches:#?}"); + assert_eq!(matches[0].message, "JSON text data"); +} + +#[test] +fn test_jsonlines1_corpus_detected_by_regex() { + let buffer = load_corpus_file("jsonlines1.testfile"); + + // JSON Lines detection: each line is an independent JSON document + // so we can reuse the same opener check on the first line. + let jsonlines_rule = regex_rule( + OffsetSpec::Absolute(0), + r"^\s*[\{\[]", + RegexFlags::default(), + None, + "JSON Lines text", + vec![], + 0, + ); + + let matches = run_rules(&[jsonlines_rule], &buffer); + assert_eq!(matches.len(), 1, "jsonlines1 should match: {matches:#?}"); +} + +// ===================================================================== +// cmd1 — shell script detection via regex +// ===================================================================== + +/// Shell script detection: a buffer starting with `#!` is a script. We +/// use a regex anchored at offset 0 to verify the shebang and capture +/// the interpreter path for a stronger match. +#[test] +fn test_cmd1_corpus_detected_by_regex() { + let buffer = load_corpus_file("cmd1.testfile"); + + let shebang_rule = regex_rule( + OffsetSpec::Absolute(0), + r"^#![ \t]*/\S+", + RegexFlags::default(), + None, + "a shell script", + vec![], + 0, + ); + + let matches = run_rules(&[shebang_rule], &buffer); + assert!(!matches.is_empty(), "cmd1 should match: {matches:#?}"); + assert_eq!(matches[0].message, "a shell script"); +} + +// ===================================================================== +// gedcom — genealogy file detection via regex +// ===================================================================== + +/// GEDCOM files begin with `0 HEAD` on the first line followed by +/// `1 SOUR ` and `2 VERS `. 
A simple regex on the +/// head line (with the `/l` line limit) is enough to detect the format. +#[test] +fn test_gedcom_corpus_detected_by_line_based_regex() { + let buffer = load_corpus_file("gedcom.testfile"); + + // `regex/1l "^0 HEAD"` — scan only the first line for the header. + let head_line_flags = RegexFlags { + line_based: true, + ..RegexFlags::default() + }; + + let gedcom_rule = regex_rule( + OffsetSpec::Absolute(0), + r"^0 HEAD", + head_line_flags, + Some(1), + "GEDCOM genealogy data", + vec![], + 0, + ); + + let matches = run_rules(&[gedcom_rule], &buffer); + assert_eq!(matches.len(), 1, "gedcom should match: {matches:#?}"); + assert_eq!(matches[0].message, "GEDCOM genealogy data"); +} + +// ===================================================================== +// regex-eol — simplified version extraction smoke test +// ===================================================================== + +/// Smoke test that the simpler non-hierarchical part of the regex-eol +/// scenario still works after the flag semantic change. Full +/// hierarchical coverage lives in the `test_regex_eol_corpus` test in +/// `tests/evaluator_tests.rs`. +#[test] +fn test_regex_eol_version_extraction() { + let buffer = load_corpus_file("regex-eol.testfile"); + + // Match a version number anywhere in the first line. + let version_rule = regex_rule( + OffsetSpec::Absolute(0), + r"[0-9]+(\.[0-9]+)+", + RegexFlags { + line_based: true, + ..RegexFlags::default() + }, + Some(1), + "version found", + vec![], + 0, + ); + + let matches = run_rules(&[version_rule], &buffer); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].message, "version found"); + // The matched value should look like a version number. 
+ match &matches[0].value { + Value::String(s) => assert!( + s.chars().all(|c| c.is_ascii_digit() || c == '.'), + "matched text should be a version number, got {s:?}" + ), + other => panic!("expected Value::String, got {other:?}"), + } +} + +// ===================================================================== +// Meta: corpus files exist +// ===================================================================== + +#[test] +fn test_corpus_files_exist() { + for name in [ + "searchbug.testfile", + "json1.testfile", + "jsonlines1.testfile", + "cmd1.testfile", + "gedcom.testfile", + "regex-eol.testfile", + ] { + let path = format!("{CORPUS_DIR}/{name}"); + assert!( + std::path::Path::new(&path).exists(), + "corpus file missing: {path}" + ); + } +}