From 0c3b3982802bb9f5e913f69895fd8ace76031182 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Tue, 21 Apr 2026 23:27:07 -0400 Subject: [PATCH 01/16] chore(lib): remove duplicate author entry in Cargo.toml Signed-off-by: UncleSp1d3r --- Cargo.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fd9bae6..16ede5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,10 +2,7 @@ name = "libmagic-rs" version = "0.5.0" edition = "2024" -authors = [ - "UncleSp1d3r ", - "KryptoKat ", -] +authors = ["UncleSp1d3r "] description = "A pure-Rust implementation of libmagic for file type identification" license = "Apache-2.0" repository = "https://github.com/EvilBit-Labs/libmagic-rs" From 2e5a80157488ac2129ef32aaf96d5f330919aa84 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Tue, 21 Apr 2026 23:27:29 -0400 Subject: [PATCH 02/16] chore(lib): add provenance information for actionlint tools Signed-off-by: UncleSp1d3r --- mise.lock | 100 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 34 deletions(-) diff --git a/mise.lock b/mise.lock index 677a0a1..cb5819e 100644 --- a/mise.lock +++ b/mise.lock @@ -7,26 +7,32 @@ backend = "aqua:rhysd/actionlint" [tools.actionlint."platforms.linux-arm64"] checksum = "sha256:325e971b6ba9bfa504672e29be93c24981eeb1c07576d730e9f7c8805afff0c6" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_linux_arm64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.linux-arm64-musl"] checksum = "sha256:325e971b6ba9bfa504672e29be93c24981eeb1c07576d730e9f7c8805afff0c6" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_linux_arm64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.linux-x64"] checksum = "sha256:8aca8db96f1b94770f1b0d72b6dddcb1ebb8123cb3712530b08cc387b349a3d8" url = 
"https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_linux_amd64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.linux-x64-baseline"] checksum = "sha256:8aca8db96f1b94770f1b0d72b6dddcb1ebb8123cb3712530b08cc387b349a3d8" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_linux_amd64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.linux-x64-musl"] checksum = "sha256:8aca8db96f1b94770f1b0d72b6dddcb1ebb8123cb3712530b08cc387b349a3d8" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_linux_amd64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.linux-x64-musl-baseline"] checksum = "sha256:8aca8db96f1b94770f1b0d72b6dddcb1ebb8123cb3712530b08cc387b349a3d8" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_linux_amd64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.macos-arm64"] checksum = "sha256:aba9ced2dee8d27fecca3dc7feb1a7f9a52caefa1eb46f3271ea66b6e0e6953f" @@ -36,18 +42,22 @@ provenance = "github-attestations" [tools.actionlint."platforms.macos-x64"] checksum = "sha256:5b44c3bc2255115c9b69e30efc0fecdf498fdb63c5d58e17084fd5f16324c644" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_darwin_amd64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.macos-x64-baseline"] checksum = "sha256:5b44c3bc2255115c9b69e30efc0fecdf498fdb63c5d58e17084fd5f16324c644" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_darwin_amd64.tar.gz" +provenance = "github-attestations" [tools.actionlint."platforms.windows-x64"] checksum = "sha256:6e7241b51e6817ea6a047693d8e6fed13b31819c9a0dd6c5a726e1592d22f6e9" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_windows_amd64.zip" +provenance = "github-attestations" 
[tools.actionlint."platforms.windows-x64-baseline"] checksum = "sha256:6e7241b51e6817ea6a047693d8e6fed13b31819c9a0dd6c5a726e1592d22f6e9" url = "https://github.com/rhysd/actionlint/releases/download/v1.7.12/actionlint_1.7.12_windows_amd64.zip" +provenance = "github-attestations" [[tools.bun]] version = "1.3.11" @@ -373,62 +383,62 @@ version = "3.14.4" backend = "core:python" [tools.python."platforms.linux-arm64"] -checksum = "sha256:c84d61ae07e3b255f8bac6a28147bd373c3c1862d1d5598a9543a5b103fcb595" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:b8b597fdb2f8dccdc502c11947b60a4b65eb6bce79cfa60c7ccf9b6e8352c60a" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.linux-arm64-musl"] -checksum = "sha256:c84d61ae07e3b255f8bac6a28147bd373c3c1862d1d5598a9543a5b103fcb595" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:b8b597fdb2f8dccdc502c11947b60a4b65eb6bce79cfa60c7ccf9b6e8352c60a" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-aarch64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.linux-x64"] -checksum = "sha256:c838ac128a6e9c944b30301880472349eca8cd1abc79fb0d359ac2a358569389" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:fe9a9c32d13870af632cbac3dfc7528ae53597e94472aa4c7d6a42e8166136cd" +url = 
"https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.linux-x64-baseline"] -checksum = "sha256:c838ac128a6e9c944b30301880472349eca8cd1abc79fb0d359ac2a358569389" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:fe9a9c32d13870af632cbac3dfc7528ae53597e94472aa4c7d6a42e8166136cd" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.linux-x64-musl"] -checksum = "sha256:c838ac128a6e9c944b30301880472349eca8cd1abc79fb0d359ac2a358569389" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:fe9a9c32d13870af632cbac3dfc7528ae53597e94472aa4c7d6a42e8166136cd" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.linux-x64-musl-baseline"] -checksum = "sha256:c838ac128a6e9c944b30301880472349eca8cd1abc79fb0d359ac2a358569389" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" +checksum = "sha256:fe9a9c32d13870af632cbac3dfc7528ae53597e94472aa4c7d6a42e8166136cd" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.macos-arm64"] 
-checksum = "sha256:f1ce5d79bceecbed25a37b611d6dae147b27857dda8181e3a5e29e73dd1c57c3" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-aarch64-apple-darwin-install_only_stripped.tar.gz" +checksum = "sha256:6f304f4ec30854611f23316578302235fb517cd970519ecdd11a8c4db87fd843" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-aarch64-apple-darwin-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.macos-x64"] -checksum = "sha256:1c9615f872058332b74b2be1b248078c636b6f4b116eed3e17f50eb7efe1a989" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-apple-darwin-install_only_stripped.tar.gz" +checksum = "sha256:d51250a32fa5d9f0799c7bcb71720c27b10a3afd4a7de288120f96085d508a5a" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-apple-darwin-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.macos-x64-baseline"] -checksum = "sha256:1c9615f872058332b74b2be1b248078c636b6f4b116eed3e17f50eb7efe1a989" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-apple-darwin-install_only_stripped.tar.gz" +checksum = "sha256:d51250a32fa5d9f0799c7bcb71720c27b10a3afd4a7de288120f96085d508a5a" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-apple-darwin-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.windows-x64"] -checksum = "sha256:472f4f0d91429661db1b28a72e36adc593b6bd5d19db222d7faca635b3005313" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" +checksum = 
"sha256:a976991dcd085c1bb5d9a8084823a6bc8b7f9b079d8c432574a6ddd68c3a6fe1" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" provenance = "github-attestations" [tools.python."platforms.windows-x64-baseline"] -checksum = "sha256:472f4f0d91429661db1b28a72e36adc593b6bd5d19db222d7faca635b3005313" -url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260408/cpython-3.14.4+20260408-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" +checksum = "sha256:a976991dcd085c1bb5d9a8084823a6bc8b7f9b079d8c432574a6ddd68c3a6fe1" +url = "https://github.com/astral-sh/python-build-standalone/releases/download/20260414/cpython-3.14.4+20260414-x86_64-pc-windows-msvc-install_only_stripped.tar.gz" provenance = "github-attestations" [[tools.rust]] -version = "1.94.1" +version = "1.95.0" backend = "core:rust" [[tools.scorecard]] @@ -438,57 +448,79 @@ backend = "aqua:ossf/scorecard" [tools.scorecard."platforms.linux-arm64"] checksum = "sha256:3f8b6354c62ec0287a8e9694481d834e16bff8451cf5b5dca435e8400ce5adaf" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_linux_arm64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.linux-arm64".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.linux-arm64-musl"] checksum = "sha256:3f8b6354c62ec0287a8e9694481d834e16bff8451cf5b5dca435e8400ce5adaf" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_linux_arm64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.linux-arm64-musl".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.linux-x64"] checksum = "sha256:e5183aeaa5aa548fbb7318a6deb3e1038be0ef9aca24e655422ae88dfbe67502" url = 
"https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_linux_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.linux-x64".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.linux-x64-baseline"] checksum = "sha256:e5183aeaa5aa548fbb7318a6deb3e1038be0ef9aca24e655422ae88dfbe67502" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_linux_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.linux-x64-baseline".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.linux-x64-musl"] checksum = "sha256:e5183aeaa5aa548fbb7318a6deb3e1038be0ef9aca24e655422ae88dfbe67502" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_linux_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.linux-x64-musl".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.linux-x64-musl-baseline"] checksum = "sha256:e5183aeaa5aa548fbb7318a6deb3e1038be0ef9aca24e655422ae88dfbe67502" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_linux_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.linux-x64-musl-baseline".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.macos-arm64"] checksum = "sha256:2c672695a27d35537dd4054f690f31fa1d6a72b0957598f45181296487f537f4" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_darwin_arm64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.macos-arm64".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.macos-x64"] checksum = 
"sha256:2abfec13b8eecc9b730e3782c9b3a9544d31ae861ce21ea7fe6a369d887d7c89" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_darwin_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.macos-x64".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.macos-x64-baseline"] checksum = "sha256:2abfec13b8eecc9b730e3782c9b3a9544d31ae861ce21ea7fe6a369d887d7c89" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_darwin_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.macos-x64-baseline".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.windows-x64"] checksum = "sha256:f7d0ece0dde703e4baa5f96e9b6ed33e6e786138c90db8de2c4943f24015b9ff" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_windows_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.windows-x64".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [tools.scorecard."platforms.windows-x64-baseline"] checksum = "sha256:f7d0ece0dde703e4baa5f96e9b6ed33e6e786138c90db8de2c4943f24015b9ff" url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/scorecard_5.4.0_windows_amd64.tar.gz" -provenance = "slsa" + +[tools.scorecard."platforms.windows-x64-baseline".provenance.slsa] +url = "https://github.com/ossf/scorecard/releases/download/v5.4.0/multiple.intoto.jsonl" [[tools.shellcheck]] version = "0.11.0" From d155a3b10ddee6de4c9c06c94dba59891a70f153 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Tue, 21 Apr 2026 23:49:21 -0400 Subject: [PATCH 03/16] refactor(parser): reorganize grammar and types modules into submodules Signed-off-by: UncleSp1d3r --- AGENTS.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md 
index 9c49476..b9456ee 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -82,9 +82,12 @@ pub enum Operator { parser/ ├── mod.rs // Public parser interface ├── ast.rs // AST node definitions -├── grammar/ // Magic file DSL parsing (nom) -│ ├── mod.rs // Grammar parsing logic -│ └── tests.rs // Grammar parser tests +├── grammar/ // Magic file DSL parsing (nom) -- split into focused submodules +│ ├── mod.rs // Top-level parse_magic_rule_line, dispatch +│ ├── numbers.rs // parse_number, parse_unsigned_number +│ ├── value.rs // parse_value (quoted strings, numeric literals) +│ ├── type_suffix.rs // pstring /B/H/L, regex /c/s, search /N suffixes +│ └── tests/ // Grammar test modules ├── types.rs // Type keyword parsing and TypeKind conversion └── codegen.rs // Serialization for code generation (shared with build.rs) @@ -95,7 +98,13 @@ evaluator/ ├── engine/ // Core evaluation engine submodule │ ├── mod.rs // evaluate_single_rule, evaluate_rules, evaluate_rules_with_config │ └── tests.rs // Engine unit tests -├── types.rs // Type interpretation with endianness +├── types/ // Type interpretation with endianness (directory module, issue #63) +│ ├── mod.rs // read_typed_value, read_pattern_match, bytes_consumed_with_pattern +│ ├── numeric.rs // byte/short/long/quad readers +│ ├── string.rs // string/pstring readers +│ ├── float.rs // float/double readers +│ ├── date.rs // date/qdate readers and timestamp formatting +│ └── regex.rs // regex/search readers, REGEX_MAX_BYTES cap, thread-local cache ├── strength.rs // Strength modifier application ├── offset/ // Offset resolution submodule │ ├── mod.rs // Dispatcher (resolve_offset) and re-exports From 6f8ae2cc2906f1a6045f9764f65bc511b64e47b1 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 00:05:39 -0400 Subject: [PATCH 04/16] docs(agents): update magic file compatibility and limitations for v0.5.x Signed-off-by: UncleSp1d3r --- AGENTS.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git 
a/AGENTS.md b/AGENTS.md index b9456ee..d997d00 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -211,7 +211,7 @@ cargo test --doc # Test documentation examples ## Magic File Compatibility -### Currently Implemented (v0.5.0) +### Currently Implemented (v0.5.x, unreleased) - **Offsets**: Absolute, from-end, indirect, and relative specifications (relative offsets `&+N`/`&-N` are evaluated using GNU `file` semantics -- the previous-match anchor) - **Types**: `byte`, `short`, `long`, `quad`, `float`, `double`, `string`, `pstring` with endianness support; unsigned variants `ubyte`, `ushort`/`ubeshort`/`uleshort`, `ulong`/`ubelong`/`ulelong`, `uquad`/`ubequad`/`ulequad`; float/double endian variants `befloat`/`lefloat`, `bedouble`/`ledouble`; 32-bit date/timestamp types `date`/`ldate`/`bedate`/`beldate`/`ledate`/`leldate`; 64-bit date/timestamp types `qdate`/`qldate`/`beqdate`/`beqldate`/`leqdate`/`leqldate`; `pstring` is a Pascal string (length-prefixed) with support for 1/2/4-byte length prefixes via `/B`, `/H` (2-byte BE), `/h` (2-byte LE), `/L` (4-byte BE), `/l` (4-byte LE) suffixes, and the `/J` flag (stored length includes prefix width, JPEG convention) which is combinable with width suffixes (e.g., `pstring/HJ`); date values formatted as "Www Mmm DD HH:MM:SS YYYY" matching GNU `file` output; types are signed by default (libmagic-compatible) @@ -220,20 +220,22 @@ cargo test --doc # Test documentation examples - **String Matching**: Exact string matching with null-termination and Pascal string (length-prefixed) support - **Regex type**: Binary-safe regex matching via `regex::bytes::Regex`. Full flag support: `/c` (case-insensitive), `/s` (anchor advances to match-start instead of match-end), `/l` (scan window is measured in lines instead of bytes). Flags combine in any order (`regex/cs`, `regex/csl`, `regex/lc`). Numeric counts are honored: `regex/100` scans at most 100 bytes; `regex/1l` scans at most 1 line. 
Multi-line regex matching is always on (matching libmagic's unconditional `REG_NEWLINE`), so `^` and `$` match at line boundaries regardless of `/l`. Every scan window is capped at 8192 bytes (`FILE_REGEX_MAX`) regardless of the user's count. - **Search type**: Bounded literal pattern scan via `memchr::memmem::find`; `search/N` caps the scan window to `N` bytes from the offset. The range is **mandatory** and stored as `NonZeroUsize`, so bare `search` and `search/0` are parse errors (matching GNU `file` magic(5)). Anchor advance follows GNU `file` semantics (match-end, not window-end) so relative-offset children resolve to the byte immediately after the matched pattern. +- **Meta-type directives**: `default`, `clear`, `name <identifier>`, `use <identifier>`, `indirect` are parsed into `TypeKind::Meta(MetaType::...)` and preserved through codegen. The evaluator currently treats all five as silent no-ops (returns `Ok(None)` in `evaluate_single_rule_with_anchor`); control-flow semantics will be wired up in a subsequent phase. -See **Development Phases** below for the planned roadmap of features not yet implemented (Aho-Corasick multi-pattern optimization, compiled-regex caching, `!:mime`/`!:ext`/`!:apple` directive evaluation, and `use`/`name` named test directives). +See **Development Phases** below for the planned roadmap of features not yet implemented (Aho-Corasick multi-pattern optimization, compiled-regex caching, `!:mime`/`!:ext`/`!:apple` directive evaluation, and evaluator wiring for the parsed meta-type directives `default`/`clear`/`name`/`use`/`indirect`). 
-## Current Limitations (v0.5.0) +## Current Limitations (v0.5.x, unreleased) ### Type System - 64-bit integer types: `quad`/`uquad`, `bequad`/`ubequad`, `lequad`/`ulequad` are implemented; `qquad` (128-bit) is not yet supported -- String evaluation reads until first NUL or end-of-buffer by default; `pstring` reads a length-prefixed Pascal string; `max_length: Some(_)` is supported internally but no dedicated fixed-length string parser syntax exists yet +- `string` evaluation reads until first NUL or end-of-buffer; `max_length: Some(_)` is supported programmatically (via the AST) but libmagic itself has no corresponding surface syntax, so this is not a parity gap +- `string` type modifier flags are not supported: `/B` (compact whitespace), `/b` (compact blanks), `/c`/`/C` (case-insensitive), `/t`/`/T` (force text/binary), `/w`/`/W` (whitespace optional). Only `pstring` has suffix parsing today. - `pstring` supports 1-byte (`/B`), 2-byte big-endian (`/H`), 2-byte little-endian (`/h`), 4-byte big-endian (`/L`), and 4-byte little-endian (`/l`) length prefixes, plus the `/J` flag (stored length includes prefix width). All flags are combinable (e.g., `pstring/HJ`) and fully implemented. 
### Operators -- BitwiseAnd supports mask values but not all libmagic mask syntax +- Parser handles numeric `&<mask>` literals, including the `&0x<hex>` form, across the full `u64` range; compound forms like arithmetic expressions in mask position (`&(N+M)`) or post-mask modifiers are not parsed ### Offset Specifications @@ -244,7 +246,7 @@ See **Development Phases** below for the planned roadmap of features not yet imp - Limited support for special directives (only `!:strength` is parsed) - No support for `!:mime`, `!:ext`, `!:apple` directives in evaluation -- No support for named tests or use/name directives +- Meta-type directives (`default`, `clear`, `name`, `use`, `indirect`) are parsed into the AST but evaluated as silent no-ops; full control-flow semantics are deferred See issue #52 for the planned enhancement roadmap. From 33b6fc8c70068c3d61875a5ffef6a225f1f5d1dd Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 00:25:03 -0400 Subject: [PATCH 05/16] docs(gotchas): document requirements for adding MetaType variants Signed-off-by: UncleSp1d3r --- GOTCHAS.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/GOTCHAS.md b/GOTCHAS.md index 9f7cbe1..fa1d451 100644 --- a/GOTCHAS.md +++ b/GOTCHAS.md @@ -73,6 +73,10 @@ The cap is a DoS mitigation: without it, a malicious regex against a multi-GB bu Both `RegexCount::Default` (plain `regex`) and `RegexCount::Lines(None)` (the `regex/l` shorthand with no explicit count) produce the full 8192-byte capped window. `compute_window` handles them with a shared match arm, and `calculate_default_strength` gives them the same strength score (20, no "constrained scan" bonus). The two variants are kept distinct at the AST level because the magic-file surface syntax distinguishes them — `regex` and `regex/l` parse to different `RegexCount` variants and round-trip through codegen as different Rust expressions — but no runtime path treats them differently. 
If you write a test that passes `Lines(None)` expecting different behavior from `Default`, the test is wrong, not the implementation. See `test_read_regex_lines_none_is_equivalent_to_default_on_buffer_with_terminators` in `src/evaluator/types/regex.rs` for the regression guard that pins this equivalence. +### 2.11 `MetaType` Exhaustive Matches + +Adding a variant to the `MetaType` enum (nested inside `TypeKind::Meta`) requires updates in: `ast.rs` (variant definition + 3 test fixtures that iterate MetaType variants: `test_meta_type_variants_debug_clone_eq`, `test_meta_type_serde_roundtrip`, `test_type_kind_meta_bit_width_is_none`), `parser/types.rs` (`parse_type_keyword` tag + `type_keyword_to_kind` match arm + `test_roundtrip_all_keywords` array), `parser/codegen.rs` (`serialize_type_kind` -- the inner `TypeKind::Meta(meta)` arm), and `tests/property_tests.rs` (`arb_type_kind` `prop_oneof` branch). The evaluator does NOT need updates: `TypeKind::Meta(_)` is handled uniformly as a no-op via the wildcard arm in `engine::evaluate_single_rule_with_anchor`, `strength.rs::calculate_default_strength`, and `types/mod.rs::bytes_consumed_with_pattern`. + ## 3. Parser Architecture ### 3.1 Type Keyword Parsing Split @@ -98,9 +102,9 @@ The nom `tuple` combinator is deprecated. Use bare tuple syntax `(a, b, c)` dire `parse_number` handles `-` signs but not `+`. When parsing syntax like `+4` (e.g., indirect offset adjustments), consume the `+` character manually before calling `parse_number`. -### 3.6 `parse_value` Requires Quoted Strings +### 3.6 `parse_value` Requires Quoted Strings (But `parse_magic_rule` Has a Bare-Word Fallback) -`parse_value()` does not accept bare unquoted strings. String values in magic file rules must be quoted (e.g., `string "MZ"` not `string MZ`). Integration tests writing magic files must use `r#"0 string "MZ" description"#` format. 
+`parse_value()` itself does not accept bare unquoted strings -- `parse_value("xyz")` still returns `Err` (see `test_parse_value_invalid_input` in `grammar/tests/mod.rs`, which pins this behavior). However, `parse_magic_rule` adds a bare-word fallback (`parse_bare_string_value`) when the rule's type is string-family (`String`, `PString`, `Regex`, `Search`), so `0 string TEST` and `>0 search/12 ABC` parse successfully without quotes -- matching libmagic magic(5) surface syntax and allowing real-world fixtures like `third_party/tests/searchbug.magic` to load. For non-string-family types (byte/short/long/etc.), bare words still fail; integration tests exercising `parse_value` directly (as opposed to going through `parse_magic_rule`) must still quote string literals. ### 3.7 Indirect Offset Pointer Specifiers Follow GNU `file` Semantics From 6e4ba1a7979d5761ffb9b37332b97a22790387c4 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 19:29:43 -0400 Subject: [PATCH 06/16] feat: Implement libmagic meta-type directives and format substitution (#42) Complete issue #42 by landing the remaining Phase 2 work on top of the prior Phase 1 AST + parser + `name`/`use` dispatch. Achieves byte-for-byte parity with GNU `file`'s `searchbug.result` fixture. Meta-type directives - `TypeKind::Meta(MetaType::{Default, Clear, Name, Use, Indirect, Offset})` all fully evaluated in `src/evaluator/engine/mod.rs`. - `default` fires only when no sibling at the same level has matched; `clear` resets the per-level `sibling_matched` flag so later `default` rules can fire again. - `name` subroutines are hoisted into a `NameTable` at load time (`parser::name_table::extract_name_table`). `use` invokes them via `RuleEnvironment` threaded through `EvaluationContext::rule_env`. - `indirect` re-applies the root rule set at the resolved offset via `AnchorScope`, bounded by `EvaluationConfig::max_recursion_depth`. 
- `offset` reports the resolved file offset as `Value::Uint(pos)` for printf-style substitution. Printf-style format substitution - New pure function `src/output/format.rs::format_magic_message` supporting `%d`, `%i`, `%u`, `%x`, `%X`, `%o`, `%s`, `%c`, `%%`, plus width/padding/length modifiers. Hex specifiers mask to the natural `TypeKind::bit_width()` so a signed byte carrying `-1` renders as `ff` rather than sign-extended `ffffffffffffffff`. - Alt-form width follows C printf: zero-pad inserts between prefix and digits (`%#06x` + `0xab` -> `0x00ab`), space-pad goes before the prefix, left-align trails the digits. - Wired into `MagicDatabase::concatenate_messages` before the existing `\b` backspace-suppression check (GOTCHAS S14.1). Parser and evaluator semantics - Parser consumes an optional `x` (AnyValue) token between a Meta keyword and its message so `offset x at_offset %lld` no longer leaks the operator into output. - Subroutine `base_offset` on `EvaluationContext`: inside a `use` body, positive `OffsetSpec::Absolute(n)` resolves to `base_offset + n`, matching magic(5) semantics. Negative absolute, `FromEnd`, `Relative`, and `Indirect` are unaffected (new GOTCHAS S3.10). - Continuation-sibling anchor reset: at `recursion_depth > 0`, each sibling resolves `&N` against the parent-level entry anchor rather than the previous sibling's advance. Top-level siblings (depth 0) keep chaining per GOTCHAS S3.8 (`relative_anchor_can_decrease_...` still passes). Tests - `test_searchbug_matches_full_result_string` un-ignored and passing against the GNU `file` `searchbug.testfile` fixture. - 18 unit tests for `format_magic_message` including regression guards for alt-form zero-padding. - 7 new engine tests for `MetaType::Offset` dispatch (match emission, offset-at-zero, out-of-bounds skip, non-`AnyValue` operator reject, children, anchor semantics, `sibling_matched` propagation). 
- Integration test file `tests/meta_types_integration.rs` covers `default`/`clear`/`indirect` synthetic scenarios plus the searchbug round-trip. Documentation - AGENTS.md "Currently Implemented" expanded with the six meta-types and printf-style substitution; "Current Limitations" updated. - GOTCHAS.md S14.2 rewritten (specifiers ARE substituted); new S3.10 for subroutine base offset; S3.9 renumbered to S3.11 with S14.1 cross-reference fixed. - `docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md` documents the three-layer parse-time / `ParsedMagic` / optional `RuleEnvironment` pattern. Build tooling - `dist generate` regenerated `.github/workflows/release.yml` to resolve cargo-dist staleness vs recent `actions/upload-artifact` dependabot bumps. Test results: 1348/1348 pass, 5 pre-existing skipped. `just ci-check` green (clippy `-D warnings`, fmt, audit, deny). Signed-off-by: UncleSp1d3r --- .github/workflows/release.yml | 10 +- AGENTS.md | 18 +- CHANGELOG.md | 5 + GOTCHAS.md | 46 +- ROADMAP.md | 2 +- benches/evaluation_bench.rs | 44 +- build.rs | 4 +- docs/ARCHITECTURE.md | 12 +- docs/MAGIC_FORMAT.md | 69 +- ...a-type-subroutine-dispatch-architecture.md | 203 +++++ docs/src/architecture.md | 24 +- docs/src/ast-structures.md | 55 ++ docs/src/evaluator.md | 60 +- docs/src/magic-format.md | 67 +- docs/src/parser.md | 72 +- src/build_helpers.rs | 6 +- src/error.rs | 55 ++ src/evaluator/engine/mod.rs | 714 ++++++++++++++- src/evaluator/engine/tests.rs | 838 +++++++++++++++++- src/evaluator/mod.rs | 68 ++ src/evaluator/offset/mod.rs | 35 +- src/evaluator/strength.rs | 115 +++ src/evaluator/types/mod.rs | 58 +- src/lib.rs | 84 +- src/output/format.rs | 623 +++++++++++++ src/output/mod.rs | 1 + src/parser/ast.rs | 191 +++- src/parser/codegen.rs | 52 +- src/parser/grammar/mod.rs | 195 +++- src/parser/grammar/tests/mod.rs | 201 +++++ src/parser/loader.rs | 119 ++- src/parser/mod.rs | 75 +- src/parser/name_table.rs | 255 ++++++ src/parser/types.rs 
| 48 +- tests/compatibility_tests.rs | 43 + tests/directory_loading_tests.rs | 41 +- tests/meta_types_integration.rs | 218 +++++ tests/parser_integration_tests.rs | 55 +- tests/property_tests.proptest-regressions | 9 + tests/property_tests.rs | 60 +- tests/regex_search_corpus_tests.rs | 16 +- 41 files changed, 4661 insertions(+), 205 deletions(-) create mode 100644 docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md create mode 100644 src/output/format.rs create mode 100644 src/parser/name_table.rs create mode 100644 tests/meta_types_integration.rs create mode 100644 tests/property_tests.proptest-regressions diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1561ce7..cd56279 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -66,7 +66,7 @@ jobs: shell: bash run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.31.0/cargo-dist-installer.sh | sh" - name: Cache dist - uses: actions/upload-artifact@v7.0.1 + uses: actions/upload-artifact@v7.0.0 with: name: cargo-dist-cache path: ~/.cargo/bin/dist @@ -82,7 +82,7 @@ jobs: cat plan-dist-manifest.json echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v7.0.1 + uses: actions/upload-artifact@v7.0.0 with: name: artifacts-plan-dist-manifest path: plan-dist-manifest.json @@ -168,7 +168,7 @@ jobs: cp dist-manifest.json "$BUILD_MANIFEST_NAME" - name: "Upload artifacts" - uses: actions/upload-artifact@v7.0.1 + uses: actions/upload-artifact@v7.0.0 with: name: artifacts-build-local-${{ join(matrix.targets, '_') }} path: | @@ -233,7 +233,7 @@ jobs: find . 
-name '*.cdx.xml' | tee -a "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" - name: "Upload artifacts" - uses: actions/upload-artifact@v7.0.1 + uses: actions/upload-artifact@v7.0.0 with: name: artifacts-build-global path: | @@ -279,7 +279,7 @@ jobs: cat dist-manifest.json echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v7.0.1 + uses: actions/upload-artifact@v7.0.0 with: # Overwrite the previous copy name: artifacts-dist-manifest diff --git a/AGENTS.md b/AGENTS.md index d997d00..45386e9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -220,9 +220,10 @@ cargo test --doc # Test documentation examples - **String Matching**: Exact string matching with null-termination and Pascal string (length-prefixed) support - **Regex type**: Binary-safe regex matching via `regex::bytes::Regex`. Full flag support: `/c` (case-insensitive), `/s` (anchor advances to match-start instead of match-end), `/l` (scan window is measured in lines instead of bytes). Flags combine in any order (`regex/cs`, `regex/csl`, `regex/lc`). Numeric counts are honored: `regex/100` scans at most 100 bytes; `regex/1l` scans at most 1 line. Multi-line regex matching is always on (matching libmagic's unconditional `REG_NEWLINE`), so `^` and `$` match at line boundaries regardless of `/l`. Every scan window is capped at 8192 bytes (`FILE_REGEX_MAX`) regardless of the user's count. - **Search type**: Bounded literal pattern scan via `memchr::memmem::find`; `search/N` caps the scan window to `N` bytes from the offset. The range is **mandatory** and stored as `NonZeroUsize`, so bare `search` and `search/0` are parse errors (matching GNU `file` magic(5)). Anchor advance follows GNU `file` semantics (match-end, not window-end) so relative-offset children resolve to the byte immediately after the matched pattern. 
-- **Meta-type directives**: `default`, `clear`, `name `, `use `, `indirect` are parsed into `TypeKind::Meta(MetaType::...)` and preserved through codegen. The evaluator currently treats all five as silent no-ops (returns `Ok(None)` in `evaluate_single_rule_with_anchor`); control-flow semantics will be wired up in a subsequent phase. +- **Meta-type directives**: `default`, `clear`, `name `, `use `, `indirect`, and `offset` are fully implemented. `name` blocks are hoisted into a `NameTable` at load time (`parser::name_table::extract_name_table`). `use` invokes subroutines at the resolved offset via `RuleEnvironment` threaded through `EvaluationContext::rule_env`; subroutine-local absolute offsets resolve relative to the use-site base (tracked via `EvaluationContext::base_offset`). `default` fires only when no sibling at the same level has matched; `clear` resets the per-level sibling-matched flag so a later `default` can fire. `indirect` re-applies the root rule set at the resolved offset, bounded by `EvaluationConfig::max_recursion_depth`. `offset` reports the resolved file offset as `Value::Uint(pos)` for format-string rendering. Continuation siblings (`recursion_depth > 0`) see the parent-level anchor on each iteration rather than chaining -- matching libmagic's `ms->c.li[cont_level]` model. Top-level siblings still chain (documented in GOTCHAS S3.8). +- **Printf-style format substitution**: Rule messages support `%d`, `%i`, `%u`, `%x`, `%X`, `%o`, `%s`, `%c`, and `%%`, along with width/padding modifiers (`%05d`, `%-5d`) and length modifiers (`l`, `ll`, `h`, etc. -- parsed and ignored). Hex specifiers respect the rule's `TypeKind::bit_width()` to mask sign-extended signed reads (so a signed byte carrying `-1` renders as `ff`, not `ffffffffffffffff`). Implemented in `src/output/format.rs::format_magic_message` and wired into `MagicDatabase::build_result`. Unrecognized specifiers pass through literally with a `debug!` log. 
-See **Development Phases** below for the planned roadmap of features not yet implemented (Aho-Corasick multi-pattern optimization, compiled-regex caching, `!:mime`/`!:ext`/`!:apple` directive evaluation, and evaluator wiring for the parsed meta-type directives `default`/`clear`/`name`/`use`/`indirect`). +See **Development Phases** below for the planned roadmap of features not yet implemented (Aho-Corasick multi-pattern optimization and `!:mime`/`!:ext`/`!:apple` directive evaluation). ## Current Limitations (v0.5.x, unreleased) @@ -246,7 +247,7 @@ See **Development Phases** below for the planned roadmap of features not yet imp - Limited support for special directives (only `!:strength` is parsed) - No support for `!:mime`, `!:ext`, `!:apple` directives in evaluation -- Meta-type directives (`default`, `clear`, `name`, `use`, `indirect`) are parsed into the AST but evaluated as silent no-ops; full control-flow semantics are deferred +- Meta-type directives (`default`, `clear`, `name`, `use`, `indirect`, `offset`) are all fully implemented with evaluator dispatch, including printf-style format substitution in message rendering (see "Currently Implemented" above for details). See issue #52 for the planned enhancement roadmap. @@ -321,6 +322,17 @@ sample.bin: ELF 64-bit LSB executable, x86-64, version 1 (SYSV) 6. Add tests for the new type 7. Update documentation +### Adding a new meta-type + +Meta-types sit inside `TypeKind::Meta(MetaType)` and do not read bytes. Adding a new variant requires: + +1. Add the variant to `MetaType` in `src/parser/ast.rs`. Update the three test fixtures that iterate `MetaType` variants: `test_meta_type_variants_debug_clone_eq`, `test_meta_type_serde_roundtrip`, `test_type_kind_meta_bit_width_is_none` (see GOTCHAS S2.11). +2. Add the keyword tag in `parse_type_keyword` and the arm in `type_keyword_to_kind` in `src/parser/types.rs`, plus the `test_roundtrip_all_keywords` array. +3. 
Update `serialize_type_kind` (the inner `TypeKind::Meta(meta)` arm) in `src/parser/codegen.rs`. +4. Update `arb_type_kind` in `tests/property_tests.rs` (`prop_oneof` branch for `MetaType`). +5. Decide semantics: does the new variant need inline loop-level dispatch in `evaluate_rules` (like `Use`, `Default`, `Clear`, `Indirect` — each of which mutates the match vector or `sibling_matched` flag) or is it a silent no-op via the `Meta(_)` wildcard arm in `evaluate_single_rule_with_anchor`? Add the arm accordingly in `src/evaluator/engine/mod.rs`. +6. Add unit tests covering parse round-trip, the evaluator arm, and any new `RuleEnvironment` lookups. + ### Adding New Operators > **Note:** Currently implemented operators are `Equal`, `NotEqual`, `LessThan`, `GreaterThan`, `LessEqual`, `GreaterEqual`, `BitwiseAnd` (with `BitwiseAndMask`), `BitwiseXor`, `BitwiseNot`, and `AnyValue`. diff --git a/CHANGELOG.md b/CHANGELOG.md index 99afd0c..1f3116b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ All notable changes to this project will be documented in this file. ### Features +- **parser**: Implement meta-type directives: `name`/`use` subroutines, `default`/`clear` per-level fallback, and `indirect` re-evaluation. `parse_text_magic_file` now returns `ParsedMagic { rules, name_table }` (breaking change from `Vec`). Named subroutines are hoisted into `NameTable` at load time and dispatched via `RuleEnvironment` in the evaluator. Recursion is bounded by `EvaluationConfig::max_recursion_depth`. Resolves [#42](https://github.com/EvilBit-Labs/libmagic-rs/issues/42). - **evaluator**: Thread-local regex compile cache eliminates the double-compile paid by every successful regex match. `regex_bytes_consumed` now reuses the compiled `Regex` from `read_regex` instead of recompiling the pattern to derive the anchor advance. The cache is reset at the start of every `evaluate_rules_with_config` call, bounding memory to one evaluation. 
- **config**: `EvaluationConfig` is now `#[non_exhaustive]`; new builder-style setters (`with_max_recursion_depth`, `with_max_string_length`, `with_stop_at_first_match`, `with_mime_types`, `with_timeout_ms`) let external crates construct configurations without struct literals. - **parser**: `MagicRule::new()` smart constructor with `::with_children()`, `::with_strength_modifier()`, `::with_level()` builder methods and a `::validate()` method enforcing structural invariants (non-empty message, `level <= MAX_LEVEL`, children nested strictly deeper than parent). New `MagicRuleValidationError` error type. @@ -41,6 +42,10 @@ All notable changes to this project will be documented in this file. - `MagicRule::validate()` tests covering empty message, child level invariant, and max-depth rejection. - `RegexCache` population/clear/reuse tests. +### Breaking Changes + +- **parser**: `parse_text_magic_file` return type changed from `Result<Vec<MagicRule>, ParseError>` to `Result<ParsedMagic, ParseError>`. Callers must destructure `ParsedMagic { rules, name_table }`. Low-level callers that only need the rule list can use `parsed.rules`. `load_magic_file` and `load_magic_directory` return the same new type. + ## [0.5.0] - 2026-03-07 ### Features diff --git a/GOTCHAS.md b/GOTCHAS.md index fa1d451..c4ed275 100644 --- a/GOTCHAS.md +++ b/GOTCHAS.md @@ -28,6 +28,8 @@ Serialization functions live in `src/parser/codegen.rs`, shared by both `build.r Adding a variant to `TypeKind` requires updating exhaustive matches in 10+ files: `ast`, `grammar`, `types`, `codegen` (`serialize_type_kind` -- easy to forget; build.rs is a separate compilation unit so the error surfaces there first), `strength`, `property_tests`, `evaluator/types/mod.rs` (`read_typed_value`, `coerce_value_to_type`, **`bytes_consumed`** -- variable-width variants must be matched explicitly or relative-offset anchors will silently corrupt), `output/mod.rs` (2 length matches), `output/json.rs` (`format_value_as_hex`), and `grammar/tests.rs` (stale assertions).
Note: `coerce_value_to_type`, output matches, and `bytes_consumed` use catch-all `_ =>` so they compile without changes but may need semantic updates -- `bytes_consumed` will fire a `debug_assert` in test/dev builds for unhandled variable-width variants. +**Meta variants (`TypeKind::Meta(MetaType::...)`)** are structurally different. They do not read bytes, so `read_typed_value`, `coerce_value_to_type`, and `bytes_consumed` treat them as no-ops (catch-all or zero-width arms). The live dispatch for control-flow directives happens in `evaluator/engine/mod.rs::evaluate_rules` -- the *loop body*, not in `evaluate_single_rule_with_anchor`. `MetaType::Use`, `MetaType::Default`, `MetaType::Clear`, and `MetaType::Indirect` are all handled inline there so subroutine / control-flow effects can be spliced into the caller's match vector; `MetaType::Name` is hoisted out of the rule list at parse time by `parser::name_table::extract_name_table` and should normally never reach the evaluator at all. The evaluator's leaked-`Name` arm uses `debug!` (not `debug_assert!`) because `prop_arbitrary_rule_evaluation_never_panics` synthesizes arbitrary `TypeKind` values and a panic there would break the never-panics invariant. **This applies to every inline-dispatched meta variant that has a "not configured" fallback** (currently `Use` and `Indirect` for the no-`RuleEnvironment` case): property tests run with `rule_env = None`, so any `debug_assert!` on that path will fire. When adding a new `MetaType` variant: decide whether it needs inline loop-level dispatch (add to `evaluate_rules`) or is a silent no-op (add to the `_` arm of the `Meta(_)` match in `evaluate_single_rule_with_anchor`). See `docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md` for the full pattern. + ### 2.2 `Operator` Exhaustive Matches Adding a variant to `Operator` requires updating: `ast`, `grammar`, `codegen`, `strength`, `property_tests`, `evaluator/operators/`. 
@@ -75,7 +77,14 @@ Both `RegexCount::Default` (plain `regex`) and `RegexCount::Lines(None)` (the `r ### 2.11 `MetaType` Exhaustive Matches -Adding a variant to the `MetaType` enum (nested inside `TypeKind::Meta`) requires updates in: `ast.rs` (variant definition + 3 test fixtures that iterate MetaType variants: `test_meta_type_variants_debug_clone_eq`, `test_meta_type_serde_roundtrip`, `test_type_kind_meta_bit_width_is_none`), `parser/types.rs` (`parse_type_keyword` tag + `type_keyword_to_kind` match arm + `test_roundtrip_all_keywords` array), `parser/codegen.rs` (`serialize_type_kind` -- the inner `TypeKind::Meta(meta)` arm), and `tests/property_tests.rs` (`arb_type_kind` `prop_oneof` branch). The evaluator does NOT need updates: `TypeKind::Meta(_)` is handled uniformly as a no-op via the wildcard arm in `engine::evaluate_single_rule_with_anchor`, `strength.rs::calculate_default_strength`, and `types/mod.rs::bytes_consumed_with_pattern`. +Adding a variant to the `MetaType` enum (nested inside `TypeKind::Meta`) requires updates in: `ast.rs` (variant definition + 3 test fixtures that iterate MetaType variants: `test_meta_type_variants_debug_clone_eq`, `test_meta_type_serde_roundtrip`, `test_type_kind_meta_bit_width_is_none`), `parser/types.rs` (`parse_type_keyword` tag + `type_keyword_to_kind` match arm + `test_roundtrip_all_keywords` array), `parser/codegen.rs` (`serialize_type_kind` -- the inner `TypeKind::Meta(meta)` arm), and `tests/property_tests.rs` (`arb_type_kind` `prop_oneof` branch). + +**Evaluator decision rule.** The evaluator has two possible locations for a `MetaType` arm and the choice is load-bearing: + +- **Inline loop-level dispatch in `evaluate_rules` (in `src/evaluator/engine/mod.rs`)** — use this when the variant has side-effects that the single-rule path cannot express. 
`MetaType::Default` (needs the per-level `sibling_matched` flag), `MetaType::Clear` (mutates `sibling_matched`), `MetaType::Use` (splices subroutine matches into the caller's match vector from `RuleEnvironment::name_table`), and `MetaType::Indirect` (recurses into `RuleEnvironment::root_rules` with a reset anchor and a sliced buffer) all live here. These variants are dispatched **before** the loop body calls `evaluate_single_rule_with_anchor`. +- **Silent no-op via the `Meta(_)` wildcard arm in `evaluate_single_rule_with_anchor`** — use this for variants that produce no match, do not mutate `sibling_matched`, and do not recurse. `MetaType::Name` falls here (it is also defensively scrubbed at load time by `parser::name_table::extract_name_table`, so the evaluator arm is a `debug!`-and-return safety net rather than a normal code path). + +Similarly, `strength.rs::calculate_default_strength` and `types/mod.rs::bytes_consumed_with_pattern` treat all `Meta(_)` variants uniformly (zero strength contribution, zero bytes consumed) via catch-alls, so they do not need per-variant arms. ## 3. Parser Architecture @@ -116,10 +125,27 @@ Lowercase pointer specifiers (`.s`, `.l`, `.q`) map to **little-endian**, not na The load-bearing invariant is that the anchor is updated *before recursing into children* (so children and their followers see the new anchor). The current code also happens to set the anchor before `matches.push(...)`, but the push-ordering relative to `set_last_match_end` is incidental for anchor correctness -- only the ordering before the `evaluate_rules` recursion call matters. (Future code that reads the anchor while iterating `matches` would make this ordering load-bearing, so do not "optimize" the order without checking call sites first.) 
`bytes_consumed()` (in `evaluator/types/mod.rs`) is the source of truth for advance distance; for variable-width types it re-derives consumption from the buffer rather than trusting `Value::String.len()` (which can drift from the original byte length via `from_utf8_lossy`). Pascal-string consumption is also clamped against the remaining buffer to prevent attacker-controlled length prefixes from poisoning the anchor to `usize::MAX`. -### 3.9 `parse_text_magic_file` is Fail-Fast, Not Skip-on-Error +**Continuation-sibling exception (issue #42):** Inside a child recursion (`recursion_depth > 0`), the anchor IS reset to the sibling list's entry value between iterations. Continuation rules at the same indentation level (`>>&0 ubyte ...; >>&0 offset ...`) all resolve `&N` against the parent-level anchor rather than chaining off each other. This matches libmagic's `ms->c.li[cont_level]` continuation-level model and is required for the GNU `file` `searchbug.magic` fixture to produce `at_offset 11` (not `at_offset 12`). Top-level siblings (`recursion_depth == 0`) keep the chaining behavior described above. The `reset_anchor_between_siblings` flag in `evaluate_rules` gates this. Tests that assert top-level sibling chaining (e.g. `relative_anchor_can_decrease_when_later_sibling_matches_at_lower_position`) must stay at depth 0; tests that assert continuation-sibling parent-anchor resolution (e.g. `test_offset_does_not_advance_anchor_for_continuation_siblings`) must go through a parent rule's children. The two semantics coexist deliberately; do not "unify" them without reading the entire GNU `file` `searchbug` test chain. + +### 3.10 Subroutine Base Offset for `use` Bodies + +Inside a `MetaType::Use` subroutine body, `OffsetSpec::Absolute(n)` with `n >= 0` resolves to `base_offset + n`, where `base_offset` is the use-site offset. `EvaluationContext::base_offset` tracks this; `evaluate_use_rule` saves and restores it around the subroutine call, alongside `last_match_end`. 
This matches magic(5) / libmagic semantics: a subroutine written as `>0 search/12 ABC` invoked via `>>64 use part2` scans starting at file position 64, not 0. Negative absolute offsets (`FromEnd`-style), `Indirect` pointer reads, and `Relative(&N)` offsets are unaffected -- they already have well-defined reference frames. Callers of `offset::resolve_offset_with_base` pass `context.base_offset()`; the `base_offset` field is gated behind `pub(crate)` accessors so external consumers cannot inject arbitrary offset bias. + +### 3.11 `parse_text_magic_file` is Fail-Fast, Not Skip-on-Error `build_rule_hierarchy` propagates any `parse_magic_rule_line` error immediately, so a single unparseable rule (e.g., a child using unsupported `&+N` relative-offset syntax or an unquoted `$VAR` string value -- see S3.6) causes the **entire file load** to fail with `ParseError::InvalidSyntax`. There is no skip-and-continue mode. When writing corpus tests against third_party `.magic` files that mix supported and unsupported syntax, bypass the parser and build the equivalent `MagicRule` tree programmatically via the AST; the runtime evaluator can still be exercised end-to-end against the real testfile buffer. See `tests/evaluator_tests.rs::test_regex_eol_corpus` for a worked example. +### 3.12 `parse_text_magic_file` Returns `ParsedMagic`, Not `Vec<MagicRule>` + +`parse_text_magic_file`, `load_magic_file`, and `load_magic_directory` all return `Result<ParsedMagic { rules: Vec<MagicRule>, name_table: NameTable }, ParseError>`. Top-level `Meta(Name(id))` rules are hoisted *out* of the flat rule list at parse time by `parser::name_table::extract_name_table` and placed into the `name_table` field keyed by identifier. Duplicate names keep the first definition and emit a `warn!`; nested `Name` rules (not well-defined in magic(5)) are scrubbed with a warning during extraction. + +- **Callers must destructure at the boundary.** Codegen consumers (`build.rs`, `src/build_helpers.rs`) use `parsed.rules` and discard the table.
Runtime consumers (`MagicDatabase::load_from_file_with_config`) pattern-match `let ParsedMagic { rules, name_table } = ...;` and wrap both in `Arc`s before handing them to the database. +- **Directory loads merge name tables with first-wins semantics.** `load_magic_directory` merges per-file name tables alphabetically (same ordering as the rule merge); duplicate-name warnings during merge are distinct from per-file duplicate-name warnings during extraction. +- **Evaluator consumption is via `RuleEnvironment`** (`pub(crate)` in `evaluator/mod.rs`), threaded as an optional `Arc` on `EvaluationContext`. Low-level callers (`evaluate_rules`, `evaluate_rules_with_config`) leave this as `None` and `Use` rules no-op silently -- this preserves the low-level API for property tests and fuzz harnesses that construct rule trees by hand. `MagicDatabase::evaluate_buffer_internal` attaches the environment before dispatching. +- **Each subroutine in the name table must be strength-sorted** the same way top-level rules are (via `sort_rules_by_strength_recursive`), otherwise `use`-site evaluation is non-deterministic with respect to source order inside the `name` block. `MagicDatabase::load_from_file_with_config` iterates `name_table.values_mut()` to do this after the top-level sort. + +See `docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md` for the full three-layer pattern (parse-time hoist, `ParsedMagic` return type, optional `RuleEnvironment`). + ## 4. Module Visibility & Re-exports ### 4.1 Private Engine Module @@ -272,12 +298,22 @@ All tags and commits MUST be signed -- use `git tag -s` and `git commit -s -S`. - **Validation:** `timeout_ms` is clamped to `MAX_SAFE_TIMEOUT_MS` (5 minutes) by config validation and must be `> 0` if specified -- see the validation logic in `src/config.rs`. 
- **Note:** `Default` cannot be changed to set a timeout without breaking API expectations of callers who deliberately want no timeout (e.g., CLI one-shot invocations). The gotcha is that the unsafe default is the ergonomic choice; document the tradeoff prominently in any new consumer-facing docs. +### 13.2 `EvaluationConfig::default()` Stops at First Top-Level Match + +`EvaluationConfig::default()` sets `stop_at_first_match: true`, so once any top-level rule produces a match the evaluator halts further sibling iteration. Integration tests that rely on later siblings running (e.g., `default`/`clear`/`default` chains, `use`-followed-by-sibling continuation, anything exercising the per-level `sibling_matched` flag across the full chain) must use `MagicDatabase::load_from_file_with_config` with `EvaluationConfig::default().with_stop_at_first_match(false)`. Unit tests that go through `evaluate_rules` directly hit the same issue -- override the config at construction time. The existing `Use` tests (`test_use_child_rules_evaluated_after_subroutine`, `test_use_stop_at_first_match_short_circuits_siblings`) document both sides of this contract. + ## 14. Output Formatting ### 14.1 `\b` (Backspace) Prefix in Rule Messages Suppresses Leading Space -`MagicDatabase::build_result` concatenates rule messages with a space separator, **except** when a message starts with `\u{0008}` (backspace / `\b`), in which case the backspace is stripped and no leading space is inserted. This mirrors GNU `file`'s description formatting (used by rules like `>&1 regex/1l ... \b, version %s` to produce `Ansible Vault text, version 1.1` instead of `Ansible Vault text , version 1.1`). Tests that manually simulate the concatenation path (e.g., corpus tests that bypass `load_from_file` -- see S3.9) must honor this convention or their assertions will diverge from the real evaluator output. 
+`MagicDatabase::build_result` concatenates rule messages with a space separator, **except** when a message starts with `\u{0008}` (backspace / `\b`), in which case the backspace is stripped and no leading space is inserted. This mirrors GNU `file`'s description formatting (used by rules like `>&1 regex/1l ... \b, version %s` to produce `Ansible Vault text, version 1.1` instead of `Ansible Vault text , version 1.1`). Tests that manually simulate the concatenation path (e.g., corpus tests that bypass `load_from_file` -- see S3.11) must honor this convention or their assertions will diverge from the real evaluator output. + +### 14.2 Printf-Style Format Specifiers Are Substituted by `format_magic_message` + +Magic rule messages like `at_offset %lld` or `followed_by 0x%02x` are substituted with the rule's `RuleMatch.value` at description-assembly time, via `src/output/format.rs::format_magic_message`. The supported subset covers `%d`, `%i`, `%u`, `%x`, `%X`, `%o`, `%s`, `%c`, and `%%`, plus width/padding modifiers (`%05d`, `%-5d`) and length modifiers (`l`, `ll`, `h`, `hh`, `j`, `z`, `t`) which are parsed and ignored (all numeric rendering uses `u64`/`i64` width). + +Hex specifiers mask the value to the natural width of the rule's `TypeKind` -- a signed byte carrying `-1` renders as `ff`, not `ffffffffffffffff`. Unknown specifiers pass through literally with a `debug!` log, matching the evaluator's graceful-skip discipline. -### 14.2 `%s` (and Other printf-Style Format Specifiers) Are Not Substituted +Substitution runs BEFORE the backspace check in S14.1, so a rule emitting `\b, version %s` correctly composes with the preceding match without an intervening space after the value is substituted. -Magic rule messages like `\b, version %s` are passed through verbatim to the final concatenated description -- the evaluator does not implement printf-style format substitution. 
Captured values from regex/search/pattern matches live on `RuleMatch.value`, not embedded in `RuleMatch.message`. Tests or output checks that expect substituted text (e.g., "version 1.1") must either hardcode the expected token in the rule's message or assert against `RuleMatch.value` directly. +Tests that manually simulate the concatenation path must run their message strings through `format_magic_message` or construct `RuleMatch::message` strings that contain no `%` metacharacters. A literal `%` in user-facing data should be escaped as `%%`. diff --git a/ROADMAP.md b/ROADMAP.md index 176415e..fb3ed1d 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -34,7 +34,7 @@ See [GitHub Milestones](https://github.com/EvilBit-Labs/libmagic-rs/milestones) - [x] Float and double types ([#40](https://github.com/EvilBit-Labs/libmagic-rs/issues/40)) - [x] Date and timestamp types ([#41](https://github.com/EvilBit-Labs/libmagic-rs/issues/41)) - [x] Pascal string type ([#43](https://github.com/EvilBit-Labs/libmagic-rs/issues/43)) -- [ ] Meta-types: default, clear, name, use, indirect ([#42](https://github.com/EvilBit-Labs/libmagic-rs/issues/42)) +- [x] Meta-types: default, clear, name, use, indirect ([#42](https://github.com/EvilBit-Labs/libmagic-rs/issues/42)) ## v0.4.0 - API and UX Polish diff --git a/benches/evaluation_bench.rs b/benches/evaluation_bench.rs index 2dc1554..8580f5e 100644 --- a/benches/evaluation_bench.rs +++ b/benches/evaluation_bench.rs @@ -12,6 +12,7 @@ use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use libmagic_rs::{EvaluationConfig, MagicDatabase}; use std::hint::black_box; +use std::io::Write; /// Create a minimal ELF 64-bit header for testing fn create_elf64_header() -> Vec { @@ -201,10 +202,51 @@ fn bench_evaluation_configs(c: &mut Criterion) { group.finish(); } +/// Benchmark `name`/`use` subroutine dispatch overhead. 
+/// +/// Establishes a baseline for the meta-type machinery before future +/// Aho-Corasick or compiled-regex caching optimizations. The magic source +/// declares a `part2` subroutine that matches a byte and its top-level rule +/// invokes that subroutine at two offsets via `use part2`. +fn bench_name_use_subroutines(c: &mut Criterion) { + let magic_source = "0 string TEST Testfmt\n\ + >0 use part2\n\ + >4 use part2\n\ + 0 name part2\n\ + >0 byte 0x42 inner_match\n"; + + let tmp_dir = tempfile::tempdir().expect("create temp dir"); + let magic_path = tmp_dir.path().join("subroutines.magic"); + { + let mut f = std::fs::File::create(&magic_path).expect("create magic file"); + f.write_all(magic_source.as_bytes()) + .expect("write magic source"); + } + + let db = MagicDatabase::load_from_file(&magic_path).expect("should load subroutines magic"); + + let mut buf = b"TEST".to_vec(); + buf.push(0x42); + buf.extend_from_slice(&[0u8; 16]); + + let mut group = c.benchmark_group("name_use_subroutines"); + group.throughput(Throughput::Bytes(buf.len() as u64)); + group.bench_function("use_dispatch", |b| { + b.iter(|| { + let result = db + .evaluate_buffer(black_box(&buf)) + .expect("should evaluate"); + black_box(result) + }) + }); + group.finish(); +} + criterion_group!( benches, bench_file_type_detection, bench_buffer_sizes, - bench_evaluation_configs + bench_evaluation_configs, + bench_name_use_subroutines ); criterion_main!(benches); diff --git a/build.rs b/build.rs index e611329..ddb3753 100644 --- a/build.rs +++ b/build.rs @@ -52,7 +52,7 @@ fn main() { } }; - let rules = match parse_text_magic_file(&magic_content) { + let parsed = match parse_text_magic_file(&magic_content) { Ok(parsed) => parsed, Err(err) => { eprintln!("{}", format_parse_error(&err)); @@ -69,7 +69,7 @@ fn main() { }; let output_path = Path::new(&out_dir).join("builtin_rules.rs"); - let generated = codegen::generate_builtin_rules(&rules); + let generated = codegen::generate_builtin_rules(&parsed.rules); 
if let Err(err) = fs::write(&output_path, generated) { eprintln!("Failed to write {}: {err}", output_path.display()); diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 022d6dc..5a06c00 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -82,12 +82,13 @@ libmagic-rs/ │ ├── build_helpers.rs # Build script utilities │ │ │ ├── parser/ # Magic file parsing -│ │ ├── mod.rs # Parser interface -│ │ ├── ast.rs # AST definitions (MagicRule, TypeKind::Byte { signed: bool }, etc.) +│ │ ├── mod.rs # Parser interface, ParsedMagic { rules, name_table } return type +│ │ ├── ast.rs # AST definitions (MagicRule, TypeKind::Meta(MetaType), etc.) │ │ ├── grammar/ # nom-based parsing combinators │ │ │ ├── mod.rs # Rule and type parsing (796 lines) │ │ │ ├── numbers.rs # Decimal/hex number parsing │ │ │ └── value.rs # Value-literal parsing +│ │ ├── name_table.rs # Load-time extraction of `name ` subroutine blocks into NameTable │ │ └── loader.rs # Magic file loading and format detection │ │ │ ├── evaluator/ # Rule evaluation engine @@ -233,8 +234,10 @@ The main entry point for users. Manages rule loading and evaluation. 
```rust pub struct MagicDatabase { - rules: Vec, // Parsed magic rules - config: EvaluationConfig, // Evaluation settings + rules: Arc<[MagicRule]>, // Parsed magic rules (shared, immutable) + root_rules: Arc<[MagicRule]>, // Full top-level rule list for `indirect` re-entry + name_table: Arc, // `name`/`use` subroutine dispatch table + config: EvaluationConfig, // Evaluation settings source_path: Option, // Where rules came from } ``` @@ -297,6 +300,7 @@ pub struct MagicRule { - `String { max_length: Option }` - Null-terminated string - `Regex { flags: RegexFlags, count: RegexCount }` - Regular expression matching (see `RegexCount` for the `Default` / `Bytes(n)` / `Lines(Option)` variants) - `Search { range: NonZeroUsize }` - Bounded literal pattern search +- `Meta(MetaType)` - Control-flow directive: `Default`, `Clear`, `Name(id)`, `Use(id)`, `Indirect` **Hierarchical Structure:** diff --git a/docs/MAGIC_FORMAT.md b/docs/MAGIC_FORMAT.md index 6c1e292..349fad9 100644 --- a/docs/MAGIC_FORMAT.md +++ b/docs/MAGIC_FORMAT.md @@ -480,6 +480,73 @@ Output: `GIF image data, version 89a` --- +## Meta-types / Control Directives + +Meta-types are pseudo-types that do not read bytes from the buffer. Instead, they control the evaluation flow: defining named subroutines, invoking them, providing fallbacks when no sibling matched, resetting per-level match state, or re-applying the entire rule database at a resolved offset. 
+ +| Keyword | Syntax | Description | +| ----------- | ---------------------- | ------------------------------------------------------------------ | +| `name <id>` | `0 name part2` | Defines a named subroutine block; children are the subroutine body | +| `use <id>` | `>0 use part2` | Invokes a named subroutine at the resolved offset | +| `default` | `0 default x Fallback` | Fires only when no sibling at the same level has matched | +| `clear` | `0 clear` | Resets the per-level sibling-matched flag | +| `indirect` | `8 indirect x` | Re-applies the full rule database at the resolved offset | + +### `name` and `use` — Named Subroutines + +`name <id>` defines a named subroutine block at the top level; its children are the subroutine body. `use <id>` invokes that subroutine at a given offset. + +```text +# Define a reusable subroutine +0 name part2 +>0 search/64 ABC found_ABC +>>&0 byte x followed_by 0x%x + +# Top-level rule that invokes the subroutine +0 string TEST Testfmt +>0 use part2 +>64 use part2 +``` + +Top-level `name` blocks are hoisted out of the flat rule list at parse time into a `NameTable` keyed by identifier. Duplicate names retain the first definition and emit a warning. `name` rules nested inside another rule's children are not well-defined in magic(5) and are scrubbed at load time. + +### `default` — Fallback Rule + +A `default` rule at a given level fires only when none of its siblings at the same level have matched. The operator is conventionally `x` (any-value), and the value column is ignored. + +```text +0 byte 0xAA Real-Match +0 default x DEFAULT-FALLBACK +``` + +Against a buffer starting with `0xAA`, only `Real-Match` fires. Against a buffer starting with any other byte, `DEFAULT-FALLBACK` fires. + +### `clear` — Reset Sibling-Matched Flag + +A `clear` directive resets the per-level "sibling matched" flag, so a subsequent `default` at the same level can fire again even after an earlier sibling matched.
Pair with `EvaluationConfig::with_stop_at_first_match(false)` to walk all top-level siblings. + +```text +0 byte 0xAA Match-A +0 default x DEFAULT-SKIPPED +0 clear +0 default x DEFAULT-FIRES +``` + +Against a buffer starting with `0xAA`: `Match-A` fires, `DEFAULT-SKIPPED` is suppressed (a sibling matched), `clear` resets the flag, and `DEFAULT-FIRES` fires. + +### `indirect` — Re-apply Root Rules at a Resolved Offset + +An `indirect` rule resolves its offset, slices the buffer at that point, and re-applies the full rule database against the sub-buffer. Recursion is bounded by `EvaluationConfig::max_recursion_depth`. + +```text +0 byte 0x42 Inner-Match +8 indirect x +``` + +Against a 16-byte buffer with `buf[8] = 0x42`: the top-level `byte` rule at offset 0 does not match, and the `indirect` rule re-applies the root rules at offset 8 — where `buf[8] = 0x42` matches the inner `byte` rule, producing `Inner-Match`. + +--- + ## Best Practices ### 1. Order Rules by Specificity @@ -571,8 +638,6 @@ Consider: - Regex patterns - Float types - 128-bit integer types -- Use/name directives -- Default rules ### Recently Added diff --git a/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md b/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md new file mode 100644 index 0000000..ef91194 --- /dev/null +++ b/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md @@ -0,0 +1,203 @@ +--- +title: Parse-time name table extraction and context-threaded RuleEnvironment for meta-type subroutines +date: 2026-04-22 +status: resolved +severity: medium +category: integration-issues +components: + - parser/name_table + - parser/mod + - parser/loader + - evaluator/mod + - evaluator/engine + - MagicDatabase +tags: + - rust + - parser + - evaluator + - meta-types + - name-use + - subroutine + - rule-environment + - recursion-guard + - control-flow + - architecture-pattern +issue: '#42' +branch: 
42-parser-implement-default-clear-name-use-and-indirect-meta-types +applies_when: + - Implementing a new magic(5) control-flow directive (e.g. indirect, default, clear) + - Adding any whole-database state that evaluation needs to consult outside the current rule + - Considering breaking changes to evaluate_rules / evaluate_rules_with_config +root_cause: Control-flow directives do not fit the evaluator's "resolve offset -> read typed value -> apply operator" pipeline; whole-database state (name tables, root rule re-entry) must live somewhere +solution_files: + - src/parser/name_table.rs + - src/parser/mod.rs + - src/parser/loader.rs + - src/evaluator/mod.rs + - src/evaluator/engine/mod.rs + - src/error.rs + - src/lib.rs + - tests/meta_types_integration.rs +related_gotchas: + - S2.1 TypeKind exhaustive-match discipline still applies; the new Meta(Use) arm is dispatched from evaluate_rules, not evaluate_single_rule_with_anchor + - S3 parser architecture now produces ParsedMagic { rules, name_table }, not Vec + - Property tests synthesize arbitrary TypeKind values; evaluator arms for Meta must debug!-log rather than debug_assert!-panic +--- + +# Parse-time name table extraction and context-threaded RuleEnvironment for meta-type subroutines + +## Context + +The magic(5) grammar includes directives that are **control-flow, not value-reading**: `name`/`use` (callable subroutines), `indirect` (re-enter the full rule set against an offset computed from the current buffer), and `default`/`clear` (sibling-chain predicates that depend on whether any prior rule at the same level matched). These don't fit the evaluator's core pipeline -- `resolve offset -> read typed value -> coerce expected value -> apply operator -> produce RuleMatch`. There is nothing to read, no operator to apply, and the output is either "splice in another rule list's matches" (`use`, `indirect`) or "alter dispatch of the next sibling" (`default`, `clear`). 
+ +Phase 1 (issue #42, earlier on the same branch) absorbed the grammar and AST by modeling these as `TypeKind::Meta(MetaType::...)` and treating them as silent no-ops in `evaluate_single_rule_with_anchor`. That was the right way to ship the parser without blocking the evaluator, but silent no-ops are not a viable endpoint: real third-party magic corpora (the GNU `file` distribution's `Magdir/` tree, fed into libmagic-rs via `third_party/`) make heavy use of `use` for shared subroutines (MS Office variants, archive headers, JPEG/EXIF chains), and silently dropping them produces visibly inferior classification output versus GNU `file`. + +The question this phase answered was: where in the **parse -> database -> evaluate** pipeline should whole-database concerns like "look up a subroutine by name" live? + +## Guidance: the three-layer pattern + +### Layer 1 -- parse-time extraction, not runtime lookup + +`Meta(Name(id))` rules are hoisted *out* of the flat rule list at load time by `src/parser/name_table.rs::extract_name_table`, which returns a `(Vec<MagicRule>, NameTable)` pair. The evaluator's hot loop never sees a `Name` rule at all; duplicate-name detection is a one-shot `warn!` at parse time rather than a per-evaluation cost, and nested `Name` rules (not well-defined in magic(5)) are scrubbed with a warning. + +```rust +pub(crate) fn extract_name_table( + rules: Vec<MagicRule>, +) -> (Vec<MagicRule>, NameTable) { + // For each top-level rule: if Meta(Name(id)), move children into + // the table keyed by id; otherwise keep the rule and scrub any + // stray nested Name rules out of its children. +} +``` + +### Layer 2 -- `ParsedMagic` as the parser return type + +`parse_text_magic_file`, `load_magic_file`, and `load_magic_directory` now return `Result<ParsedMagic { rules: Vec<MagicRule>, name_table: NameTable }, ParseError>` instead of `Result<Vec<MagicRule>, ParseError>`. Directory loads merge per-file name tables with a first-wins policy (matching GNU `file` behavior: earlier `Magdir/` files shadow later ones, logged at `warn!`).
+ +All callers destructure at the boundary: + +```rust +let ParsedMagic { rules, name_table } = parse_text_magic_file(&source)?; +// codegen uses `rules`; runtime attaches `name_table` to the database +``` + +### Layer 3 -- optional `RuleEnvironment` threaded through `EvaluationContext` + +Whole-database state lives in: + +```rust +pub(crate) struct RuleEnvironment { + name_table: Arc<NameTable>, +} +``` + +`EvaluationContext` gained a `rule_env: Option<Arc<RuleEnvironment>>` field. `MagicDatabase::evaluate_file` attaches the environment before calling `evaluate_rules`; programmatic consumers (`evaluate_rules_with_config`, property tests, fuzz harnesses) default to `None`, and `Use` rules then become silent no-ops. + +`Arc` (not `&`) because the context already outlives individual rule borrows, and property tests construct contexts without a lifetime parameter on `EvaluationContext`. + +A second field -- `root_rules: Arc<[MagicRule]>` -- is carried on the real struct to serve `indirect` when it lands. That is a deliberate YAGNI exception: `MagicDatabase` already holds an `Arc<[MagicRule]>` at construction time, so adding the re-entry point now costs one field-copy and zero future parser work. Do not extrapolate from it -- add environment state when the consuming directive is in the same phase, not speculatively. + +## Why this matters + +Four alternatives were considered and rejected. Each rejection is load-bearing for future meta-type work; revisit the rationale before reverting any of them. + +**Rejected: runtime lookup in the hot loop.** Walking the rule list every evaluation to resolve a `use name` target would turn a flat O(N) dispatch into an amortized quadratic one, and would make duplicate-name detection a *per-buffer* cost rather than *per-load*. The parse-time hoist pays the cost exactly once per magic file.
+ +**Rejected: non-optional `RuleEnvironment` / new required arg to `evaluate_rules`.** A cleaner API would have `RuleEnvironment` as a required field -- it is required for correct `use` evaluation. The concrete reason to make it optional is not API stability in the abstract; it is that every property test, fuzz harness, and in-tree integration test that calls `evaluate_rules` with a hand-built rule tree would have to synthesize and pass an empty environment to keep compiling. Under the "every meta-type will eventually need environment state" worldview that is cheap. Under the actual Phase 2 scope -- one directive needs it -- the churn buys nothing. Make it optional on the context now; tighten if we ever need to enforce "`use` must have an environment" as a contract. (session history) + +**Rejected: `debug_assert!` that `Name` rules never reach the evaluator.** `prop_arbitrary_rule_evaluation_never_panics` synthesizes arbitrary `TypeKind` instances, including `Meta(Name(_))`, and feeds them directly to `evaluate_single_rule`. A `debug_assert!` there would break the never-panics invariant the entire property test exists to enforce. The implementation uses `debug!` logging instead -- correct in production, non-fatal in property-test space. + +**Rejected: dispatching `Use` through `evaluate_single_rule_with_anchor`.** The single-rule helper returns `Result, _>` -- one match, one value. `Use` produces a *vector* of child matches that must be spliced into the caller's match buffer in document order. Pushing that semantic through the helper would have reshaped its return type to `Vec` and cascaded through every other `TypeKind` branch. Keeping `Use` at the `evaluate_rules` level is a cleaner seam. 
(session history) + +## When to apply + +The three-layer pattern is the template for every remaining magic(5) control-flow directive: + +- **`indirect`** (next phase): resolve an offset, reinterpret the bytes there as the beginning of a rule stream, and evaluate `env.root_rules` (already staged on `RuleEnvironment`) at that offset. Layer 1 is trivial (no hoist -- `indirect` is a value-position directive, not a top-level declaration); Layer 3 provides `root_rules` as the re-entry point. Note the anchor semantics differ from `use`: `indirect` starts fresh at the resolved offset and does **not** save/restore the caller's `last_match_end`, whereas `use` is a scoped subroutine that saves, seeds, and restores. (session history) +- **`default`/`clear`**: sibling-chain predicates. These need a new `MatchStateTracker` threaded alongside `last_match_end` in `EvaluationContext` (tracks "did any prior sibling at this level match"). The same "optional per-evaluation state field on the context, programmatic consumers default to off" pattern applies directly. +- **Future `!:mime` / `!:ext` / `!:apple` directive evaluation** (tracked under v0.6.0's `Directive` extension point): same shape -- extracted at parse time into a per-rule directive table, threaded via `RuleEnvironment`, consulted only by the match-accumulation path, not the hot read loop. + +The general rule: **if a directive's meaning depends on state outside the single rule being evaluated, hoist it at parse time into an environment that rides alongside the context. 
Never reach for the whole rule tree from inside the evaluation loop.** + +## Examples + +### The `Use` dispatch in `evaluate_rules` (`src/evaluator/engine/mod.rs`) + +```rust +if let TypeKind::Meta(MetaType::Use(name)) = &rule.typ { + match evaluate_use_rule(rule, name, buffer, context) { + Ok((Some(absolute_offset), subroutine_matches)) => { + matches.extend(subroutine_matches); + // Re-advance the anchor to the use-site offset so sibling + // rules resolve relative offsets from the use-site end. + context.set_last_match_end(absolute_offset); + } + Ok((None, _)) => { /* no env or name not found -- no-op */ } + // Error handling: demote buffer/offset errors to a debug log, + // propagate everything else. + Err(e) => return Err(e), // (simplified; see source for skip arms) + } + continue; +} +``` + +The anchor save/restore inside `evaluate_use_rule` seeds the subroutine with the use-site offset, then restores the caller's anchor; after returning, the outer loop re-advances to the use-site offset so sibling rules see the `use` as having "consumed" the use-site position. Mutual recursion (`a use b; b use a`) is caught by `RecursionGuard::enter(context)?` and surfaced as `EvaluationError::RecursionLimitExceeded`. + +One subtlety the first Phase 3 attempt got wrong: the `Use` rule's own *children* (continuation rules at deeper indentation following the `use` directive) must still be evaluated after the subroutine returns. The initial implementation skipped them, silently breaking valid libmagic chains. The fix evaluates the `use` rule's children after the named rule body completes. 
(session history) + +### The `ParsedMagic` destructure pattern at call sites + +```rust +// build.rs / src/build_helpers.rs -- codegen does not need the name table +let parsed = parse_text_magic_file(&source)?; +generate_rules_module(&parsed.rules, out_path)?; + +// src/lib.rs::MagicDatabase::load_from_file_with_config +let ParsedMagic { rules, name_table } = + parser::load_magic_file(path.as_ref())?; +// ... strength-sort `rules` and each subroutine in `name_table.values_mut()`, +// then construct the database with Arc-wrapped state. +``` + +Each subroutine body is strength-sorted recursively the same way top-level rules are, so evaluation of a `use` site is deterministic regardless of source order inside the `name` block. + +### Property-test-safe leaked-`Name` handling (`evaluate_single_rule_with_anchor`) + +```rust +TypeKind::Meta(MetaType::Name(name)) => { + // Normally hoisted at parse time; reaching here means a + // programmatic consumer (property test, fuzz harness) built + // the rule directly. Log and no-op -- a debug_assert would + // break prop_arbitrary_rule_evaluation_never_panics. + debug!( + "Name rule '{name}' reached evaluator (likely bypassed \ + name-table extraction); treating as no-op" + ); + return Ok(None); +} +TypeKind::Meta(MetaType::Use(_)) => { + // `Use` is dispatched inline by `evaluate_rules`. Reaching + // this arm means the rule went through the single-rule path + // (e.g. evaluate_single_rule) which lacks that wiring. + return Ok(None); +} +TypeKind::Meta(_) => return Ok(None), +``` + +The asymmetry between `debug!` (production-safe and test-safe) and `debug_assert!` (production-safe but test-hostile) is the load-bearing detail future maintainers will want to preserve when adding `indirect`, `default`, and `clear` arms here. + +## Prevention + +- When adding a new `MetaType` variant, add an explicit arm to the match in `evaluate_single_rule_with_anchor`. 
The catch-all `TypeKind::Meta(_) => return Ok(None)` is the default, but anything needing inline dispatch (like `Use`) should be handled at the `evaluate_rules` loop level, not in the single-rule helper. This is catalogued under GOTCHAS S2.1. +- The smoke test in `tests/meta_types_integration.rs` evaluates `third_party/tests/searchbug.magic` (the GNU `file` fixture exercising `name`/`use` + `search/N` + relative offsets). The assertion `result.description.starts_with("Testfmt")` guards the primary regression target for subroutine dispatch plus continuation rules -- a weaker non-empty check alone passes even when `use`-site children are silently skipped. +- The unit-test helper `build_name_table` in `src/evaluator/engine/tests.rs` goes through the real `extract_name_table` path rather than inserting directly into the `HashMap`. New subroutine tests should follow the same convention so they exercise the production extraction code. +- `RecursionGuard::enter(context)?` (not manual increment/decrement) inside any new meta-type dispatch. Mutual recursion between subroutines is a real failure mode; the guard is the only correct way to surface it as `EvaluationError::RecursionLimitExceeded` instead of a stack overflow. + +## Related + +- [`integration-issues/indirect-offset-parser-evaluator-sync.md`](indirect-offset-parser-evaluator-sync.md) -- closest sibling pattern: AST variant existed but was unreachable from `MagicDatabase::load_from_file()` until parser and evaluator were wired together. Different surface (offset syntax vs. directive dispatch) but same "parser-evaluator sync" shape. Consolidation review may be worthwhile once `indirect` meta-type lands. +- [`integration-issues/implementing-variable-width-typekind-variant.md`](implementing-variable-width-typekind-variant.md) -- same discipline around "adding a TypeKind variant that does not fit the fixed-shape `read_typed_value` pipeline"; relevant precedent for dispatch threading. 
+- [`logic-errors/indirect-offset-gnu-file-semantics.md`](../logic-errors/indirect-offset-gnu-file-semantics.md) -- precedent for honoring GNU `file` semantics in a meta-directive. +- [`developer-experience/rust-test-visibility-boundary.md`](../developer-experience/rust-test-visibility-boundary.md) -- the `pub(crate)` accessor pattern used for `RuleEnvironment` and `NameTable`. +- GOTCHAS.md S2.1 (TypeKind exhaustive matches), S3 (parser architecture -- now yields `ParsedMagic { rules, name_table }`), S13 (evaluation configuration -- `use` recursion bounded by the existing recursion-depth guard). +- GitHub issues: #42 (driving), #54 (parent epic: Type System Expansion), #48 (third_party/tests compatibility baseline). diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 9369bdc..1334d1f 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -52,7 +52,8 @@ The parser is responsible for converting magic files (text-based DSL) into an Ab - `mod.rs`: Main grammar dispatcher (796 lines) - `numbers.rs`: Numeric type parsing (decimal/hex, signed/unsigned) - `value.rs`: Value literal parsing (strings, floats, hex bytes) -- `mod.rs`: Parser interface, format detection, and hierarchical rule building (✅ Complete) +- `name_table.rs`: Load-time extraction of `name <id>` subroutine blocks into a `HashMap<String, Vec<MagicRule>>` (the `NameTable` type).
+- `mod.rs`: Parser interface, format detection, hierarchical rule building, and the `ParsedMagic { rules, name_table }` return type for `parse_text_magic_file` and `load_magic_directory` (✅ Complete) **Responsibilities:** @@ -104,6 +105,15 @@ pub enum TypeKind { length_width: PStringLengthWidth, length_includes_itself: bool, }, // Pascal string (length-prefixed) + Meta(MetaType), // Control-flow directive (see below) +} + +pub enum MetaType { + Default, // `default` fallback rule + Clear, // `clear` resets sibling-matched flag + Name(String), // `name <id>` subroutine declaration (hoisted at load time) + Use(String), // `use <id>` subroutine invocation + Indirect, // `indirect` re-applies root rules at the resolved offset } pub enum Operator { @@ -137,6 +147,10 @@ pub enum PStringLengthWidth { - **Type-safe**: Rust's type system prevents invalid rule combinations - **Explicit signedness**: `TypeKind::Byte` and integer types (Short, Long, Quad) distinguish signed from unsigned interpretations +**Parsed Output:** + +`parse_text_magic_file` and `load_magic_directory` return `ParsedMagic { rules: Vec<MagicRule>, name_table: NameTable }` rather than a bare rule list. Top-level `name <id>` blocks are hoisted out of `rules` into `name_table` at load time so the evaluator can dispatch `MetaType::Use` invocations without a linear scan. + **PString Length Prefix Support:** The `PString` type supports multiple length prefix formats through the `length_width` field: @@ -155,9 +169,9 @@ The evaluator executes magic rules against file buffers to identify file types. **Structure:** -- `mod.rs`: Public API surface (~720 lines) with `EvaluationContext`, `RuleMatch` types, and re-exports +- `mod.rs`: Public API surface (~720 lines) with `EvaluationContext`, `RuleMatch` types, and re-exports.
Also defines `pub(crate) struct RuleEnvironment { root_rules, name_table }` — the optional environment threaded through `EvaluationContext::rule_env` so the engine can dispatch `MetaType::Use` and `MetaType::Indirect` without taking an extra parameter on every function. - `engine/`: Core evaluation engine submodule - - `mod.rs`: `evaluate_single_rule`, `evaluate_rules`, and `evaluate_rules_with_config` functions + - `mod.rs`: `evaluate_single_rule`, `evaluate_rules`, and `evaluate_rules_with_config` functions. Inline dispatch for `MetaType::Default`, `MetaType::Clear`, `MetaType::Use`, and `MetaType::Indirect` lives in the `evaluate_rules` loop body. - `tests.rs`: Engine unit tests - `types/`: Type interpretation submodule - `mod.rs`: Public API surface with `read_typed_value`, `coerce_value_to_type`, and type re-exports @@ -427,6 +441,7 @@ pub enum EvaluationError { flowchart TD L[lib.rs<br/>Public API and coordination<br/>624 lines] C[config.rs<br/>EvaluationConfig<br/>307 lines] + NT[NameTable<br/>name/use subroutines] L --> C L --> P[parser/<br/>Magic file parsing] L --> E[evaluator/<br/>Rule evaluation engine] @@ -435,6 +450,8 @@ flowchart TD L --> ER[error.rs<br/>Error types] P --> ER + P --> NT + NT --> E E --> P E --> C E --> I @@ -443,6 +460,7 @@ flowchart TD style L fill:#2a1a4a,stroke:#b39ddb,color:#e0e0e0 style C fill:#1b3d1b,stroke:#66bb6a,color:#e0e0e0 + style NT fill:#2a1a4a,stroke:#b39ddb,color:#e0e0e0 style P fill:#4a3000,stroke:#ffb74d,color:#e0e0e0 style E fill:#4a3000,stroke:#ffb74d,color:#e0e0e0 style O fill:#4a3000,stroke:#ffb74d,color:#e0e0e0 diff --git a/docs/src/ast-structures.md b/docs/src/ast-structures.md index cff3ed5..916b66e 100644 --- a/docs/src/ast-structures.md +++ b/docs/src/ast-structures.md @@ -439,6 +439,61 @@ let case_start = TypeKind::Regex { }; ``` +### Meta-types (Control Directives) + +The `Meta` variant represents pseudo-types that do not read bytes from the buffer. They encode control-flow directives inherited from the libmagic magic(5) format. + +**Structure:** + +```rust +/// Control-flow directives that do not read bytes from the buffer. +Meta(MetaType), +``` + +`TypeKind::Meta(_)` returns `None` from `bit_width()` because meta-types consume zero on-disk bytes. + +**MetaType Enum:** + +```rust +pub enum MetaType { + /// `default` — fires only when no sibling at the same level has matched. + Default, + /// `clear` — resets the per-level sibling-matched flag. + Clear, + /// `name <id>` — defines a named subroutine (hoisted out of the rule list at load time). + Name(String), + /// `use <id>` — invokes a named subroutine at the resolved offset. + Use(String), + /// `indirect` — re-applies the full rule database at the resolved offset.
+ Indirect, +} +``` + +**Examples:** + +```rust +use libmagic_rs::parser::ast::{TypeKind, MetaType}; + +// Default fallback rule +let default_rule = TypeKind::Meta(MetaType::Default); + +// Clear sibling-matched flag +let clear_rule = TypeKind::Meta(MetaType::Clear); + +// Named subroutine declaration +let name_rule = TypeKind::Meta(MetaType::Name("part2".to_string())); + +// Subroutine invocation +let use_rule = TypeKind::Meta(MetaType::Use("part2".to_string())); + +// Re-entry into root rules +let indirect_rule = TypeKind::Meta(MetaType::Indirect); +``` + +**Parse-time Name Extraction:** + +Top-level `name <id>` rules are hoisted out of `ParsedMagic::rules` by `parser::name_table::extract_name_table` and placed into the `name_table: NameTable` field of `ParsedMagic` keyed by identifier. As a result, `MetaType::Name` variants in the final parsed rule list are expected only as an internal intermediate representation — `name` rules do not survive past the load boundary in normal operation. + ### Search (Bounded Literal Byte Sequence Search) The `Search` variant scans for a literal byte pattern within a bounded range. Unlike `String`, which matches only at the exact offset, `Search` scans forward up to `range` bytes for the first occurrence.
diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index ea0d51c..30f2ec9 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -25,8 +25,8 @@ Memory Map Context State Endian Handling Match Logic Hierarchical The evaluator module separates public interface from implementation: -- **`evaluator/mod.rs`** - Public API surface: defines `EvaluationContext` and `RuleMatch` types, re-exports core evaluation functions from the engine submodule -- **`evaluator/engine/mod.rs`** - Core evaluation implementation: `evaluate_single_rule`, `evaluate_rules`, `evaluate_rules_with_config` +- **`evaluator/mod.rs`** - Public API surface: defines `EvaluationContext` and `RuleMatch` types, re-exports core evaluation functions from the engine submodule. Also defines `pub(crate) struct RuleEnvironment { root_rules, name_table }` — the optional environment attached to `EvaluationContext::rule_env` that carries the full rule list and the `name`/`use` subroutine table for meta-type dispatch. +- **`evaluator/engine/mod.rs`** - Core evaluation implementation: `evaluate_single_rule`, `evaluate_rules`, `evaluate_rules_with_config`. Also hosts the per-level `sibling_matched` bookkeeping and inline dispatch for `MetaType::Default`, `MetaType::Clear`, `MetaType::Use`, and `MetaType::Indirect`. - **`evaluator/offset/mod.rs`** - Offset resolution - **`evaluator/operators/mod.rs`** - Operator application - **`evaluator/types/`** - Type reading and coercion (organized as submodules as of v0.4.2) @@ -360,6 +360,37 @@ pub fn evaluate_rules( 3. Results accumulate hierarchically (parent message + child details) +### Meta-type Dispatch + +Before calling `evaluate_single_rule_with_anchor` for a value-read rule, `evaluate_rules` inspects the rule's `TypeKind` for meta-type dispatch. Each `MetaType` variant has distinct semantics: + +- **`MetaType::Clear`**: Sets the per-level `sibling_matched` flag to `false`. No match is recorded, the anchor is unchanged, and children are not evaluated. 
+- **`MetaType::Default`**: Fires only when `!sibling_matched` at the current level. On fire, records a `RuleMatch`, evaluates children (inheriting the match context), and sets `sibling_matched = true`. +- **`MetaType::Use(name)`**: Looks up `name` in `RuleEnvironment::name_table`. On hit, evaluates the subroutine's child rules at the resolved offset, propagates their matches into the caller's match vector, then also evaluates the `use` rule's own `rule.children`. On miss, logs a `warn!` and returns `Ok(None)` (treated as non-match). +- **`MetaType::Indirect`**: Resolves the rule's offset against the buffer, slices the buffer at that point, resets the `EvaluationContext` anchor to 0, calls `evaluate_rules` recursively with `RuleEnvironment::root_rules` (the complete top-level rule list), and then restores the caller's anchor on return. Recursion is bounded by `EvaluationConfig::max_recursion_depth`. +- **`MetaType::Name`**: Unreachable after load-time extraction — `name` blocks are hoisted out of the rule list by `parser::name_table::extract_name_table` before the evaluator ever sees them. Defensive arm returns `Ok(None)` and emits a `debug!` rather than `debug_assert!` so that property tests synthesizing arbitrary `TypeKind` values do not break the never-panics invariant. 
+ +```mermaid +sequenceDiagram + participant ER as evaluate_rules + participant ESR as inline meta-type dispatch + participant NT as NameTable + participant RR as root_rules + + ER->>ESR: rule (Use "part2") + ESR->>NT: lookup("part2") + NT-->>ESR: Vec<MagicRule> (subroutine) + ESR->>ER: evaluate_rules(subroutine, buffer, ctx) + ER-->>ESR: subroutine matches + ESR-->>ER: Ok(Some(offset, value)) + merged matches + + ER->>ESR: rule (Indirect) + ESR->>RR: clone root_rules + ESR->>ER: evaluate_rules(root_rules, sub_buffer, ctx) + ER-->>ESR: inner matches + ESR-->>ER: Ok(Some(offset, value)) + merged matches +``` + ### Hierarchical Processing ```mermaid @@ -494,13 +525,15 @@ let magic_content = r#" >4 byte 1 32-bit >4 byte 2 64-bit "#; -let rules = parse_text_magic_file(magic_content)?; +let parsed = parse_text_magic_file(magic_content)?; // Read target file let buffer = std::fs::read("sample.bin")?; -// Evaluate with default config -let matches = evaluate_rules(&rules, &buffer)?; +// Evaluate with default config. The low-level `evaluate_rules` takes only +// the top-level rules; `parsed.name_table` is handled by `MagicDatabase` +// (see library-api.md) and is ignored here.
+let matches = evaluate_rules(&parsed.rules, &buffer)?; for m in matches { println!("Match at offset {}: {}", m.offset, m.message); @@ -518,10 +551,10 @@ let magic_content = r#" 0 leshort <100 Small value detected 0 leshort >=1000 Large value detected "#; -let rules = parse_text_magic_file(magic_content)?; +let parsed = parse_text_magic_file(magic_content)?; let buffer = vec![0x0A, 0x00]; // Little-endian 10 -let matches = evaluate_rules(&rules, &buffer)?; +let matches = evaluate_rules(&parsed.rules, &buffer)?; // Matches first rule (<100) assert_eq!(matches[0].message, "Small value detected"); @@ -538,11 +571,11 @@ let magic_content = r#" 0 lefloat 3.14159 Pi constant detected 0 bedouble >100.0 Large double value "#; -let rules = parse_text_magic_file(magic_content)?; +let parsed = parse_text_magic_file(magic_content)?; // IEEE 754 little-endian representation of 3.14159f32 let buffer = vec![0xd0, 0x0f, 0x49, 0x40]; -let matches = evaluate_rules(&rules, &buffer)?; +let matches = evaluate_rules(&parsed.rules, &buffer)?; assert_eq!(matches[0].message, "Pi constant detected"); ``` @@ -561,20 +594,20 @@ let magic_content = r#" 0 pstring/L =\x00\x00\x00\x05MAGIC Pascal string (4-byte BE prefix) 0 pstring/l =\x05\x00\x00\x00MAGIC Pascal string (4-byte LE prefix) "#; -let rules = parse_text_magic_file(magic_content)?; +let parsed = parse_text_magic_file(magic_content)?; // 1-byte prefix: length=5, then "MAGIC" let buffer = b"\x05MAGIC"; -let matches = evaluate_rules(&rules, &buffer)?; +let matches = evaluate_rules(&parsed.rules, &buffer)?; assert_eq!(matches[0].message, "Pascal string (1-byte prefix)"); // 2-byte big-endian prefix with /J flag: stored length 7 (includes 2-byte prefix), effective content 5 bytes let magic_content_j = r#" 0 pstring/HJ =MAGIC JPEG-style pstring with self-inclusive length "#; -let rules_j = parse_text_magic_file(magic_content_j)?; +let parsed_j = parse_text_magic_file(magic_content_j)?; let buffer_j = b"\x00\x07MAGIC"; // 2-byte BE prefix: 
value 7, minus 2 = 5 bytes of content -let matches_j = evaluate_rules(&rules_j, &buffer_j)?; +let matches_j = evaluate_rules(&parsed_j.rules, &buffer_j)?; assert_eq!(matches_j[0].message, "JPEG-style pstring with self-inclusive length"); ``` @@ -594,6 +627,7 @@ assert_eq!(matches_j[0].message, "JPEG-style pstring with self-inclusive length" - [x] Relative offset support (GNU `file` anchor semantics, issue #38) - [x] Regex type support (binary-safe `regex::bytes::Regex` with `/c`, `/s`, `/l` flags and 8192-byte cap; unconditional `REG_NEWLINE`) - [x] Search type support (bounded literal pattern scan via `memchr::memmem::find` with mandatory `NonZeroUsize` range) +- [x] Meta-type directives: `default`, `clear`, `name`/`use` subroutines, `indirect` re-evaluation (issue #42) - [ ] Performance optimizations (rule ordering, caching) ## Performance Considerations diff --git a/docs/src/magic-format.md b/docs/src/magic-format.md index b7c9775..e60ed2d 100644 --- a/docs/src/magic-format.md +++ b/docs/src/magic-format.md @@ -514,6 +514,71 @@ Output: `GIF image data, version 89a` 0 bedouble =0.45455 PNG image with gamma 0.45455 ``` +## Meta-types / Control Directives + +Meta-types are pseudo-types that do not read bytes from the buffer. Instead, they control the evaluation flow: defining named subroutines, invoking them, providing fallbacks when no sibling matched, resetting per-level match state, or re-applying the entire rule database at a resolved offset. 
+ +| Keyword | Syntax | Description | +| ----------- | ---------------------- | ------------------------------------------------------------------ | +| `name <id>` | `0 name part2` | Defines a named subroutine block; children are the subroutine body | +| `use <id>` | `>0 use part2` | Invokes a named subroutine at the resolved offset | +| `default` | `0 default x Fallback` | Fires only when no sibling at the same level has matched | +| `clear` | `0 clear` | Resets the per-level sibling-matched flag | +| `indirect` | `8 indirect x` | Re-applies the full rule database at the resolved offset | + +### `name` and `use` — Named Subroutines + +`name <id>` defines a named subroutine block at the top level; its children are the subroutine body. `use <id>` invokes that subroutine at a given offset. + +```text +# Define a reusable subroutine +0 name part2 +>0 search/64 ABC found_ABC +>>&0 byte x followed_by 0x%x + +# Top-level rule that invokes the subroutine +0 string TEST Testfmt +>0 use part2 +>64 use part2 +``` + +Top-level `name` blocks are hoisted out of the flat rule list at parse time into a `NameTable` keyed by identifier. Duplicate names retain the first definition and emit a warning. `name` rules nested inside another rule's children are not well-defined in magic(5) and are scrubbed at load time. + +### `default` — Fallback Rule + +A `default` rule at a given level fires only when none of its siblings at the same level have matched. The operator is conventionally `x` (any-value), and the value column is ignored. + +```text +0 byte 0xAA Real-Match +0 default x DEFAULT-FALLBACK +``` + +Against a buffer starting with `0xAA`, only `Real-Match` fires. Against a buffer starting with any other byte, `DEFAULT-FALLBACK` fires. + +### `clear` — Reset Sibling-Matched Flag + +A `clear` directive resets the per-level "sibling matched" flag, so a subsequent `default` at the same level can fire again even after an earlier sibling matched. 
Pair with `EvaluationConfig::with_stop_at_first_match(false)` to walk all top-level siblings. + +```text +0 byte 0xAA Match-A +0 default x DEFAULT-SKIPPED +0 clear +0 default x DEFAULT-FIRES +``` + +Against a buffer starting with `0xAA`: `Match-A` fires, `DEFAULT-SKIPPED` is suppressed (a sibling matched), `clear` resets the flag, and `DEFAULT-FIRES` fires. + +### `indirect` — Re-apply Root Rules at a Resolved Offset + +An `indirect` rule resolves its offset, slices the buffer at that point, and re-applies the full rule database against the sub-buffer. Recursion is bounded by `EvaluationConfig::max_recursion_depth`. + +```text +0 byte 0x42 Inner-Match +8 indirect x +``` + +Against a 16-byte buffer with `buf[8] = 0x42`: the top-level `byte` rule at offset 0 does not match, and the `indirect` rule re-applies the root rules at offset 8 — where `buf[8] = 0x42` matches the inner `byte` rule, producing `Inner-Match`. + ## Best Practices ### 1. Order Rules by Specificity @@ -603,8 +668,6 @@ Consider: - Regex patterns - 128-bit integer types -- Use/name directives -- Default rules ### Recently Added diff --git a/docs/src/parser.md b/docs/src/parser.md index dde31ca..1160869 100644 --- a/docs/src/parser.md +++ b/docs/src/parser.md @@ -429,6 +429,63 @@ parse_type_and_operator("search/0") - ✅ Bare `search` and `search/0` rejected at parse time - ✅ Binary-safe literal matching via `memchr::memmem::find` +### Meta-type Directives (`name`, `use`, `default`, `clear`, `indirect`) + +The parser supports five meta-type directives that represent control-flow rather than buffer reads. They all parse into the `TypeKind::Meta(MetaType)` AST variant and carry no endianness or width. 
+ +**Type Keywords and `MetaType` Variants:** + +| Keyword | `MetaType` Variant | Role | +| ----------- | ------------------------ | -------------------------------------------------------------- | +| `name <id>` | `MetaType::Name(String)` | Declares a named subroutine; children form the subroutine body | +| `use <id>` | `MetaType::Use(String)` | Invokes a named subroutine at the resolved offset | +| `default` | `MetaType::Default` | Fires only when no sibling at the same level has matched | +| `clear` | `MetaType::Clear` | Resets the per-level sibling-matched flag | +| `indirect` | `MetaType::Indirect` | Re-applies the root rule set at the resolved offset | + +Meta-types have `bit_width() == None` because they consume zero on-disk bytes. + +**`ParsedMagic` Return Type (Breaking Change):** + +`parse_text_magic_file`, `load_magic_file`, and `load_magic_directory` now return `Result<ParsedMagic, ParseError>` (not `Result<Vec<MagicRule>, ParseError>`). The `ParsedMagic` struct carries both the top-level rules and a name table: + +```rust +pub struct ParsedMagic { + pub rules: Vec<MagicRule>, + pub name_table: NameTable, +} +``` + +Callers must destructure at the boundary: + +```rust +use libmagic_rs::parser::parse_text_magic_file; + +let magic = r#"0 string \x7fELF ELF file +>4 byte 1 32-bit"#; + +let parsed = parse_text_magic_file(magic)?; +assert_eq!(parsed.rules.len(), 1); // One root rule +assert_eq!(parsed.rules[0].children.len(), 1); // One child rule +// parsed.name_table holds any `name <id>` blocks extracted at load time +``` + +**Load-time Name Extraction:** + +Top-level `name <id>` rules are hoisted *out* of `ParsedMagic::rules` by `parser::name_table::extract_name_table` and placed into `name_table` keyed by identifier. As a result: + +- `name` rules do not appear in `ParsedMagic::rules` at all — only `use <id>` invocations remain to drive subroutine dispatch at evaluation time. +- Duplicate `name` declarations keep the first definition and emit a `warn!`. 
+- `name` rules that appear as children (not at level 0) are not well-defined in magic(5); they are scrubbed from the tree with a `warn!` during extraction. + +**Features:** + +- ✅ All five keywords recognized by `parse_type_keyword` + `type_keyword_to_kind` +- ✅ Round-trip through `serialize_type_kind` in `codegen.rs` +- ✅ Top-level `name` extraction into `NameTable` +- ✅ Defensive scrubbing of misplaced nested `name` rules +- ✅ First-wins merge across directory loads + ## Parser Design Principles ### Error Handling @@ -513,9 +570,10 @@ let magic_content = r#" >4 byte 2 64-bit "#; -let rules = parse_text_magic_file(magic_content)?; -assert_eq!(rules.len(), 1); // One root rule -assert_eq!(rules[0].children.len(), 2); // Two child rules +let parsed = parse_text_magic_file(magic_content)?; +assert_eq!(parsed.rules.len(), 1); // One root rule +assert_eq!(parsed.rules[0].children.len(), 2); // Two child rules +// parsed.name_table holds any top-level `name <id>` subroutine blocks ``` The parser distinguishes between signed and unsigned type variants (e.g., `byte` vs `ubyte`, `leshort` vs `uleshort`), mapping them to the `signed` field in `TypeKind::Byte { signed: bool }` and similar type variants. Unprefixed types default to signed in accordance with libmagic conventions. Float and double types do not have signed/unsigned variants; IEEE 754 handles sign internally. @@ -538,9 +596,8 @@ match detect_format(path)? 
{ ### Not Yet Implemented -- **Indirect Offsets**: Pointer dereferencing patterns (e.g., `(0x3c.l)`) - **Binary .mgc Format**: Compiled magic database format -- **Strength Modifiers**: `!:strength` parsing for rule priority +- **`offset` pseudo-type**: The `offset` keyword used in `searchbug.magic` for `at_offset %lld` output ### Planned Enhancements @@ -556,19 +613,20 @@ The parser provides a complete pipeline from text to AST: use libmagic_rs::parser::{parse_text_magic_file, detect_format, MagicFileFormat}; // Detect format and parse accordingly -let rules = match detect_format(path)? { +let parsed = match detect_format(path)? { MagicFileFormat::Text => { let content = std::fs::read_to_string(path)?; parse_text_magic_file(&content)? } MagicFileFormat::Directory => { - // Load and merge all files in directory + // Load and merge all files in directory (rules + merged name table) load_magic_directory(path)? } MagicFileFormat::Binary => { return Err(ParseError::UnsupportedFormat { ... }); } }; +// parsed.rules is the top-level rule list, parsed.name_table holds `name`/`use` subroutines ``` The hierarchical structure is automatically built from indentation levels (`>` prefixes), enabling parent-child rule relationships for detailed file type identification. diff --git a/src/build_helpers.rs b/src/build_helpers.rs index c55f88a..b27d35f 100644 --- a/src/build_helpers.rs +++ b/src/build_helpers.rs @@ -29,8 +29,10 @@ use crate::parser::codegen::{ /// /// Returns a `ParseError` if the magic file content is invalid or malformed. pub fn parse_and_generate_builtin_rules(magic_content: &str) -> Result { - let rules = parse_text_magic_file(magic_content)?; - Ok(crate::parser::codegen::generate_builtin_rules(&rules)) + let parsed = parse_text_magic_file(magic_content)?; + Ok(crate::parser::codegen::generate_builtin_rules( + &parsed.rules, + )) } /// Formats a parse error for display in build script output. 
diff --git a/src/error.rs b/src/error.rs index 9214169..e3f33f0 100644 --- a/src/error.rs +++ b/src/error.rs @@ -201,12 +201,61 @@ pub enum EvaluationError { #[error("Type reading error: {0}")] TypeReadError(#[from] crate::evaluator::types::TypeReadError), + /// A `use` directive referenced a name not present in the name table. + /// + /// The evaluator currently handles this condition with a `warn!` log + /// plus `Ok(vec![])` for backward compatibility with magic files that + /// invoke subroutines defined in another magic file. This variant is + /// reserved for consumers that opt in to strict-mode evaluation where + /// an unknown-name reference should abort the run. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::error::EvaluationError; + /// + /// let error = EvaluationError::UnknownName { + /// name: "missing_sub".to_string(), + /// }; + /// assert!(matches!(error, EvaluationError::UnknownName { .. })); + /// ``` + #[error("use directive references unknown name: {name}")] + UnknownName { + /// The name that could not be resolved in the name table. + name: String, + }, + /// Internal error indicating a bug in the evaluation logic. #[error("Internal error: {message}")] InternalError { /// Description of the internal error message: String, }, + + /// An `indirect` directive was evaluated without a rule environment + /// attached to the [`crate::evaluator::EvaluationContext`]. + /// + /// `MetaType::Indirect` re-evaluates the entire root rule list at the + /// resolved offset. Without a [`crate::evaluator::RuleEnvironment`] (the + /// shared root rules + name table) the engine has nothing to re-enter, + /// so the directive is a no-op. 
This variant is reserved for consumers + /// that opt in to strict-mode evaluation where the misconfiguration + /// should abort the run; the default engine path logs at `debug!` and + /// returns `Ok(vec![])` for backward compatibility with low-level + /// programmatic callers (tests, fuzz harnesses) that intentionally run + /// without a `MagicDatabase`-attached environment, mirroring the + /// `Use`-without-env contract. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::error::EvaluationError; + /// + /// let error = EvaluationError::indirect_without_environment(); + /// assert!(matches!(error, EvaluationError::IndirectWithoutEnvironment)); + /// ``` + #[error("indirect directive evaluated without a rule environment")] + IndirectWithoutEnvironment, } impl ParseError { @@ -331,6 +380,12 @@ impl EvaluationError { message: message.into(), } } + + /// Create a new `IndirectWithoutEnvironment` error. + #[must_use] + pub const fn indirect_without_environment() -> Self { + Self::IndirectWithoutEnvironment + } } #[cfg(test)] diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 9008e43..47da1bc 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -12,11 +12,56 @@ //! - Providing a convenience wrapper for evaluation with configuration //! (`evaluate_rules_with_config`) -use crate::parser::ast::MagicRule; +use crate::parser::ast::{MagicRule, MetaType, TypeKind}; use crate::{EvaluationConfig, LibmagicError}; use super::{EvaluationContext, RecursionGuard, RuleMatch, offset, operators, types}; use log::{debug, warn}; +use std::sync::atomic::{AtomicBool, Ordering}; + +/// RAII guard that saves the GNU `file` previous-match anchor on entry and +/// restores it on drop. +/// +/// `MetaType::Indirect` re-evaluates the root rule list at the resolved +/// offset, which means it must seed the anchor with that offset for the +/// nested call and then put the caller's anchor back when it returns. 
+/// Without an RAII wrapper, every early-return path inside the indirect +/// branch would have to remember to restore the anchor manually. +struct AnchorScope<'a> { + context: &'a mut EvaluationContext, + saved_anchor: usize, +} + +impl<'a> AnchorScope<'a> { + /// Save the current anchor and seed the context with `new_anchor`. + fn enter(context: &'a mut EvaluationContext, new_anchor: usize) -> Self { + let saved_anchor = context.last_match_end(); + context.set_last_match_end(new_anchor); + Self { + context, + saved_anchor, + } + } + + /// Access the underlying context for the duration of the guard. + #[allow(dead_code)] + fn context(&mut self) -> &mut EvaluationContext { + self.context + } +} + +impl Drop for AnchorScope<'_> { + fn drop(&mut self) { + self.context.set_last_match_end(self.saved_anchor); + } +} + +/// Process-local once guard for the "use directive without rule environment" +/// warning. Ensures we surface the misconfiguration exactly once per process +/// so low-level programmatic consumers of [`evaluate_rules`] (tests, fuzz +/// harnesses) that intentionally run without a `MagicDatabase`-attached +/// environment do not flood the log on every `Use` rule they encounter. +static USE_WITHOUT_RULE_ENV_WARNED: AtomicBool = AtomicBool::new(false); /// Evaluate a single magic rule against a file buffer /// @@ -102,12 +147,15 @@ fn evaluate_single_rule_with_anchor( rule: &MagicRule, buffer: &[u8], last_match_end: usize, + base_offset: usize, ) -> Result, LibmagicError> { use crate::parser::ast::TypeKind; // Step 1: Resolve the offset specification to an absolute position. + // `base_offset` is non-zero only inside a `MetaType::Use` subroutine + // body, where it biases positive absolute offsets to the use-site. let absolute_offset = - offset::resolve_offset_with_context(&rule.offset, buffer, last_match_end)?; + offset::resolve_offset_with_base(&rule.offset, buffer, last_match_end, base_offset)?; // Step 2 & 3: Dispatch on type category. 
Pattern-bearing types // (Regex, Search) take a different path from fixed-width types @@ -116,7 +164,35 @@ fn evaluate_single_rule_with_anchor( // would compare matched text ("123") against the pattern literal // ("[0-9]+") and produce false negatives on any regex with // metacharacters. + // + // Meta-type directives (`default`, `clear`, `name`, `use`, + // `indirect`) are silent no-ops in this phase -- the parser + // preserves them in the AST but the evaluator does not yet wire + // them into any control-flow behavior. Short-circuiting here with + // `Ok(None)` keeps them out of the value/pattern paths (which + // would otherwise surface `TypeReadError::UnsupportedType`). let (matched, read_value) = match &rule.typ { + TypeKind::Meta(MetaType::Name(name)) => { + // `Name` rules are normally hoisted into the name table at + // parse time and should not reach the evaluator. Programmatic + // consumers (e.g. fuzz harnesses, property tests) can still + // construct them directly; treat that as a no-op rather than + // a hard failure so the evaluator-never-panics invariant is + // preserved. + debug!( + "Name rule '{name}' reached evaluator (likely bypassed name-table extraction); treating as no-op" + ); + return Ok(None); + } + TypeKind::Meta(MetaType::Use(_)) => { + // `Use` is dispatched inline by `evaluate_rules` so it can + // push the subroutine's matches into the caller's match + // vector. Reaching this arm means the rule went through the + // single-rule path (e.g. via `evaluate_single_rule`) which + // lacks that wiring; treat it as a silent no-op. + return Ok(None); + } + TypeKind::Meta(_) => return Ok(None), TypeKind::Regex { .. } | TypeKind::Search { .. } => { evaluate_pattern_rule(rule, buffer, absolute_offset)? } @@ -125,6 +201,88 @@ fn evaluate_single_rule_with_anchor( Ok(matched.then_some((absolute_offset, read_value))) } +/// Evaluate a `TypeKind::Meta(MetaType::Use(name))` rule inline. 
+/// +/// Looks up `name` in the context's rule environment, temporarily sets the +/// GNU `file` previous-match anchor to the resolved offset, and recursively +/// evaluates the subroutine's rules against `buffer`. Any matches produced +/// by the subroutine are returned in document order and are intended to be +/// pushed into the caller's match vector *before* the synthetic `Use` match +/// itself (matching GNU `file` behavior where a `use` site is replaced by +/// its expansion in the output). +/// +/// Returns `Ok((Some(absolute_offset), matches))` on a successful resolution +/// (even if the subroutine produced no matches), or `Ok((None, vec![]))` +/// when: +/// - the context has no rule environment attached (programmatic consumers +/// bypassing `MagicDatabase`) +/// - the referenced name is not in the table (logged at warn level) +/// +/// Recursion-limit propagation is handled via [`RecursionGuard`] so that a +/// subroutine calling `use` on itself triggers `RecursionLimitExceeded` +/// instead of a stack overflow. +fn evaluate_use_rule( + rule: &MagicRule, + name: &str, + buffer: &[u8], + context: &mut EvaluationContext, +) -> Result<(Option, Vec), LibmagicError> { + let Some(env) = context.rule_env() else { + // Surface the misconfiguration once per process at warn! level so + // it is visible in default logging, then gate subsequent hits so a + // magic file with many `use` directives does not flood the log. + // Use `Ordering::Relaxed`: the flag is an idempotent diagnostic + // latch, not a synchronization primitive guarding other state. 
+ if USE_WITHOUT_RULE_ENV_WARNED.swap(true, Ordering::Relaxed) { + debug!("use directive '{name}' evaluated without a rule environment; no-op"); + } else { + warn!( + "use directive '{name}' evaluated without a rule environment; treating as no-op (subsequent occurrences suppressed)" + ); + } + return Ok((None, Vec::new())); + }; + + let Some(subroutine_rules) = env.name_table.get(name) else { + warn!("use directive references unknown name '{name}'"); + return Ok((None, Vec::new())); + }; + + // Clone the Arc reference to detach from the immutable borrow of + // `context`, so we can mutably borrow the context below. + let subroutine_rules: Vec = subroutine_rules.clone(); + + // Resolve the use-site offset under the *caller's* base, not the + // subroutine's -- the use rule itself is in the caller's scope. + let absolute_offset = offset::resolve_offset_with_base( + &rule.offset, + buffer, + context.last_match_end(), + context.base_offset(), + )?; + + // Save the anchor and base offset, seed the subroutine body with the + // use-site offset for both, and restore on exit. This gives the + // subroutine: + // * `&N` offsets resolving from the use-site (via last_match_end) + // * `>N` / absolute offsets in the subroutine resolving as + // `use_site + N` (via base_offset), matching magic(5) semantics + let saved_anchor = context.last_match_end(); + let saved_base = context.base_offset(); + context.set_last_match_end(absolute_offset); + context.set_base_offset(absolute_offset); + + let subroutine_matches = { + let mut guard = RecursionGuard::enter(context)?; + evaluate_rules(&subroutine_rules, buffer, guard.context())? + }; + + context.set_last_match_end(saved_anchor); + context.set_base_offset(saved_base); + + Ok((Some(absolute_offset), subroutine_matches)) +} + /// Evaluate a pattern-bearing rule (`TypeKind::Regex` / `TypeKind::Search`). 
/// /// `read_pattern_match` returns `Some(value)` on a successful match @@ -283,6 +441,7 @@ fn evaluate_value_rule( /// * `LibmagicError::EvaluationError` - Only for critical failures like recursion limit exceeded /// /// Individual rule evaluation errors are handled gracefully and do not stop the overall evaluation. +#[allow(clippy::too_many_lines)] pub fn evaluate_rules( rules: &[MagicRule], buffer: &[u8], @@ -292,6 +451,37 @@ pub fn evaluate_rules( let start_time = std::time::Instant::now(); let mut rule_count = 0u32; + // Per-level "did any sibling match yet?" flag for `default`/`clear` + // dispatch. Each recursive descent gets its own fresh flag, so child + // sibling chains track their own state independently of the parent. + let mut sibling_matched: bool = false; + + // Per-level entry anchor: captured at the start of this sibling list's + // evaluation. For CHILD sibling lists (recursion_depth > 0), the + // GNU `file`/libmagic previous-match anchor is reset to this value + // between sibling iterations so that `&N` offsets on continuation + // siblings resolve against the parent-level anchor, not against + // whatever the *previous sibling* left the anchor at. This matches + // libmagic's continuation-level model (`ms->c.li[cont_level]`) + // where each level tracks its own anchor; a sibling at level L does + // not inherit the post-match anchor of another sibling at level L. + // + // TOP-LEVEL siblings (recursion_depth == 0) are independent + // classification attempts -- each top-level rule intentionally sees + // the anchor advance that prior top-level rules produced (see + // GOTCHAS S3.8 and the `relative_anchor_can_decrease_...` + // integration test). Gate the reset on recursion_depth to preserve + // that documented discipline while still fixing the continuation- + // sibling behavior that the GNU `file` `searchbug.magic` fixture + // relies on. 
+ // + // Recursing into a matched rule's children still carries forward the + // post-match anchor (via the current value of `last_match_end()` at + // the point of recursion), so child sibling lists see their parent's + // resolved position as their own entry anchor. + let entry_anchor = context.last_match_end(); + let is_child_sibling_list = context.recursion_depth() > 0; + // Entry-point timeout check: ensures every recursive descent is bounded // and that evaluations of small rule sets (< 16 rules) are still guarded. // Without this, the periodic every-16-rules check below never fires for @@ -304,6 +494,15 @@ pub fn evaluate_rules( } for rule in rules { + // For continuation siblings (child recursion), reset the + // previous-match anchor to the entry anchor so `&N` offsets + // resolve against the parent-level position. Top-level + // siblings (depth 0) keep the chaining behavior documented in + // GOTCHAS S3.8. See the `entry_anchor` comment above. + if is_child_sibling_list { + context.set_last_match_end(entry_anchor); + } + // Check timeout periodically (every 16 rules) to reduce syscall overhead rule_count = rule_count.wrapping_add(1); if rule_count.trailing_zeros() >= 4 @@ -313,35 +512,466 @@ pub fn evaluate_rules( return Err(LibmagicError::Timeout { timeout_ms }); } - // Evaluate the current rule with graceful error handling. - // Pass the GNU `file` anchor so OffsetSpec::Relative resolves - // correctly against the previous match's end position. - let match_data = - match evaluate_single_rule_with_anchor(rule, buffer, context.last_match_end()) { - Ok(data) => data, + // `Clear` resets the per-level "sibling matched" flag so a + // subsequent `default` sibling can fire even if an earlier + // sibling matched. It does not produce a match, evaluate + // children, or advance the anchor. 
+ if let TypeKind::Meta(MetaType::Clear) = &rule.typ { + sibling_matched = false; + continue; + } + + // `Default` fires only when no earlier sibling at this level has + // matched yet. The anchor is intentionally not advanced -- the + // directive does not consume bytes -- but its children are + // evaluated and the per-level "sibling matched" flag is set so + // any later `default` sibling at the same level is suppressed. + if let TypeKind::Meta(MetaType::Default) = &rule.typ { + if !sibling_matched { + let matches_before = matches.len(); + + let match_result = RuleMatch { + message: rule.message.clone(), + offset: context.last_match_end(), + level: rule.level, + value: crate::parser::ast::Value::Uint(0), + type_kind: rule.typ.clone(), + confidence: RuleMatch::calculate_confidence(rule.level), + }; + matches.push(match_result); + + // `default` is treated as a successful match at this + // level, so its children are evaluated under the same + // recursion-guard pattern as every other successful rule. + if !rule.children.is_empty() { + let mut guard = RecursionGuard::enter(context)?; + match evaluate_rules(&rule.children, buffer, guard.context()) { + Ok(child_matches) => { + matches.extend(child_matches); + } + Err(LibmagicError::Timeout { timeout_ms }) => { + return Err(LibmagicError::Timeout { timeout_ms }); + } + Err( + e @ (LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. } + | crate::error::EvaluationError::TypeReadError( + crate::evaluator::types::TypeReadError::BufferOverrun { + .. + } + | crate::evaluator::types::TypeReadError::InvalidPStringLength { + .. 
+ }, + ), + ) + | LibmagicError::IoError(_)), + ) => { + warn!( + "Discarding child evaluation under default rule '{}' due to unexpected error: {} -- default match is still emitted", + rule.message, e + ); + } + Err(e) => return Err(e), + } + } + + sibling_matched = true; + + if matches.len() > matches_before && context.should_stop_at_first_match() { + break; + } + } + continue; + } + + // `Indirect` re-evaluates the root rule list at the resolved + // offset, mirroring libmagic's indirect-type semantics. The + // sub-evaluation runs against `buffer[absolute_offset..]` with a + // fresh anchor (0) so relative offsets inside the root rules + // resolve correctly; the caller's anchor is restored on exit + // via `AnchorScope`. Without an attached `RuleEnvironment` + // (programmatic consumers bypassing `MagicDatabase`) the + // directive is a silent no-op. + if let TypeKind::Meta(MetaType::Indirect) = &rule.typ { + // Resolve the offset first so a malformed offset surfaces + // as a graceful skip rather than a hard error. + let absolute_offset = match offset::resolve_offset_with_base( + &rule.offset, + buffer, + context.last_match_end(), + context.base_offset(), + ) { + Ok(o) => o, Err( - e @ (LibmagicError::EvaluationError( + e @ LibmagicError::EvaluationError( crate::error::EvaluationError::BufferOverrun { .. } - | crate::error::EvaluationError::InvalidOffset { .. } - | crate::error::EvaluationError::TypeReadError( - crate::evaluator::types::TypeReadError::BufferOverrun { .. } - | crate::evaluator::types::TypeReadError::InvalidPStringLength { .. }, - ), - ) - | LibmagicError::IoError(_)), + | crate::error::EvaluationError::InvalidOffset { .. }, + ), ) => { - // Expected data-dependent evaluation errors -- skip gracefully. - // TypeReadError::UnsupportedType is intentionally NOT caught here - // so that evaluator capability gaps propagate as errors. 
- debug!("Skipping rule '{}': {}", rule.message, e); + debug!("Skipping indirect rule '{}': {}", rule.message, e); continue; } - Err(e) => { - // Unexpected errors (InternalError, UnsupportedType, etc.) should propagate - return Err(e); + Err(e) => return Err(e), + }; + + // Pull the root rules out of the rule environment. Without + // an environment there is nothing to re-enter, so this is a + // silent no-op (matching the `Use`-without-env behavior). + // + // We use `debug!` rather than `debug_assert!` here because + // property tests (`prop_arbitrary_rule_evaluation_never_panics`) + // synthesize arbitrary `TypeKind::Meta(MetaType::Indirect)` + // rules and run them without attaching a `RuleEnvironment`; + // a panic on this path would break the never-panics invariant. + // See GOTCHAS S2.1 for the same rationale on the leaked-Name arm. + let Some(root_rules) = context.rule_env().map(|e| e.root_rules.clone()) else { + debug!( + "indirect rule '{}' evaluated without a rule environment; treating as no-op", + rule.message + ); + continue; + }; + + // Bounds-check before slicing. An indirect offset past the + // end of the buffer is a data-dependent skip, not an error. + let Some(sub_buffer) = buffer.get(absolute_offset..) else { + debug!( + "Skipping indirect rule '{}': offset {} past buffer end ({} bytes)", + rule.message, + absolute_offset, + buffer.len() + ); + continue; + }; + + let matches_before = matches.len(); + + // Advance the GNU `file` previous-match anchor to the indirect's + // resolved offset and emit a `RuleMatch` for the indirect rule + // itself BEFORE descending into the root re-entry or children. + // This matches the shared successful-match flow used by every + // other rule kind: advance anchor first, record the match, then + // recurse. Without this, sibling rules of the `indirect` resolve + // their relative offsets against the stale anchor and the + // directive's own `message` never surfaces in the output. 
+ context.set_last_match_end(absolute_offset); + + let indirect_match = RuleMatch { + message: rule.message.clone(), + offset: absolute_offset, + level: rule.level, + value: crate::parser::ast::Value::String("indirect".to_string()), + type_kind: rule.typ.clone(), + confidence: RuleMatch::calculate_confidence(rule.level), + }; + matches.push(indirect_match); + + // Indirect counts as a match for `sibling_matched` regardless of + // whether the sub-evaluation produced any matches -- the directive + // itself successfully dispatched. + sibling_matched = true; + + // Recursion guard + anchor scope: nested indirect / use cycles + // surface as `RecursionLimitExceeded` instead of a stack overflow, + // and the caller's anchor is restored on every exit path. + { + let mut guard = RecursionGuard::enter(context)?; + let mut anchor_scope = AnchorScope::enter(guard.context(), 0); + match evaluate_rules(&root_rules, sub_buffer, anchor_scope.context()) { + Ok(sub_matches) => { + matches.extend(sub_matches); + } + Err(LibmagicError::Timeout { timeout_ms }) => { + return Err(LibmagicError::Timeout { timeout_ms }); + } + Err(e) => return Err(e), } + // anchor_scope drops here, restoring the saved anchor + // (which is now `absolute_offset`, set above before the + // scope was entered). + // guard drops next, decrementing the recursion depth. + } + + // Evaluate the indirect rule's own children under the same + // recursion-guard pattern used by every other successful rule. + if !rule.children.is_empty() { + let mut guard = RecursionGuard::enter(context)?; + match evaluate_rules(&rule.children, buffer, guard.context()) { + Ok(child_matches) => { + matches.extend(child_matches); + } + Err(LibmagicError::Timeout { timeout_ms }) => { + return Err(LibmagicError::Timeout { timeout_ms }); + } + Err( + e @ (LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. 
} + | crate::error::EvaluationError::TypeReadError( + crate::evaluator::types::TypeReadError::BufferOverrun { .. } + | crate::evaluator::types::TypeReadError::InvalidPStringLength { + .. + }, + ), + ) + | LibmagicError::IoError(_)), + ) => { + warn!( + "Discarding child evaluation under indirect rule '{}' due to unexpected error: {} -- indirect matches are still emitted", + rule.message, e + ); + } + Err(e) => return Err(e), + } + } + + if matches.len() > matches_before && context.should_stop_at_first_match() { + break; + } + continue; + } + + // `Offset` reports the resolved file offset as the rule's read + // value, matching GNU `file`'s `FILE_OFFSET` semantics: the match + // emits a value-bearing `RuleMatch` whose `value` is the absolute + // position, which downstream message formatting substitutes into + // `%lld` / `%d` specifiers via `output::format::format_magic_message`. + // + // Per magic(5) the only legal operator is `x` (AnyValue); any + // other operator is a magic-file semantic error. Matching the + // evaluator's graceful-skip discipline, we `debug!`-log and skip + // rather than erroring -- a rogue rule shouldn't poison the rest + // of the evaluation. + if let TypeKind::Meta(MetaType::Offset) = &rule.typ { + if !matches!(rule.op, crate::parser::ast::Operator::AnyValue) { + debug!( + "offset rule '{}': non-`x` operator {:?} not supported; skipping", + rule.message, rule.op + ); + continue; + } + + // Resolve the offset first so a malformed offset surfaces as + // a graceful skip rather than a hard error. Mirrors the + // `Indirect` dispatch above. + let absolute_offset = match offset::resolve_offset_with_base( + &rule.offset, + buffer, + context.last_match_end(), + context.base_offset(), + ) { + Ok(o) => o, + Err( + e @ LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. 
}, + ), + ) => { + debug!("Skipping offset rule '{}': {}", rule.message, e); + continue; + } + Err(e) => return Err(e), + }; + + let matches_before = matches.len(); + + // Advance the anchor BEFORE emitting the match so sibling + // rules resolve their relative offsets against the offset + // directive's resolved position. Same discipline as + // `Indirect` and every other value-bearing rule. + context.set_last_match_end(absolute_offset); + + let offset_match = RuleMatch { + message: rule.message.clone(), + offset: absolute_offset, + level: rule.level, + value: crate::parser::ast::Value::Uint(absolute_offset as u64), + type_kind: rule.typ.clone(), + confidence: RuleMatch::calculate_confidence(rule.level), + }; + matches.push(offset_match); + + sibling_matched = true; + + // Evaluate children under the recursion-guard pattern used + // by every other successful rule. + if !rule.children.is_empty() { + let mut guard = RecursionGuard::enter(context)?; + match evaluate_rules(&rule.children, buffer, guard.context()) { + Ok(child_matches) => { + matches.extend(child_matches); + } + Err(LibmagicError::Timeout { timeout_ms }) => { + return Err(LibmagicError::Timeout { timeout_ms }); + } + Err( + e @ (LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. } + | crate::error::EvaluationError::TypeReadError( + crate::evaluator::types::TypeReadError::BufferOverrun { .. } + | crate::evaluator::types::TypeReadError::InvalidPStringLength { + .. 
+ }, + ), + ) + | LibmagicError::IoError(_)), + ) => { + warn!( + "Discarding child evaluation under offset rule '{}' due to unexpected error: {} -- offset match is still emitted", + rule.message, e + ); + } + Err(e) => return Err(e), + } + } + + if matches.len() > matches_before && context.should_stop_at_first_match() { + break; + } + continue; + } + + // `Use` is handled inline so the subroutine's matches can be + // spliced into the caller's match vector in document order. + // Routing this through `evaluate_single_rule_with_anchor` would + // force the helper to return a `Vec`, which would + // reshape the single-rule return type for every other variant. + // + // On a successful use path we must also descend into the rule's + // own children, matching the flow of every other successful rule + // kind. libmagic chains like `>>0 use part2` often carry + // continuation rules (siblings and descendants of the `use` site) + // that depend on the anchor the subroutine left behind; skipping + // them produces user-visible false negatives. + if let TypeKind::Meta(MetaType::Use(name)) = &rule.typ { + let matches_before = matches.len(); + let use_resolved = match evaluate_use_rule(rule, name, buffer, context) { + Ok((Some(absolute_offset), subroutine_matches)) => { + matches.extend(subroutine_matches); + + // A `use` rule itself does not produce a surface + // `RuleMatch` in GNU `file` output; the subroutine's + // rules carry the visible messages. We therefore only + // advance the anchor (to the use-site offset, which + // may have been moved by the subroutine; since we + // restored it above, we now re-advance to the + // use-site offset so subsequent sibling rules resolve + // relative offsets from the use-site end). + context.set_last_match_end(absolute_offset); + true + } + Ok((None, _)) => { + // No environment, or name not found -- silent no-op. + false + } + Err( + e @ LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. 
} + | crate::error::EvaluationError::InvalidOffset { .. }, + ), + ) => { + debug!("Skipping use rule '{name}': {e}"); + false + } + Err(e) => return Err(e), }; + // Evaluate the use rule's own children exactly like any other + // successful rule. Subroutine matches are already appended + // above, so children are spliced in after them to preserve + // document order. The recursion guard mirrors the non-`Use` + // path so a `use`-site chain cannot blow past the configured + // recursion limit. + if use_resolved && !rule.children.is_empty() { + let mut guard = RecursionGuard::enter(context)?; + match evaluate_rules(&rule.children, buffer, guard.context()) { + Ok(child_matches) => { + matches.extend(child_matches); + } + Err(LibmagicError::Timeout { timeout_ms }) => { + return Err(LibmagicError::Timeout { timeout_ms }); + } + Err( + e @ (LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. } + | crate::error::EvaluationError::TypeReadError( + crate::evaluator::types::TypeReadError::BufferOverrun { .. } + | crate::evaluator::types::TypeReadError::InvalidPStringLength { + .. + }, + ), + ) + | LibmagicError::IoError(_)), + ) => { + // Same defensive rationale as the main rule path: + // individual child failures are already handled + // inside the recursive `evaluate_rules`, so this + // arm only fires if that error-handling strategy + // changes. Logged at warn! so the asymmetry is + // visible. + warn!( + "Discarding child evaluation under use rule '{name}' due to unexpected error: {e} -- subroutine matches are still emitted; investigate the recursive evaluate_rules error-handling path" + ); + } + Err(e) => return Err(e), + } + // `guard` drops here, decrementing the recursion depth. + } + + // A successful `use` site is treated as a sibling match for + // `default`/`clear` dispatch purposes -- subsequent `default` + // siblings should not fire if the subroutine resolved. 
+ if use_resolved { + sibling_matched = true; + } + + // Apply stop-at-first-match with the same semantics as every + // other successful rule kind: if this `use` site contributed + // any matches (either from the subroutine or from its own + // children) and the caller configured first-match + // short-circuiting, halt evaluation of further siblings. + if matches.len() > matches_before && context.should_stop_at_first_match() { + break; + } + continue; + } + + // Evaluate the current rule with graceful error handling. + // Pass the GNU `file` anchor so OffsetSpec::Relative resolves + // correctly against the previous match's end position. + let match_data = match evaluate_single_rule_with_anchor( + rule, + buffer, + context.last_match_end(), + context.base_offset(), + ) { + Ok(data) => data, + Err( + e @ (LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. } + | crate::error::EvaluationError::TypeReadError( + crate::evaluator::types::TypeReadError::BufferOverrun { .. } + | crate::evaluator::types::TypeReadError::InvalidPStringLength { .. }, + ), + ) + | LibmagicError::IoError(_)), + ) => { + // Expected data-dependent evaluation errors -- skip gracefully. + // TypeReadError::UnsupportedType is intentionally NOT caught here + // so that evaluator capability gaps propagate as errors. + debug!("Skipping rule '{}': {}", rule.message, e); + continue; + } + Err(e) => { + // Unexpected errors (InternalError, UnsupportedType, etc.) 
should propagate + return Err(e); + } + }; + if let Some((absolute_offset, read_value)) = match_data { // Advance the GNU `file` previous-match anchor BEFORE recursing // into children, so children and their descendants see the new @@ -357,6 +987,11 @@ pub fn evaluate_rules( let new_anchor = absolute_offset.saturating_add(consumed); context.set_last_match_end(new_anchor); + // Mark this level as "matched" so any subsequent `default` + // sibling at the same level is suppressed, matching libmagic's + // default-after-match semantics. + sibling_matched = true; + let match_result = RuleMatch { message: rule.message.clone(), offset: absolute_offset, @@ -486,6 +1121,19 @@ pub fn evaluate_rules_with_config( // are rejected at the API boundary rather than triggering subtle // failures during evaluation. config.validate()?; + // Debug-only guard: `evaluate_rules_with_config` builds a context + // without an attached `RuleEnvironment`, which means any + // `MetaType::Indirect` rule reached during evaluation is silently + // no-op'd at runtime. That is the intentional release behavior + // (matching the `Use`-without-env contract for low-level callers), + // but in debug builds we surface the misconfiguration eagerly so + // consumer tests catch env-less `indirect` usage before it ships. + // Release behavior is unchanged. + debug_assert!( + !contains_indirect_rule(rules), + "{}", + crate::error::EvaluationError::indirect_without_environment() + ); // Clear the thread-local regex compile cache so it is bounded to // the lifetime of a single top-level evaluation call. Cache // entries from a previous rule set would otherwise persist on the @@ -496,5 +1144,25 @@ pub fn evaluate_rules_with_config( evaluate_rules(rules, buffer, &mut context) } +/// Recursively walk `rules` (including children) looking for any +/// [`MetaType::Indirect`] directive. 
+/// +/// Used by the debug-only guard in [`evaluate_rules_with_config`]: the +/// low-level `_with_config` entry point builds a context without a +/// [`crate::evaluator::RuleEnvironment`], so any `indirect` rule is +/// silently no-op'd at runtime. Firing `debug_assert!` here makes that +/// misconfiguration loud in tests without affecting release behavior. +/// +/// Intentionally not gated on `cfg(debug_assertions)` so release builds +/// still compile the `debug_assert!` call site (the macro type-checks its +/// arguments in both modes; in release they are never evaluated and the +/// check itself is a no-op). +fn contains_indirect_rule(rules: &[MagicRule]) -> bool { + rules.iter().any(|rule| { + matches!(rule.typ, TypeKind::Meta(MetaType::Indirect)) + || contains_indirect_rule(&rule.children) + }) +} + #[cfg(test)] mod tests; diff --git a/src/evaluator/engine/tests.rs b/src/evaluator/engine/tests.rs index bf3e404..60a66f1 100644 --- a/src/evaluator/engine/tests.rs +++ b/src/evaluator/engine/tests.rs @@ -21,7 +21,7 @@ fn evaluate_single_rule_legacy( rule: &MagicRule, buffer: &[u8], ) -> Result, LibmagicError> { - evaluate_single_rule_with_anchor(rule, buffer, 0) + evaluate_single_rule_with_anchor(rule, buffer, 0, 0) } #[test] @@ -2630,3 +2630,839 @@ fn test_search_parent_relative_child_at_positive_offset() { assert_eq!(matches.len(), 2); assert_eq!(matches[1].message, "a after"); } + +// ============================================================================= +// Tests for MetaType::Use semantics +// ============================================================================= + +use crate::evaluator::RuleEnvironment; +use crate::parser::ast::MetaType; +use crate::parser::name_table::NameTable; + +/// Build an `EvaluationContext` with the supplied name table and (optional) +/// root-rules list. The root-rules list is retained for parity with the +/// `RuleEnvironment` shape even though `MetaType::Use` itself does not +/// consult it. 
+fn make_context_with_env(name_table: NameTable, root_rules: &[MagicRule]) -> EvaluationContext { + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(name_table), + root_rules: std::sync::Arc::from(root_rules), + }); + EvaluationContext::new(EvaluationConfig::default()).with_rule_env(env) +} + +/// Minimal helper: wrap a `TypeKind::Meta(MetaType::Use(name))` rule at +/// offset 0 with the given `message` and empty child list. +fn use_rule(name: &str) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Use(name.to_string())), + op: Operator::Equal, + value: Value::Uint(0), + message: format!("use {name}"), + children: vec![], + level: 0, + strength_modifier: None, + } +} + +/// Construct a name table from `(name, subroutine_rules)` pairs. +fn build_name_table(entries: Vec<(&str, Vec)>) -> NameTable { + // Build via the extraction helper so the table construction matches the + // real parser path. Wrap each entry in a Name rule whose `children` are + // the subroutine body. + let mut top = Vec::new(); + for (name, body) in entries { + top.push(MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Name(name.to_string())), + op: Operator::Equal, + value: Value::Uint(0), + message: String::new(), + children: body, + level: 0, + strength_modifier: None, + }); + } + let (_rules, table) = crate::parser::name_table::extract_name_table(top); + table +} + +#[test] +fn test_use_known_name_evaluates_subroutine() { + // The subroutine `part2` reads byte 3 and expects 0x42. 
+ let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(3), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x42), + message: "sub-match".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("part2", subroutine)]); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0x00u8, 0x00, 0x00, 0x42, 0x00]; + let rules = vec![use_rule("part2")]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "subroutine should produce exactly one match" + ); + assert_eq!(matches[0].message, "sub-match"); +} + +#[test] +fn test_use_unknown_name_returns_no_match() { + // Empty name table so the lookup fails; the evaluator should not panic + // and should produce zero matches. + let table = NameTable::empty(); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0x00u8, 0x42]; + let rules = vec![use_rule("nonexistent")]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!(matches.is_empty(), "unknown name should yield no matches"); +} + +#[test] +fn test_use_without_rule_env_returns_no_match() { + // A default context has no rule_env attached; `use` rules should be + // silent no-ops in that case rather than returning an error or panicking. + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let buffer = [0x00u8, 0x42]; + let rules = vec![use_rule("part2")]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.is_empty(), + "Use with no rule_env should produce no matches" + ); +} + +#[test] +fn test_use_recursion_limit() { + // Build a mutually-recursive pair: subroutine A calls B, B calls A. + // With the default recursion limit, this should surface as + // `RecursionLimitExceeded` rather than a stack overflow. 
+ let a_body = vec![use_rule("b")]; + let b_body = vec![use_rule("a")]; + let table = build_name_table(vec![("a", a_body), ("b", b_body)]); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0u8; 8]; + let rules = vec![use_rule("a")]; + let result = evaluate_rules(&rules, &buffer, &mut context); + assert!( + matches!( + result, + Err(LibmagicError::EvaluationError( + crate::error::EvaluationError::RecursionLimitExceeded { .. } + )) + ), + "mutual recursion through use must surface RecursionLimitExceeded, got {result:?}" + ); +} + +#[test] +fn test_use_child_rules_evaluated_after_subroutine() { + // `Use` itself does not expose a visible RuleMatch today, so we cover + // the "subroutine matches come first" invariant by verifying that the + // subroutine's match appears in the output and is followed by a + // sibling rule's match in the surrounding scope. + // + // `EvaluationConfig::default()` sets `stop_at_first_match = true`, which + // (correctly, after the Comment 2 fix) short-circuits sibling iteration + // once the `use` path produces a match. To exercise the ordering + // invariant between the subroutine and its sibling we opt into the + // "completeness" semantics by disabling first-match short-circuit. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "sub-head".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("sub", subroutine)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + let buffer = [0xAAu8, 0xBB, 0xCC]; + let sibling = MagicRule { + offset: OffsetSpec::Absolute(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xBB), + message: "sibling".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + let rules = vec![use_rule("sub"), sibling]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!(matches.len(), 2); + assert_eq!(matches[0].message, "sub-head"); + assert_eq!(matches[1].message, "sibling"); +} + +#[test] +fn test_use_stop_at_first_match_short_circuits_siblings() { + // Comment 2 regression guard: with the default + // `stop_at_first_match = true` config, a successful `use` subroutine + // must prevent later sibling top-level rules from being evaluated, + // matching the short-circuit semantics every other rule kind obeys. 
+ let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "sub-head".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("sub", subroutine)]); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0xAAu8, 0xBB, 0xCC]; + let sibling = MagicRule { + offset: OffsetSpec::Absolute(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xBB), + message: "sibling".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + let rules = vec![use_rule("sub"), sibling]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "stop-at-first-match must halt sibling iteration once the use path produces a match" + ); + assert_eq!(matches[0].message, "sub-head"); +} + +#[test] +fn test_use_rule_children_are_evaluated() { + // Comment 1 regression guard: a `use` rule with its own children must + // descend into those children after the subroutine runs, so that + // libmagic chains like `>>0 use part2` followed by continuation rules + // continue producing matches in document order. + let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "sub-head".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("sub", subroutine)]); + // Disable stop-at-first-match so both the subroutine and the child + // rule are visible in the match vector. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + let child = MagicRule { + offset: OffsetSpec::Absolute(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xBB), + message: "use-child".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let mut use_with_child = use_rule("sub"); + use_with_child.children = vec![child]; + + let buffer = [0xAAu8, 0xBB, 0xCC]; + let matches = evaluate_rules(&[use_with_child], &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "use rule's own children must run after the subroutine" + ); + assert_eq!(matches[0].message, "sub-head"); + assert_eq!(matches[1].message, "use-child"); +} + +#[test] +fn test_name_rule_leaked_is_noop() { + // Programmatic consumers may construct a Name rule directly and pass + // it to the evaluator (e.g. property tests). The evaluator must not + // panic; it should instead treat the rule as a silent no-op. 
+ let leaked = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Name("orphan".to_string())), + op: Operator::Equal, + value: Value::Uint(0), + message: String::new(), + children: vec![], + level: 0, + strength_modifier: None, + }; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&[leaked], &[0u8; 4], &mut context).unwrap(); + assert!(matches.is_empty(), "leaked Name rule should be a no-op"); +} + +// ============================================================================= +// MetaType::Default / Clear / Indirect tests +// ============================================================================= + +/// Build a `Default` rule with the given message and (optional) children. +fn default_rule(message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Default), + op: Operator::Equal, + value: Value::Uint(0), + message: message.to_string(), + children, + level: 0, + strength_modifier: None, + } +} + +/// Build a `Clear` rule. Carries no message in the magic file syntax, but the +/// AST requires a message field. +fn clear_rule() -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Clear), + op: Operator::Equal, + value: Value::Uint(0), + message: String::new(), + children: vec![], + level: 0, + strength_modifier: None, + } +} + +/// Build a single byte-equality rule at `offset` for `value`. 
+fn byte_eq_rule(offset: i64, value: u64, message: &str) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(value), + message: message.to_string(), + children: vec![], + level: 0, + strength_modifier: None, + } +} + +#[test] +fn test_default_fires_when_no_sibling_matched() { + let rules = vec![default_rule("DEFAULT-FIRES", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "default with no prior sibling match should fire" + ); + assert_eq!(matches[0].message, "DEFAULT-FIRES"); +} + +#[test] +fn test_default_skipped_when_sibling_matched() { + // Disable stop-at-first-match so we can see whether the default would + // have fired or not. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + byte_eq_rule(0, 0xAA, "real-match"), + default_rule("DEFAULT-SKIPPED", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "default after a successful sibling should not fire" + ); + assert_eq!(matches[0].message, "real-match"); +} + +#[test] +fn test_default_fires_only_once() { + // Two consecutive default rules: the first sets sibling_matched, so + // the second must not fire. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + default_rule("FIRST-DEFAULT", vec![]), + default_rule("SECOND-DEFAULT", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "only the first default should fire when no real sibling matched" + ); + assert_eq!(matches[0].message, "FIRST-DEFAULT"); +} + +#[test] +fn test_default_children_evaluated() { + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let child = byte_eq_rule(0, 0xAA, "default-child"); + let rules = vec![default_rule("PARENT-DEFAULT", vec![child])]; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "default rule's children must be evaluated when the default fires" + ); + assert_eq!(matches[0].message, "PARENT-DEFAULT"); + assert_eq!(matches[1].message, "default-child"); +} + +#[test] +fn test_clear_resets_sibling_matched() { + // Sequence: byte-match, default-skipped, clear, default-fires. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + byte_eq_rule(0, 0xAA, "byte-match"), + default_rule("DEFAULT-SKIPPED", vec![]), + clear_rule(), + default_rule("DEFAULT-FIRES-AFTER-CLEAR", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "clear must reset sibling_matched so a later default fires" + ); + assert_eq!(matches[0].message, "byte-match"); + assert_eq!(matches[1].message, "DEFAULT-FIRES-AFTER-CLEAR"); +} + +#[test] +fn test_clear_at_top_is_noop() { + let rules = vec![clear_rule(), default_rule("AFTER-CLEAR", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "clear at top of list is a no-op; default after still fires" + ); + assert_eq!(matches[0].message, "AFTER-CLEAR"); +} + +#[test] +fn test_clear_does_not_produce_match() { + let rules = vec![clear_rule()]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert!(matches.is_empty(), "clear alone must produce no match"); +} + +#[test] +fn test_default_clear_per_level_isolation() { + // Parent has its own sibling_matched flag. The child list runs with a + // fresh flag, so a child-level `default` must fire even though the + // parent's flag is true. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "parent-match".to_string(), + children: vec![ + byte_eq_rule(1, 0xBB, "child-byte-match"), + default_rule("CHILD-DEFAULT-SKIPPED", vec![]), + clear_rule(), + default_rule("CHILD-DEFAULT-AFTER-CLEAR", vec![]), + ], + level: 0, + strength_modifier: None, + }; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + + // Expected order: parent-match, child-byte-match, CHILD-DEFAULT-AFTER-CLEAR + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec![ + "parent-match", + "child-byte-match", + "CHILD-DEFAULT-AFTER-CLEAR" + ], + "child-level sibling_matched must be isolated from parent-level state" + ); +} + +/// Build an `Indirect` rule at `offset` with optional children. +fn indirect_rule(offset: i64, message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Meta(MetaType::Indirect), + op: Operator::Equal, + value: Value::Uint(0), + message: message.to_string(), + children, + level: 0, + strength_modifier: None, + } +} + +#[test] +fn test_indirect_evaluates_root_rules_at_offset() { + // Root rules: detect a "ZIP-like" header (0x50 0x4b) at offset 0 of the + // sub-buffer. The indirect rule fires at offset 4 of the outer buffer, + // which means the sub-buffer starts at byte 4. Place 0x50 0x4b there. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let root_rule = byte_eq_rule(0, 0x50, "ZIP-like-header"); + let root_rules: Vec = vec![root_rule]; + + // Build an environment where root_rules is the same as the rules we + // dispatch into. 
+ let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(NameTable::empty()), + root_rules: std::sync::Arc::from(root_rules.as_slice()), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + // Buffer: ELF magic at offset 0, ZIP-like at offset 4. The indirect + // rule is the trigger; the root re-entry detects 0x50 at sub-buffer 0. + let buffer = [0x7fu8, 0x45, 0x4c, 0x46, 0x50, 0x4b, 0x03, 0x04]; + let rules = vec![indirect_rule(4, "indirect-trigger", vec![])]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + + assert!( + matches.iter().any(|m| m.message == "ZIP-like-header"), + "indirect must dispatch root rules against the sub-buffer at offset 4; got {matches:?}" + ); +} + +#[test] +fn test_indirect_out_of_bounds_is_noop() { + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(NameTable::empty()), + root_rules: std::sync::Arc::from(&[byte_eq_rule(0, 0x00, "root")] as &[MagicRule]), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + let buffer = [0u8; 4]; + // Indirect at offset 100, which is well past the 4-byte buffer. + let rules = vec![indirect_rule(100, "indirect-oob", vec![])]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.is_empty(), + "indirect past buffer end must be a graceful no-op" + ); +} + +#[test] +fn test_indirect_without_env_is_noop() { + // Property tests synthesize Indirect rules without an attached + // RuleEnvironment, so this path must be a graceful no-op (matching the + // `Use`-without-env contract). The engine logs at `debug!` rather than + // panicking via `debug_assert!` to preserve the never-panics invariant + // exercised by `prop_arbitrary_rule_evaluation_never_panics`. 
+ let mut context = EvaluationContext::new(EvaluationConfig::default()); + let buffer = [0u8; 4]; + let rules = vec![indirect_rule(0, "indirect-no-env", vec![])]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.is_empty(), + "indirect without env must produce no matches" + ); +} + +#[test] +fn test_indirect_recursion_limit() { + // Root rules contain an indirect rule that points back to offset 0, + // creating an infinite re-entry chain. Must surface as + // `RecursionLimitExceeded`, not stack overflow. + let inner_indirect = indirect_rule(0, "recursive-indirect", vec![]); + let root_rules: Vec = vec![inner_indirect]; + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(NameTable::empty()), + root_rules: std::sync::Arc::from(root_rules.as_slice()), + }); + let mut context = EvaluationContext::new(EvaluationConfig::default()).with_rule_env(env); + + let buffer = [0u8; 8]; + let rules = vec![indirect_rule(0, "outer-indirect", vec![])]; + let result = evaluate_rules(&rules, &buffer, &mut context); + assert!( + matches!( + result, + Err(LibmagicError::EvaluationError( + crate::error::EvaluationError::RecursionLimitExceeded { .. } + )) + ), + "infinite indirect recursion must surface RecursionLimitExceeded, got {result:?}" + ); +} + +// ======================================================================= +// MetaType::Offset dispatch (issue #42) +// ======================================================================= + +/// Build an `Offset` rule at `offset` with an `x` (`AnyValue`) operator and +/// the given message. Mirrors `default_rule`/`indirect_rule` helpers. 
+fn offset_rule(offset: i64, message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Meta(MetaType::Offset), + op: Operator::AnyValue, + value: Value::Uint(0), + message: message.to_string(), + children, + level: 0, + strength_modifier: None, + } +} + +#[test] +fn test_offset_emits_match_with_resolved_position() { + let rules = vec![offset_rule(5, "pos=%lld", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 10], &mut context).unwrap(); + assert_eq!(matches.len(), 1, "offset rule must emit exactly one match"); + assert_eq!(matches[0].offset, 5, "match.offset is the resolved offset"); + assert_eq!( + matches[0].value, + Value::Uint(5), + "match.value carries the resolved offset for format substitution" + ); + assert_eq!(matches[0].message, "pos=%lld"); +} + +#[test] +fn test_offset_at_zero() { + // Regression guard: offset 0 must still produce a match (not be + // indistinguishable from "no match"). + let rules = vec![offset_rule(0, "top", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].value, Value::Uint(0)); +} + +#[test] +fn test_offset_out_of_bounds_graceful_skip() { + // Offset past the end of the buffer is a data-dependent skip, not an + // error. Matches the Indirect dispatch's graceful-skip discipline. + let rules = vec![offset_rule(1_000_000, "unreachable", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert!( + matches.is_empty(), + "offset past buffer end must produce no match" + ); +} + +#[test] +fn test_offset_non_x_operator_is_skipped() { + // magic(5) only allows `x` on an `offset` rule. 
Anything else is + // semantically undefined -> debug-log + skip. + let mut rule = offset_rule(0, "bogus", vec![]); + rule.op = Operator::Equal; + rule.value = Value::Uint(5); + let rules = vec![rule]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert!( + matches.is_empty(), + "offset rule with non-AnyValue operator must be skipped" + ); +} + +#[test] +fn test_offset_evaluates_children() { + // A child byte rule at offset 0 runs AFTER the parent offset rule + // fires. The child's own offset is resolved independently. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let mut parent = offset_rule( + 0, + "parent-offset", + vec![byte_eq_rule(0, 0x42, "child-byte")], + ); + // Child level must be deeper than parent per MagicRule::validate. + parent.children[0].level = 1; + let buffer = [0x42u8, 0x00, 0x00]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!(messages, vec!["parent-offset", "child-byte"]); +} + +#[test] +fn test_offset_advances_anchor_for_children() { + // An offset rule at position 5 advances `last_match_end` to 5 *for its + // children* -- but NOT for sibling rules at the same level. This + // matches libmagic's continuation-level semantics: each sibling at + // level L resolves `&N` against the parent-level anchor, not against + // the previous sibling's advance. See the `entry_anchor` discipline + // in `evaluate_rules`. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // A child of the offset rule uses &0 to resolve at the offset rule's + // resolved position (5). buffer[5] = 0x42. 
+ let mut child = byte_eq_rule(0, 0x42, "child-at-offset-anchor"); + child.offset = OffsetSpec::Relative(0); + child.level = 1; + + let buffer = [0x00u8, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00]; + let rules = vec![offset_rule(5, "mark", vec![child])]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + + assert!( + matches + .iter() + .any(|m| m.message == "child-at-offset-anchor"), + "child of offset rule must resolve against offset's anchor (5); got {matches:?}" + ); +} + +#[test] +fn test_offset_does_not_advance_anchor_for_continuation_siblings() { + // Regression guard for the libmagic continuation-sibling anchor + // semantic: two CHILD siblings at the same level resolve `&N` + // against the parent-level anchor, not against the previous + // sibling's advance. This is gated on `recursion_depth > 0`; + // top-level siblings still chain (see + // `relative_anchor_can_decrease_...` in the relative-offset + // integration tests). + // + // Parent `byte` at offset 0 matches 0x01 -> anchor = 1. Two + // child siblings at &0 must both read buffer[1] = 0x42. If the + // first child incorrectly advanced the anchor to 2, the second + // would read buffer[2] = 0x00 and miss. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x01), + message: "parent".to_string(), + children: vec![ + MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x42), + message: "sibling-1".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }, + MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x42), + message: "sibling-2".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }, + ], + level: 0, + strength_modifier: None, + }; + + let buffer = [0x01u8, 0x42, 0x00, 0x00]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["parent", "sibling-1", "sibling-2"], + "both continuation siblings must resolve against parent anchor (1); \ + if sibling-1 advanced the anchor to 2, sibling-2 would read \ + buffer[2]=0x00 and fail" + ); +} + +#[test] +fn test_offset_sets_sibling_matched() { + // An offset rule match suppresses a following `default` sibling -- + // same discipline as any other matching rule. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + offset_rule(0, "offset-match", vec![]), + default_rule("DEFAULT-SUPPRESSED", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["offset-match"], + "default must be suppressed when offset sibling matched; got {matches:?}" + ); +} diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index 364382c..ef52d3f 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -18,6 +18,21 @@ pub mod types; pub use engine::{evaluate_rules, evaluate_rules_with_config, evaluate_single_rule}; +/// Shared environment attached to an [`EvaluationContext`] so the engine can +/// resolve whole-database operations (`Use` subroutine lookups and +/// `indirect` whole-tree re-entry against the root rules). +/// +/// Stored as an `Arc` so cloning a context across recursive calls is cheap +/// and the rule data can be shared safely across threads. +#[derive(Debug, Clone)] +pub(crate) struct RuleEnvironment { + /// Named subroutine table, keyed by identifier. + pub(crate) name_table: std::sync::Arc, + /// Root rules that `indirect` re-enters; NOTE(review): `allow` below looks stale. + #[allow(dead_code)] + pub(crate) root_rules: std::sync::Arc<[crate::parser::ast::MagicRule]>, +} + /// Context for maintaining evaluation state during rule processing /// /// The `EvaluationContext` tracks the current state of rule evaluation, @@ -54,6 +69,22 @@ pub struct EvaluationContext { recursion_depth: u32, /// Configuration settings for evaluation behavior config: EvaluationConfig, + /// Optional rule environment (name table + root rules) threaded from + /// [`MagicDatabase`](crate::MagicDatabase).
Evaluations that come in + /// through the low-level [`evaluate_rules`] / [`evaluate_rules_with_config`] + /// surface (tests, programmatic consumers) run with `rule_env = None`, + /// in which case `MetaType::Use` and `MetaType::Indirect` rules are no-ops. + rule_env: Option>, + /// Base offset applied to absolute offset resolution. + /// + /// Normally 0. When evaluating a subroutine body via `MetaType::Use`, + /// this is set to the use-site offset so that the subroutine's + /// `OffsetSpec::Absolute(n)` rules resolve to `base + n` (matching + /// magic(5) / libmagic semantics: subroutines see offsets relative + /// to the caller's invocation point, not absolute file positions). + /// Restored to the caller's value on subroutine exit via the + /// `BaseOffsetScope` RAII guard in `engine/mod.rs`. + base_offset: usize, } impl EvaluationContext { @@ -79,9 +110,45 @@ impl EvaluationContext { last_match_end: 0, recursion_depth: 0, config, + rule_env: None, + base_offset: 0, } } + /// Read-only access to the subroutine base offset. Non-zero only + /// during a `MetaType::Use` body evaluation. + #[must_use] + pub(crate) const fn base_offset(&self) -> usize { + self.base_offset + } + + /// Set the subroutine base offset. + /// + /// `pub(crate)` and owned by the engine's `BaseOffsetScope` RAII + /// guard -- no external caller should set this directly. + pub(crate) fn set_base_offset(&mut self, offset: usize) { + self.base_offset = offset; + } + + /// Attach a rule environment to this context. + /// + /// The environment carries the name-subroutine table and root rule list + /// so the engine can resolve `MetaType::Use` rules and + /// `MetaType::Indirect` re-entries. Intended to be called once by + /// [`MagicDatabase`](crate::MagicDatabase) before handing the context + /// to [`evaluate_rules`].
+ #[must_use] + pub(crate) fn with_rule_env(mut self, env: std::sync::Arc) -> Self { + self.rule_env = Some(env); + self + } + + /// Read-only access to the attached rule environment, if any. + #[must_use] + pub(crate) fn rule_env(&self) -> Option<&RuleEnvironment> { + self.rule_env.as_deref() + } + /// Get the current offset position /// /// # Returns @@ -236,6 +303,7 @@ impl EvaluationContext { self.current_offset = 0; self.last_match_end = 0; self.recursion_depth = 0; + self.base_offset = 0; } } diff --git a/src/evaluator/offset/mod.rs b/src/evaluator/offset/mod.rs index 3da4561..40c42c8 100644 --- a/src/evaluator/offset/mod.rs +++ b/src/evaluator/offset/mod.rs @@ -117,15 +117,46 @@ pub(crate) fn resolve_offset_with_context( spec: &OffsetSpec, buffer: &[u8], last_match_end: usize, +) -> Result { + resolve_offset_with_base(spec, buffer, last_match_end, 0) +} + +/// Like [`resolve_offset_with_context`] but applies a subroutine +/// `base_offset` to positive absolute offsets. +/// +/// Inside a `MetaType::Use` subroutine body, `OffsetSpec::Absolute(n)` +/// with `n >= 0` resolves to `base_offset + n`, matching magic(5) +/// semantics where the subroutine's offsets are relative to the +/// caller's invocation point. Negative `Absolute`, `FromEnd`, +/// `Relative`, and `Indirect` are unaffected -- they already have +/// well-defined frames of reference (buffer end, previous match, or +/// a pointer read from the buffer). +pub(crate) fn resolve_offset_with_base( + spec: &OffsetSpec, + buffer: &[u8], + last_match_end: usize, + base_offset: usize, ) -> Result { match spec { OffsetSpec::Absolute(offset) => { - resolve_absolute_offset(*offset, buffer).map_err(|e| map_offset_error(&e, *offset)) + // Apply base_offset only to positive absolute offsets. + // Negative values mean "from end" and should not be shifted + // by the subroutine base. 
+ let effective = if *offset >= 0 { + let abs = usize::try_from(*offset).unwrap_or(usize::MAX); + let biased = base_offset.saturating_add(abs); + i64::try_from(biased).unwrap_or(i64::MAX) + } else { + *offset + }; + resolve_absolute_offset(effective, buffer).map_err(|e| map_offset_error(&e, effective)) } OffsetSpec::Indirect { .. } => indirect::resolve_indirect_offset(spec, buffer), OffsetSpec::Relative(_) => relative::resolve_relative_offset(spec, buffer, last_match_end), OffsetSpec::FromEnd(offset) => { - // FromEnd is handled the same as negative Absolute offsets + // FromEnd is handled the same as negative Absolute offsets. + // Base offset does not apply -- "from end" is always + // relative to the buffer itself. resolve_absolute_offset(*offset, buffer).map_err(|e| map_offset_error(&e, *offset)) } } diff --git a/src/evaluator/strength.rs b/src/evaluator/strength.rs index f401da4..f800149 100644 --- a/src/evaluator/strength.rs +++ b/src/evaluator/strength.rs @@ -106,6 +106,38 @@ pub fn calculate_default_strength(rule: &MagicRule) -> i32 { TypeKind::Short { .. } => 10, // Single bytes are least specific TypeKind::Byte { .. } => 5, + // Meta-type directives do not read or compare bytes, so most of + // them contribute no ordering specificity. `Use` and `Indirect` + // get a moderate score because the rules they dispatch into can + // carry real specificity that is opaque from the call site. + // + // `clippy::match_same_arms` is silenced here so the per-variant + // rationale is preserved verbatim instead of being collapsed into + // a single OR-arm: the variants are semantically distinct (each + // dispatches into a different evaluator path) and the explicit + // table is the documentation we want to keep next to the values. + #[allow(clippy::match_same_arms)] + TypeKind::Meta(meta) => match meta { + // `default` must sort below every real rule so it only fires + // when no sibling matched at the current level. 
+ crate::parser::ast::MetaType::Default => 0, + // `clear` is a control-flow toggle with no byte-matching + // specificity of its own. + crate::parser::ast::MetaType::Clear => 0, + // `name` rules are extracted at load time and never sorted at + // eval time; the value is provided for completeness. + crate::parser::ast::MetaType::Name(_) => 0, + // `use` dispatches into a subroutine whose specificity is + // opaque from the call site -- give it a moderate weight so + // it sorts above pure no-ops but below real type-bearing rules. + crate::parser::ast::MetaType::Use(_) => 5, + // `indirect` re-evaluates the root rule set at the resolved + // offset; same rationale as `use` for the moderate weight. + crate::parser::ast::MetaType::Indirect => 5, + // `offset` reports the current file offset rather than reading + // a typed value -- no byte-matching specificity. + crate::parser::ast::MetaType::Offset => 0, + }, }; // Operator contribution: equality is most specific @@ -1041,4 +1073,87 @@ mod tests { "Absolute strength {absolute_strength} should be > relative strength {relative_strength}" ); } + + // ============================================================ + // MetaType strength tests + // ============================================================ + + fn meta_rule(meta: crate::parser::ast::MetaType, msg: &str) -> MagicRule { + let mut rule = make_rule( + TypeKind::Meta(meta), + Operator::Equal, + OffsetSpec::Absolute(0), + Value::Uint(0), + ); + rule.message = msg.to_string(); + rule + } + + #[test] + fn test_meta_default_and_clear_sort_to_bottom() { + use crate::parser::ast::MetaType; + let mut rules = vec![ + meta_rule(MetaType::Default, "default"), + meta_rule(MetaType::Clear, "clear"), + { + let mut r = make_rule( + TypeKind::Byte { signed: true }, + Operator::Equal, + OffsetSpec::Absolute(0), + Value::Uint(0), + ); + r.message = "byte".to_string(); + r + }, + ]; + + sort_rules_by_strength(&mut rules); + + // Byte rule has nonzero strength; default/clear are 
0 + Equal 10 + + // Absolute 10 + numeric 0 = 20. Byte is 5 + Equal 10 + Absolute 10 + // = 25 -- so byte sorts first. + assert_eq!(rules[0].message, "byte"); + } + + #[test] + fn test_meta_use_and_indirect_sort_above_default() { + use crate::parser::ast::MetaType; + let use_rule = meta_rule(MetaType::Use("sub".to_string()), "use"); + let indirect_rule = meta_rule(MetaType::Indirect, "indirect"); + let default_rule = meta_rule(MetaType::Default, "default"); + let clear_rule = meta_rule(MetaType::Clear, "clear"); + + // use/indirect strength: 5 + Equal 10 + Absolute 10 = 25 + // default/clear strength: 0 + Equal 10 + Absolute 10 = 20 + assert!( + calculate_default_strength(&use_rule) > calculate_default_strength(&default_rule), + "use should sort above default" + ); + assert!( + calculate_default_strength(&indirect_rule) > calculate_default_strength(&default_rule), + "indirect should sort above default" + ); + assert!( + calculate_default_strength(&use_rule) > calculate_default_strength(&clear_rule), + "use should sort above clear" + ); + assert!( + calculate_default_strength(&indirect_rule) > calculate_default_strength(&clear_rule), + "indirect should sort above clear" + ); + } + + #[test] + fn test_meta_name_strength_is_zero() { + use crate::parser::ast::MetaType; + let name_rule = meta_rule(MetaType::Name("foo".to_string()), "name"); + let default_rule = meta_rule(MetaType::Default, "default"); + // Both Name and Default should produce identical strength scores + // (both contribute 0 from the type axis). 
+ assert_eq!( + calculate_default_strength(&name_rule), + calculate_default_strength(&default_rule), + "Name strength should equal Default strength (both type-axis 0)" + ); + } } diff --git a/src/evaluator/types/mod.rs b/src/evaluator/types/mod.rs index 132758f..6b94044 100644 --- a/src/evaluator/types/mod.rs +++ b/src/evaluator/types/mod.rs @@ -198,7 +198,21 @@ pub fn read_typed_value_with_pattern( TypeKind::Double { endian } => read_double(buffer, offset, *endian), TypeKind::Date { endian, utc } => read_date(buffer, offset, *endian, *utc), TypeKind::QDate { endian, utc } => read_qdate(buffer, offset, *endian, *utc), - TypeKind::String { max_length } => read_string(buffer, offset, *max_length), + TypeKind::String { max_length } => { + // libmagic semantics: `string TEST` compares the first + // pattern-length bytes of the buffer against the literal, + // *not* a NUL-terminated C-string. When the rule specifies + // no explicit `max_length` but the operand is a `Value::String` + // literal, fall back to the pattern's byte length so the + // read/compare path matches GNU `file` on NUL-free inputs. + // An explicit `max_length` on the rule always wins. + let effective_max = match (max_length, pattern) { + (Some(n), _) => Some(*n), + (None, Some(Value::String(p))) => Some(p.len()), + (None, _) => None, + }; + read_string(buffer, offset, effective_max) + } TypeKind::PString { max_length, length_width, @@ -240,6 +254,9 @@ pub fn read_typed_value_with_pattern( Ok(read_search(buffer, offset, pattern_bytes, *range)? 
.unwrap_or_else(|| Value::String(String::new()))) } + TypeKind::Meta(meta) => Err(TypeReadError::UnsupportedType { + type_name: format!("meta-type {meta:?} cannot be read as a value"), + }), } } @@ -291,6 +308,9 @@ pub(crate) fn read_pattern_match( }; read_search(buffer, offset, pattern_bytes, *range) } + TypeKind::Meta(meta) => Err(TypeReadError::UnsupportedType { + type_name: format!("meta-type {meta:?} cannot be read as a pattern match"), + }), _ => Err(TypeReadError::UnsupportedType { type_name: format!("read_pattern_match called on non-pattern type: {type_kind:?}"), }), @@ -433,7 +453,35 @@ pub(crate) fn bytes_consumed_with_pattern( } match type_kind { - TypeKind::String { max_length } => string_bytes_consumed(buffer, offset, *max_length), + TypeKind::String { max_length } => { + // For the (`max_length: None`, string literal pattern) + // combination we now compare exactly `pattern.len()` bytes + // in `read_typed_value_with_pattern` (libmagic semantics). + // Keep the NUL-terminator inclusion that the chained-record + // tests rely on by peeking at the byte immediately after + // the pattern window: if it is NUL, consume one extra + // byte; otherwise stop at the pattern boundary. Explicit + // `max_length` rules and non-string patterns keep the + // original NUL-scan behavior. 
+ match (max_length, pattern) { + (Some(n), _) => string_bytes_consumed(buffer, offset, Some(*n)), + (None, Some(Value::String(p))) => { + let plen = p.len(); + let base = offset + .checked_add(plen) + .map_or(0, |end| if end > buffer.len() { 0 } else { plen }); + if base == 0 { + 0 + } else { + match buffer.get(offset.saturating_add(plen)) { + Some(&0) => base.saturating_add(1), + _ => base, + } + } + } + (None, _) => string_bytes_consumed(buffer, offset, None), + } + } TypeKind::PString { max_length, length_width, @@ -502,6 +550,12 @@ pub(crate) fn bytes_consumed_with_pattern( ); 0 } + // Meta-type directives do not consume buffer bytes; the anchor + // should not advance when a meta rule is encountered. Per the + // GOTCHAS S2.1 checklist, listing them explicitly (rather than + // relying on a `_ =>` wildcard) keeps the match exhaustive so + // any future `TypeKind` variant triggers a compile error. + TypeKind::Meta(_) => 0, } } diff --git a/src/lib.rs b/src/lib.rs index 9f79fde..7abd98c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -151,6 +151,15 @@ impl From for LibmagicError { #[derive(Debug)] pub struct MagicDatabase { rules: Vec, + /// Named subroutine definitions extracted from magic file `name` rules, + /// keyed by identifier. The evaluator consults this table when a rule of + /// type `TypeKind::Meta(MetaType::Use(name))` is reached. + name_table: std::sync::Arc, + /// Top-level rules retained as a shared immutable slice. Passed through + /// the evaluation context as part of the rule environment so future + /// whole-database operations (e.g. `indirect`) can re-enter at the root + /// without re-sorting or cloning the rule tree. + root_rules: std::sync::Arc<[MagicRule]>, config: EvaluationConfig, /// Optional path to the source magic file or directory from which rules were loaded. /// This is used for debugging and logging purposes. 
@@ -241,8 +250,11 @@ impl MagicDatabase { config.validate()?; let mut rules = crate::builtin_rules::get_builtin_rules(); crate::evaluator::strength::sort_rules_by_strength_recursive(&mut rules); + let root_rules: std::sync::Arc<[MagicRule]> = std::sync::Arc::from(rules.as_slice()); Ok(Self { rules, + name_table: std::sync::Arc::new(crate::parser::name_table::NameTable::empty()), + root_rules, config, source_path: None, mime_mapper: mime::MimeMapper::new(), @@ -296,14 +308,27 @@ impl MagicDatabase { config: EvaluationConfig, ) -> Result { config.validate()?; - let mut rules = parser::load_magic_file(path.as_ref()).map_err(|e| match e { + let parsed = parser::load_magic_file(path.as_ref()).map_err(|e| match e { ParseError::IoError(io_err) => LibmagicError::IoError(io_err), other => LibmagicError::ParseError(other), })?; + let parser::ParsedMagic { + mut rules, + mut name_table, + } = parsed; crate::evaluator::strength::sort_rules_by_strength_recursive(&mut rules); + // Each named subroutine body must be sorted by the same strength + // ordering so evaluation of a `use` site is deterministic and + // matches the ordering applied to top-level rules. + for subroutine in name_table.values_mut() { + crate::evaluator::strength::sort_rules_by_strength_recursive(subroutine); + } + let root_rules: std::sync::Arc<[MagicRule]> = std::sync::Arc::from(rules.as_slice()); Ok(Self { rules, + name_table: std::sync::Arc::new(name_table), + root_rules, config, source_path: Some(path.as_ref().to_path_buf()), mime_mapper: mime::MimeMapper::new(), @@ -361,7 +386,6 @@ impl MagicDatabase { /// # Ok::<(), Box>(()) /// ``` pub fn evaluate_file>(&self, path: P) -> Result { - use crate::evaluator::evaluate_rules_with_config; use crate::io::FileBuffer; use std::fs; use std::time::Instant; @@ -387,11 +411,12 @@ impl MagicDatabase { let file_buffer = FileBuffer::from_path_and_metadata(path, &file_metadata)?; let buffer = file_buffer.as_slice(); - // Evaluate rules against the file buffer. 
`evaluate_rules_with_config` - // returns `Ok(vec![])` for an empty rule list, so no guard is needed. - let matches = evaluate_rules_with_config(&self.rules, buffer, &self.config)?; - - Ok(self.build_result(matches, file_size, start_time)) + // Route the evaluation through `evaluate_buffer_internal` so the + // rule environment (name table + root rules) is attached to the + // context identically for in-memory and on-disk paths. + let mut result = self.evaluate_buffer_internal(buffer, start_time)?; + result.metadata.file_size = file_size; + Ok(result) } /// Evaluate magic rules against an in-memory buffer @@ -429,13 +454,28 @@ impl MagicDatabase { buffer: &[u8], start_time: std::time::Instant, ) -> Result { - use crate::evaluator::evaluate_rules_with_config; + use crate::evaluator::{EvaluationContext, RuleEnvironment, evaluate_rules}; let file_size = buffer.len() as u64; - // `evaluate_rules_with_config` returns `Ok(vec![])` for an empty - // rule list, so no `is_empty()` guard is needed here. - let matches = evaluate_rules_with_config(&self.rules, buffer, &self.config)?; + // Validate config once at the entry point to match the previous + // behavior of `evaluate_rules_with_config`. + self.config.validate()?; + + // Reset the thread-local regex compile cache so it is bounded to + // the lifetime of a single top-level evaluation call. + crate::evaluator::types::regex::reset_regex_cache(); + + let env = std::sync::Arc::new(RuleEnvironment { + name_table: self.name_table.clone(), + root_rules: self.root_rules.clone(), + }); + + let mut context = EvaluationContext::new(self.config.clone()).with_rule_env(env); + + // `evaluate_rules` returns `Ok(vec![])` for an empty rule list, + // so no `is_empty()` guard is needed here. 
+ let matches = evaluate_rules(&self.rules, buffer, &mut context)?; Ok(self.build_result(matches, file_size, start_time)) } @@ -484,20 +524,32 @@ impl MagicDatabase { /// Concatenate match messages following libmagic behavior /// - /// Messages are joined with spaces, except when a message starts with - /// backspace character (\\b) which suppresses the space. + /// Each match's `message` is first run through + /// [`crate::output::format::format_magic_message`], which substitutes + /// printf-style specifiers (`%lld`, `%02x`, `%s`, etc.) with the + /// rule's read value. The resulting rendered strings are then joined + /// with spaces, except when a rendered string starts with the + /// backspace character (`\b`, U+0008) which suppresses both the + /// separating space and the backspace itself (GOTCHAS.md S14.1). + /// + /// The backspace check runs on the *post-substitution* text so rules + /// like `\b, version %s` compose correctly once the specifier has been + /// rendered. fn concatenate_messages(matches: &[evaluator::RuleMatch]) -> String { + use crate::output::format::format_magic_message; + let capacity: usize = matches.iter().map(|m| m.message.len() + 1).sum(); let mut result = String::with_capacity(capacity); for m in matches { - if let Some(rest) = m.message.strip_prefix('\u{0008}') { + let rendered = format_magic_message(&m.message, &m.value, &m.type_kind); + if let Some(rest) = rendered.strip_prefix('\u{0008}') { // Backspace suppresses the space and the character itself result.push_str(rest); } else if !result.is_empty() { result.push(' '); - result.push_str(&m.message); + result.push_str(&rendered); } else { - result.push_str(&m.message); + result.push_str(&rendered); } } result diff --git a/src/output/format.rs b/src/output/format.rs new file mode 100644 index 0000000..4c9decc --- /dev/null +++ b/src/output/format.rs @@ -0,0 +1,623 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! 
Printf-style format specifier substitution for magic rule messages. +//! +//! Magic file messages frequently contain C-style format specifiers such as +//! `%lld`, `%02x`, or `%s` that reference the rule's read value. GNU `file` +//! renders the message with the value substituted at the specifier's +//! position; without this pass libmagic-rs would emit the literal +//! specifier tokens (e.g., `at_offset %lld`) and diverge visibly from +//! `file(1)` output. +//! +//! The substitution is intentionally narrow: it supports the subset of +//! C's `printf` syntax that appears in shipping magic corpora (notably +//! `third_party/tests/searchbug.magic` and the GNU `file` `Magdir` +//! collection). Unrecognized specifiers pass through literally with a +//! `debug!` log rather than erroring -- matching the evaluator's +//! graceful-skip discipline. +//! +//! Width masking for hex specifiers uses [`crate::parser::ast::TypeKind::bit_width`] +//! so that e.g. a signed byte rendered with `%02x` produces the unsigned +//! 8-bit interpretation (`0xff`, not `0xffffffffffffffff`). +//! +//! See the project plan at +//! `docs/plans/2026-04-22-001-feat-meta-type-offset-and-format-substitution-plan.md` +//! for scope, and GOTCHAS.md S14.2 for historical context. + +use log::debug; + +use crate::parser::ast::{TypeKind, Value}; + +/// Substitute printf-style format specifiers in a magic rule message. +/// +/// Walks `template` left to right. Plain text is copied verbatim; on +/// each `%`, the full specifier (`%[flags][width][.precision][length]`) +/// is parsed and substituted from `value`. `%%` emits a single `%`. +/// Unrecognized or malformed specifiers are passed through literally +/// with a `debug!` log. +/// +/// `type_kind` is consulted only for hex specifiers, which need the +/// natural bit width of the underlying read to mask sign-extended +/// values correctly. For non-hex specifiers `type_kind` is ignored. 
+///
+/// # Examples
+///
+/// ```
+/// use libmagic_rs::output::format::format_magic_message;
+/// use libmagic_rs::parser::ast::{TypeKind, Value};
+///
+/// let out = format_magic_message(
+///     "at_offset %lld",
+///     &Value::Uint(11),
+///     &TypeKind::Byte { signed: false },
+/// );
+/// assert_eq!(out, "at_offset 11");
+///
+/// let out = format_magic_message(
+///     "followed_by 0x%02x",
+///     &Value::Uint(0x31),
+///     &TypeKind::Byte { signed: false },
+/// );
+/// assert_eq!(out, "followed_by 0x31");
+///
+/// // Unknown specifier falls through literally.
+/// let out = format_magic_message("%q", &Value::Uint(0), &TypeKind::Byte { signed: false });
+/// assert_eq!(out, "%q");
+///
+/// // `%%` is an escaped literal percent.
+/// let out = format_magic_message("100%% sure", &Value::Uint(0), &TypeKind::Byte { signed: false });
+/// assert_eq!(out, "100% sure");
+/// ```
+#[must_use]
+pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) -> String {
+    let mut out = String::with_capacity(template.len());
+    let bytes = template.as_bytes();
+    let mut i = 0;
+
+    while i < bytes.len() {
+        if bytes[i] != b'%' {
+            // Copy the whole literal run up to the next `%` as a `&str`
+            // slice. `%` (0x25) is ASCII and never occurs inside a
+            // multi-byte UTF-8 sequence, so the run is cut on char
+            // boundaries and non-ASCII text is copied intact.
+            let run = template[i..].split('%').next().unwrap_or_default();
+            out.push_str(run);
+            i += run.len();
+            continue;
+        }
+
+        // Start of a format specifier at position i.
+        let spec_start = i;
+        let Some(parsed_spec) = parse_spec(bytes, i + 1) else {
+            // Malformed specifier (e.g., trailing `%` with nothing after,
+            // or a sequence that doesn't end in a valid conversion char).
+            // Pass through the remaining literal and stop scanning.
+            debug!(
+                "format_magic_message: malformed specifier at byte {i} in template {template:?}; passing through remainder literally",
+            );
+            out.push_str(&template[i..]);
+            break;
+        };
+        let next_i = parsed_spec.end;
+        if let Some(rendered) = render(&parsed_spec, value, type_kind) {
+            out.push_str(&rendered);
+        } else {
+            // Type mismatch or unsupported conversion; pass through the
+            // literal specifier and log.
+            let literal = &template[spec_start..next_i];
+            debug!(
+                "format_magic_message: unsupported specifier {literal:?} for value {value:?}; passing through literally",
+            );
+            out.push_str(literal);
+        }
+        i = next_i;
+    }
+
+    out
+}
+
+/// Kinds of conversion characters we recognize.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Conv {
+    /// `%d`, `%i`, `%ld`, `%lld` -- signed decimal.
+    Signed,
+    /// `%u`, `%lu`, `%llu` -- unsigned decimal.
+    Unsigned,
+    /// `%x` -- lowercase hex.
+    HexLower,
+    /// `%X` -- uppercase hex.
+    HexUpper,
+    /// `%o` -- octal.
+    Octal,
+    /// `%s` -- string.
+    Str,
+    /// `%c` -- single character (from an integer codepoint, ASCII range only).
+    Char,
+    /// `%%` -- literal percent.
+    Percent,
+}
+
+/// Parsed format specifier.
+#[derive(Debug, Clone)]
+struct Spec {
+    zero_pad: bool,
+    left_align: bool,
+    alt_form: bool,
+    width: usize,
+    conv: Conv,
+    /// Byte index of the character *after* this specifier in the template.
+    end: usize,
+}
+
+/// Parse a format specifier starting at `start` (the first byte after the
+/// leading `%`). Returns `None` if the sequence does not end in a
+/// recognized conversion character.
+fn parse_spec(bytes: &[u8], start: usize) -> Option<Spec> {
+    let mut i = start;
+    let mut zero_pad = false;
+    let mut left_align = false;
+    let mut alt_form = false;
+
+    // Flags (subset: 0, -, #). Other flags (+, space) are parsed but ignored.
+ while i < bytes.len() { + match bytes[i] { + b'0' => { + zero_pad = true; + i += 1; + } + b'-' => { + left_align = true; + i += 1; + } + b'#' => { + alt_form = true; + i += 1; + } + b'+' | b' ' => { + // Accepted for syntactic completeness, no rendering effect + // in the current subset. + i += 1; + } + _ => break, + } + } + + // Width (decimal digits). + let mut width: usize = 0; + while i < bytes.len() && bytes[i].is_ascii_digit() { + let digit = (bytes[i] - b'0') as usize; + width = width.saturating_mul(10).saturating_add(digit); + i += 1; + } + + // Precision (`.`): parsed and skipped -- no current consumer + // requires precision handling, and numeric rendering is whole-value. + if i < bytes.len() && bytes[i] == b'.' { + i += 1; + while i < bytes.len() && bytes[i].is_ascii_digit() { + i += 1; + } + } + + // Length modifier (`h`, `hh`, `l`, `ll`, `j`, `z`, `t`). We consume + // these for syntactic completeness but never rely on them -- all + // numeric rendering uses full u64/i64 width. + while i < bytes.len() { + match bytes[i] { + b'l' | b'h' | b'j' | b'z' | b't' => i += 1, + _ => break, + } + } + + if i >= bytes.len() { + return None; + } + + let conv = match bytes[i] { + b'd' | b'i' => Conv::Signed, + b'u' => Conv::Unsigned, + b'x' => Conv::HexLower, + b'X' => Conv::HexUpper, + b'o' => Conv::Octal, + b's' => Conv::Str, + b'c' => Conv::Char, + b'%' => Conv::Percent, + _ => return None, + }; + i += 1; + + Some(Spec { + zero_pad, + left_align, + alt_form, + width, + conv, + end: i, + }) +} + +/// Render the specifier against `value`, or return `None` if the value +/// is type-incompatible with the conversion. 
+fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option { + match spec.conv { + Conv::Percent => Some("%".to_string()), + Conv::Str => Some(render_string(value)), + Conv::Signed => { + let n = coerce_to_i64(value)?; + Some(pad_numeric(&n.to_string(), spec)) + } + Conv::Unsigned => { + let n = coerce_to_u64(value)?; + Some(pad_numeric(&n.to_string(), spec)) + } + Conv::HexLower => { + let n = coerce_to_u64_masked(value, type_kind)?; + Some(render_prefixed_int( + &format!("{n:x}"), + if spec.alt_form { "0x" } else { "" }, + spec, + )) + } + Conv::HexUpper => { + let n = coerce_to_u64_masked(value, type_kind)?; + Some(render_prefixed_int( + &format!("{n:X}"), + if spec.alt_form { "0x" } else { "" }, + spec, + )) + } + Conv::Octal => { + let n = coerce_to_u64_masked(value, type_kind)?; + Some(render_prefixed_int( + &format!("{n:o}"), + if spec.alt_form { "0o" } else { "" }, + spec, + )) + } + Conv::Char => { + let n = coerce_to_u64(value)?; + let byte = u8::try_from(n).ok()?; + if byte > 0x7f { + return None; + } + Some(pad_numeric(&(byte as char).to_string(), spec)) + } + } +} + +/// Render a [`Value`] for `%s`. Strings pass through; byte sequences are +/// converted via lossy UTF-8; numbers render as decimal (GNU `file` does +/// the same for mixed-type `%s` substitutions). +fn render_string(value: &Value) -> String { + match value { + Value::String(s) => s.clone(), + Value::Bytes(b) => String::from_utf8_lossy(b).into_owned(), + Value::Uint(n) => n.to_string(), + Value::Int(n) => n.to_string(), + Value::Float(f) => f.to_string(), + } +} + +/// Coerce a numeric-ish [`Value`] to `i64`. Float values are truncated +/// toward zero (documented intent -- matches C's `(long long)float` +/// semantics that libmagic's `printf` path relies on). String/Bytes +/// values have no sensible mapping and return `None`. 
+#[allow( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::cast_possible_wrap +)] +fn coerce_to_i64(value: &Value) -> Option { + match value { + Value::Int(n) => Some(*n), + // u64 -> i64 bit-pattern reinterpret: matches C's implicit + // cast in `printf("%lld", (unsigned long long)...)`. + Value::Uint(n) => Some(*n as i64), + // f64 -> i64 truncation toward zero, matching C behavior for + // `printf("%d", (double)...)`. + Value::Float(f) => Some(*f as i64), + Value::String(_) | Value::Bytes(_) => None, + } +} + +/// Coerce a numeric-ish [`Value`] to `u64`. Mirrors [`coerce_to_i64`] +/// but preserves the unsigned bit pattern when the source is signed. +#[allow( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::cast_precision_loss +)] +fn coerce_to_u64(value: &Value) -> Option { + match value { + Value::Uint(n) => Some(*n), + // i64 -> u64 bit-pattern reinterpret for rendering; parallels + // the `coerce_to_i64` case. + Value::Int(n) => Some(*n as u64), + Value::Float(f) => Some(*f as u64), + Value::String(_) | Value::Bytes(_) => None, + } +} + +/// Coerce a numeric-ish [`Value`] to `u64`, masked to the natural bit +/// width of `type_kind`. Used by hex/octal specifiers to avoid +/// surprising sign-extended renderings like `byte = -1` rendering as +/// `ffffffffffffffff` when the user expected `ff`. +fn coerce_to_u64_masked(value: &Value, type_kind: &TypeKind) -> Option { + let raw = coerce_to_u64(value)?; + let mask = match type_kind.bit_width() { + Some(8) => 0xff_u64, + Some(16) => 0xffff_u64, + Some(32) => 0xffff_ffff_u64, + // 64-bit, unknown width, or any other case: no mask needed. + _ => return Some(raw), + }; + Some(raw & mask) +} + +/// Render a numeric body with an alt-form prefix (`0x` / `0o` / empty), +/// applying width and padding correctly. +/// +/// For zero-padded widths (`%#0Nx`), C printf inserts zeros *between* +/// the prefix and the digits: `%#06x` + `0xab` -> `0x00ab`, not +/// ` 0xab`. 
For space-padded widths (`%#Nx`), the spaces go *before* +/// the prefix: `%#6x` + `0xab` -> ` 0xab`. For left-aligned widths +/// (`%-#6x`), trailing spaces follow the digits: `0xab `. +fn render_prefixed_int(digits: &str, prefix: &str, spec: &Spec) -> String { + // The effective body length for width comparison is prefix + digits. + let body_len = prefix.len() + digits.len(); + if body_len >= spec.width { + return format!("{prefix}{digits}"); + } + let pad = spec.width - body_len; + if spec.zero_pad && !spec.left_align { + // Zeros insert between the prefix and the digits. + let zeros: String = std::iter::repeat_n('0', pad).collect(); + format!("{prefix}{zeros}{digits}") + } else if spec.left_align { + let spaces: String = std::iter::repeat_n(' ', pad).collect(); + format!("{prefix}{digits}{spaces}") + } else { + let spaces: String = std::iter::repeat_n(' ', pad).collect(); + format!("{spaces}{prefix}{digits}") + } +} + +/// Apply width and padding to an already-rendered numeric body. +fn pad_numeric(body: &str, spec: &Spec) -> String { + if body.len() >= spec.width { + return body.to_string(); + } + let pad = spec.width - body.len(); + let pad_char = if spec.zero_pad && !spec.left_align { + '0' + } else { + ' ' + }; + let padding: String = std::iter::repeat_n(pad_char, pad).collect(); + if spec.left_align { + format!("{body}{padding}") + } else { + format!("{padding}{body}") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn byte_t() -> TypeKind { + TypeKind::Byte { signed: false } + } + + fn long_t() -> TypeKind { + TypeKind::Long { + endian: crate::parser::ast::Endianness::Little, + signed: true, + } + } + + // ---- happy path -------------------------------------------------- + + #[test] + fn test_signed_decimal_substitution() { + // Covers %d, %i, %ld, %lld (length modifiers are accepted and ignored). 
+ let cases = [ + ("v=%d", Value::Int(-7), "v=-7"), + ("v=%i", Value::Int(42), "v=42"), + ("v=%ld", Value::Int(10), "v=10"), + ("at_offset %lld", Value::Uint(11), "at_offset 11"), + ]; + for (tmpl, val, expected) in cases { + assert_eq!( + format_magic_message(tmpl, &val, &byte_t()), + expected, + "template {tmpl:?} with value {val:?}", + ); + } + } + + #[test] + fn test_unsigned_decimal_substitution() { + let out = format_magic_message("n=%u", &Value::Uint(200), &byte_t()); + assert_eq!(out, "n=200"); + + // i64::MIN as unsigned should come through as 2^63. + let out = format_magic_message("n=%llu", &Value::Int(i64::MIN), &long_t()); + assert_eq!(out, "n=9223372036854775808"); + } + + #[test] + fn test_hex_substitution_with_byte_width_masking() { + // The canonical searchbug.result case: ubyte `%02x`. + let out = format_magic_message("0x%02x", &Value::Uint(0x31), &byte_t()); + assert_eq!(out, "0x31"); + + // Byte -1 (sign-extended to u64::MAX in Value::Int) must render as "ff", + // not "ffffffffffffffff", when the underlying type is a byte. + let out = format_magic_message("0x%02x", &Value::Int(-1), &byte_t()); + assert_eq!(out, "0xff"); + + // %X is uppercase. + let out = format_magic_message("%X", &Value::Uint(0xdead_beef), &long_t()); + assert_eq!(out, "DEADBEEF"); + + // %#x emits the "0x" prefix via alt form. + let out = format_magic_message("%#x", &Value::Uint(0xab), &byte_t()); + assert_eq!(out, "0xab"); + + // %#06x: zero-pad inserts between prefix and digits (C printf semantics), + // not before the prefix. Regression guard for correctness review COR-002. + let out = format_magic_message("%#06x", &Value::Uint(0xab), &byte_t()); + assert_eq!(out, "0x00ab"); + + // Space-padded width with alt-form prefix: spaces go before prefix. + let out = format_magic_message("%#6x", &Value::Uint(0xab), &byte_t()); + assert_eq!(out, " 0xab"); + + // Left-aligned with alt-form prefix: spaces trail the digits. 
+ let out = format_magic_message("%-#6x|", &Value::Uint(0xab), &byte_t()); + assert_eq!(out, "0xab |"); + + // %#08o: zero-pad inserts between "0o" prefix and digits. + let out = format_magic_message("%#08o", &Value::Uint(8), &byte_t()); + assert_eq!(out, "0o000010"); + } + + #[test] + fn test_string_substitution() { + let out = format_magic_message( + "hello %s", + &Value::String("world".to_string()), + &TypeKind::String { max_length: None }, + ); + assert_eq!(out, "hello world"); + + // Bytes go through lossy UTF-8. + let out = format_magic_message( + "data=%s", + &Value::Bytes(b"abc".to_vec()), + &TypeKind::String { max_length: None }, + ); + assert_eq!(out, "data=abc"); + } + + #[test] + fn test_octal_substitution() { + let out = format_magic_message("%o", &Value::Uint(8), &byte_t()); + assert_eq!(out, "10"); + let out = format_magic_message("%#o", &Value::Uint(8), &byte_t()); + assert_eq!(out, "0o10"); + } + + #[test] + fn test_char_substitution() { + let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t()); + assert_eq!(out, "[A]"); + } + + #[test] + fn test_percent_escape() { + let out = format_magic_message("100%% sure", &Value::Uint(0), &byte_t()); + assert_eq!(out, "100% sure"); + } + + #[test] + fn test_multiple_specifiers_in_one_template() { + // Note: current implementation binds every specifier to the single + // `value`; multiple specifiers are rendered against the same value. + // This matches libmagic's single-argument model -- magic rules only + // expose one read value per rule. + let out = format_magic_message("a=%d b=%d", &Value::Int(5), &long_t()); + assert_eq!(out, "a=5 b=5"); + } + + #[test] + fn test_width_padding() { + // Zero-padded width. + let out = format_magic_message("%05d", &Value::Int(42), &long_t()); + assert_eq!(out, "00042"); + // Space-padded width. + let out = format_magic_message("%5d", &Value::Int(42), &long_t()); + assert_eq!(out, " 42"); + // Left-aligned (zero flag ignored when `-` is set). 
+ let out = format_magic_message("%-5d|", &Value::Int(42), &long_t()); + assert_eq!(out, "42 |"); + } + + // ---- edge cases -------------------------------------------------- + + #[test] + fn test_empty_template() { + assert_eq!( + format_magic_message("", &Value::Uint(0), &byte_t()), + String::new() + ); + } + + #[test] + fn test_literal_with_no_specifiers() { + assert_eq!( + format_magic_message("hello world", &Value::Uint(0), &byte_t()), + "hello world" + ); + } + + #[test] + fn test_trailing_percent_with_no_spec() { + // A stray `%` at end-of-string: pass through literally. + let out = format_magic_message("done %", &Value::Uint(0), &byte_t()); + assert_eq!(out, "done %"); + } + + #[test] + fn test_unknown_specifier_pass_through() { + // `%q` is not in our subset. + let out = format_magic_message("bad %q end", &Value::Uint(0), &byte_t()); + assert_eq!(out, "bad %q end"); + } + + #[test] + fn test_type_mismatch_string_conv_on_uint_still_renders() { + // `%s` against an integer value -- GNU `file` renders the number + // as decimal; libmagic-rs matches that behavior via `render_string`. + let out = format_magic_message("v=%s", &Value::Uint(42), &byte_t()); + assert_eq!(out, "v=42"); + } + + #[test] + fn test_type_mismatch_numeric_conv_on_string_passes_through() { + // `%d` against a string has no sensible coercion -> literal. + let out = format_magic_message( + "v=%d", + &Value::String("hi".to_string()), + &TypeKind::String { max_length: None }, + ); + assert_eq!(out, "v=%d"); + } + + #[test] + fn test_char_specifier_rejects_non_ascii() { + // Values above 0x7f cannot be rendered as `%c` -> pass through literally. + let out = format_magic_message("[%c]", &Value::Uint(0xff), &byte_t()); + assert_eq!(out, "[%c]"); + } + + #[test] + fn test_byte_width_masking_on_negative_signed_byte() { + // Regression guard: a signed byte carrying -1 (the representation + // on the Value side is Int(-1)) must NOT render as a 64-bit mask. 
+ let out = format_magic_message("%x", &Value::Int(-1), &byte_t()); + assert_eq!(out, "ff"); + } + + #[test] + fn test_hex_width_masking_respects_16bit() { + let short_t = TypeKind::Short { + endian: crate::parser::ast::Endianness::Little, + signed: true, + }; + let out = format_magic_message("%x", &Value::Int(-1), &short_t); + assert_eq!(out, "ffff"); + } +} diff --git a/src/output/mod.rs b/src/output/mod.rs index be0db8c..d4089e3 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -9,6 +9,7 @@ //! The module follows a structured approach where evaluation results contain metadata //! about the evaluation process and a list of matches found during rule processing. +pub mod format; pub mod json; pub mod text; diff --git a/src/parser/ast.rs b/src/parser/ast.rs index e92b595..f10a384 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -165,6 +165,99 @@ pub enum OffsetSpec { FromEnd(i64), } +/// Control-flow directive carried by [`TypeKind::Meta`]. +/// +/// These are not value-reading types -- they correspond to magic(5) +/// control-flow keywords (`default`, `clear`, `name`, `use`, `indirect`) +/// that modify how a rule set is traversed rather than which bytes are +/// read from the buffer. In this phase they are parsed and preserved in +/// the AST but evaluated as silent no-ops; subsequent phases will wire +/// each variant into the evaluator. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[non_exhaustive] +pub enum MetaType { + /// `default` directive: fires when no sibling at the same indentation + /// level has matched at the current offset. See magic(5) for the + /// "default" type semantics. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::MetaType; + /// let meta = MetaType::Default; + /// assert_eq!(meta, MetaType::Default); + /// ``` + Default, + /// `clear` directive: resets the sibling-matched flag so a later + /// `default` sibling can fire even if an earlier sibling matched. 
+ /// See magic(5) for the "clear" type semantics. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::MetaType; + /// let meta = MetaType::Clear; + /// assert_eq!(meta, MetaType::Clear); + /// ``` + Clear, + /// `name ` directive: declares a named subroutine that + /// can be invoked later via [`MetaType::Use`]. See magic(5) for the + /// "name" type semantics. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::MetaType; + /// let meta = MetaType::Name("part2".to_string()); + /// assert_eq!(meta, MetaType::Name("part2".to_string())); + /// ``` + Name(String), + /// `use ` directive: invokes a named subroutine + /// previously declared via [`MetaType::Name`]. See magic(5) for the + /// "use" type semantics. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::MetaType; + /// let meta = MetaType::Use("part2".to_string()); + /// assert_eq!(meta, MetaType::Use("part2".to_string())); + /// ``` + Use(String), + /// `indirect` directive: re-applies the entire magic database at the + /// resolved offset. See magic(5) for the "indirect" type semantics. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::MetaType; + /// let meta = MetaType::Indirect; + /// assert_eq!(meta, MetaType::Indirect); + /// ``` + Indirect, + /// `offset` type keyword: reports the current file offset rather than + /// reading a typed value from the buffer. See magic(5) for the + /// "offset" type semantics. + /// + /// Evaluation: the engine resolves the rule's offset specification + /// to an absolute position and emits a `RuleMatch` whose `value` is + /// `Value::Uint(position)`. Message templates can reference that + /// value through printf-style format specifiers (e.g. `%lld`), + /// which are substituted by + /// [`crate::output::format::format_magic_message`] at description- + /// assembly time. The only supported operator is `x` (`AnyValue`); + /// any other operator is `debug!`-logged and skipped. 
+ /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::MetaType; + /// let meta = MetaType::Offset; + /// assert_eq!(meta, MetaType::Offset); + /// ``` + Offset, +} + /// Data type specifications for interpreting bytes #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[non_exhaustive] @@ -408,6 +501,22 @@ pub enum TypeKind { /// Scan window width in bytes, starting at the rule's offset. range: NonZeroUsize, }, + /// Control-flow directive (`default`, `clear`, `name`, `use`, `indirect`). + /// + /// These magic(5) keywords do not read or compare bytes; they modify + /// how a rule set is traversed. In the current phase they are parsed + /// into the AST and preserved through codegen, but the evaluator + /// treats them as silent no-ops. See [`MetaType`] for the individual + /// variants and their intended semantics. + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::{MetaType, TypeKind}; + /// let default_rule = TypeKind::Meta(MetaType::Default); + /// assert_eq!(default_rule, TypeKind::Meta(MetaType::Default)); + /// ``` + Meta(MetaType), } /// Regex modifier flags parsed from the `/[cs]` suffix on a `regex` rule. @@ -554,7 +663,8 @@ impl TypeKind { Self::String { .. } | Self::PString { .. } | Self::Regex { .. } - | Self::Search { .. } => None, + | Self::Search { .. 
} + | Self::Meta(_) => None, } } } @@ -1805,4 +1915,83 @@ mod tests { assert_eq!(rule.strength_modifier, None); } + + // MetaType tests + #[test] + fn test_meta_type_variants_debug_clone_eq() { + let cases = [ + MetaType::Default, + MetaType::Clear, + MetaType::Indirect, + MetaType::Offset, + MetaType::Name("part2".to_string()), + MetaType::Use("part2".to_string()), + ]; + + for (i, variant) in cases.iter().enumerate() { + // Debug formatting is non-empty + let debug_str = format!("{variant:?}"); + assert!( + !debug_str.is_empty(), + "Debug format must be non-empty for variant at index {i}" + ); + + // Clone round-trip preserves equality + let cloned = variant.clone(); + assert_eq!( + variant, &cloned, + "Clone must preserve equality for variant at index {i}" + ); + + // Distinct variants are not equal + for (j, other) in cases.iter().enumerate() { + if i == j { + assert_eq!(variant, other); + } else { + assert_ne!( + variant, other, + "Variants at indices {i} and {j} must differ" + ); + } + } + } + } + + #[test] + fn test_meta_type_serde_roundtrip() { + let cases = [ + MetaType::Default, + MetaType::Clear, + MetaType::Indirect, + MetaType::Offset, + MetaType::Name("foo".to_string()), + MetaType::Use("bar".to_string()), + ]; + + for variant in cases { + let json = serde_json::to_string(&variant).expect("serialize MetaType"); + let deserialized: MetaType = serde_json::from_str(&json).expect("deserialize MetaType"); + assert_eq!(variant, deserialized); + } + } + + #[test] + fn test_type_kind_meta_bit_width_is_none() { + let cases = [ + MetaType::Default, + MetaType::Clear, + MetaType::Indirect, + MetaType::Offset, + MetaType::Name("x".to_string()), + MetaType::Use("x".to_string()), + ]; + for meta in cases { + let kind = TypeKind::Meta(meta); + assert_eq!( + kind.bit_width(), + None, + "TypeKind::Meta must have no bit width: {kind:?}" + ); + } + } } diff --git a/src/parser/codegen.rs b/src/parser/codegen.rs index 3a5a9e3..963c978 100644 --- a/src/parser/codegen.rs 
+++ b/src/parser/codegen.rs @@ -12,8 +12,8 @@ //! binary as built-in rules. use super::ast::{ - Endianness, MagicRule, OffsetSpec, Operator, PStringLengthWidth, StrengthModifier, TypeKind, - Value, + Endianness, MagicRule, MetaType, OffsetSpec, Operator, PStringLengthWidth, StrengthModifier, + TypeKind, Value, }; const INDENT_WIDTH: usize = 4; @@ -29,7 +29,7 @@ pub fn generate_builtin_rules(rules: &[MagicRule]) -> String { push_line(&mut output, "#[allow(unused_imports)]"); push_line( &mut output, - "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier, PStringLengthWidth};", + "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier, PStringLengthWidth, MetaType};", ); push_line(&mut output, "use std::sync::LazyLock;"); push_line(&mut output, ""); @@ -269,6 +269,20 @@ pub fn serialize_type_kind(typ: &TypeKind) -> String { "TypeKind::Search {{ range: ::std::num::NonZeroUsize::new({}).unwrap_or(::std::num::NonZeroUsize::MIN) }}", range.get() ), + TypeKind::Meta(meta) => match meta { + MetaType::Default => "TypeKind::Meta(MetaType::Default)".to_string(), + MetaType::Clear => "TypeKind::Meta(MetaType::Clear)".to_string(), + MetaType::Indirect => "TypeKind::Meta(MetaType::Indirect)".to_string(), + MetaType::Offset => "TypeKind::Meta(MetaType::Offset)".to_string(), + MetaType::Name(id) => format!( + "TypeKind::Meta(MetaType::Name(String::from({})))", + format_string_literal(id) + ), + MetaType::Use(id) => format!( + "TypeKind::Meta(MetaType::Use(String::from({})))", + format_string_literal(id) + ), + }, } } @@ -504,4 +518,36 @@ mod tests { "escaped newline missing from serialized message:\n{generated}" ); } + + /// Security regression test for `MetaType::Name` / `MetaType::Use`: + /// the identifier is user-controlled (from the magic file) and must + /// be escaped the same way as the message field. 
A malicious + /// identifier containing `"`, `panic!`, or other Rust tokens must + /// not escape the string literal and land as bare code in the + /// generated `builtin_rules.rs`. + #[test] + fn test_serialize_meta_name_escapes_injection() { + let malicious = r#""; panic!("pwned-from-meta"); let _ = ""#; + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Name(malicious.to_string())), + op: Operator::AnyValue, + value: Value::Uint(0), + message: "meta rule".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let generated = serialize_magic_rule(&rule, 0); + + assert!( + !generated.contains(r#"panic!("pwned-from-meta")"#), + "injected Rust tokens leaked through MetaType::Name identifier:\n{generated}" + ); + assert!( + generated.contains(r#"\""#), + "escaped quote missing from serialized MetaType::Name identifier:\n{generated}" + ); + } } diff --git a/src/parser/grammar/mod.rs b/src/parser/grammar/mod.rs index 4dd2d54..a998873 100644 --- a/src/parser/grammar/mod.rs +++ b/src/parser/grammar/mod.rs @@ -18,7 +18,7 @@ use nom::{ }; use crate::parser::ast::{ - Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value, + Endianness, MagicRule, MetaType, OffsetSpec, Operator, StrengthModifier, TypeKind, Value, }; mod numbers; @@ -195,6 +195,17 @@ pub fn parse_offset(input: &str) -> IResult<&str, OffsetSpec> { let (input, spec) = parse_indirect_offset(input)?; let (input, _) = multispace0(input)?; Ok((input, spec)) + } else if let Some(rest) = input.strip_prefix('&') { + // Relative offset: `&N`, `&+N`, or `&-N`. `parse_number` handles the + // bare and `-`-prefixed cases natively; `+` is consumed manually + // (see the indirect-offset adjustment parser for the same pattern). + let (rest, value) = if let Some(after_plus) = rest.strip_prefix('+') { + parse_number(after_plus)? + } else { + parse_number(rest)? 
+ }; + let (rest, _) = multispace0(rest)?; + Ok((rest, OffsetSpec::Relative(value))) } else { let (input, offset_value) = parse_number(input)?; let (input, _) = multispace0(input)?; @@ -263,14 +274,11 @@ pub fn parse_operator(input: &str) -> IResult<&str, Operator> { (Operator::Equal, 1) } } - Some(b'!') => { - // Only "!=" is valid; bare "!" is an error. - if bytes.get(1).copied() == Some(b'=') { - (Operator::NotEqual, 2) - } else { - return Err(err()); - } - } + // Only "!=" is valid; bare "!" is an error. Express this as a + // match arm with a guard so clippy's `collapsible_match` lint is + // satisfied -- a guard-false fallthrough lands on the final `_` + // arm below, which returns the same parse error. + Some(b'!') if bytes.get(1).copied() == Some(b'=') => (Operator::NotEqual, 2), Some(b'<') => { // "<=", "<>", or bare "<" match bytes.get(1).copied() { @@ -327,6 +335,74 @@ pub fn parse_operator(input: &str) -> IResult<&str, Operator> { Ok((remaining, op)) } +/// Parse the identifier operand of a `name` / `use` meta-type directive. +/// +/// Called from [`parse_type_and_operator`] when the leading keyword is +/// `name` or `use`. Enforces that the keyword is followed by whitespace, +/// an identifier matching `[A-Za-z0-9_-]+`, and no further non-whitespace +/// content on the line. Malformed identifiers such as `part2=foo` +/// (operator-adjacent continuation) or `part 2` (split identifier) are +/// rejected as parse errors rather than silently consumed as a message. +fn parse_name_or_use_meta<'a>( + type_name: &str, + input: &'a str, +) -> IResult<&'a str, (TypeKind, Option)> { + use nom::character::complete::space1; + + // Require at least one whitespace character between the keyword and + // the identifier. `space1` rejects an empty gap, which enforces + // "bare `name` / `use` with no identifier" as a parse error. 
+ let (input, _) = space1(input)?; + let (after_id, id) = + take_while(|c: char| c.is_alphanumeric() || c == '_' || c == '-').parse(input)?; + if id.is_empty() { + return Err(nom::Err::Error(NomError::new( + after_id, + nom::error::ErrorKind::AlphaNumeric, + ))); + } + + // The character immediately following the identifier must be + // whitespace or end-of-input. Anything else (e.g. `=`, `!`, `<`, + // `>`, `&`, `^`, `~`, `|`, punctuation) means `take_while` truncated + // a malformed identifier such as `part2=foo`: reject instead of + // silently treating the leftover text as a message. + if let Some(next_char) = after_id.chars().next() + && !matches!(next_char, ' ' | '\t' | '\n' | '\r') + { + return Err(nom::Err::Error(NomError::new( + after_id, + nom::error::ErrorKind::Alpha, + ))); + } + + // Consume horizontal whitespace after the identifier; the remaining + // text on this line must then be empty or terminated by a newline. + // `parse_text_magic_file` splits input into lines before parsing, + // so "empty" means no trailing content at all. Anything else + // (like `part 2`) is a split identifier and must fail. + let mut tail = after_id; + while let Some(rest) = tail.strip_prefix(' ').or_else(|| tail.strip_prefix('\t')) { + tail = rest; + } + if let Some(next_char) = tail.chars().next() + && !matches!(next_char, '\n' | '\r') + { + return Err(nom::Err::Error(NomError::new( + tail, + nom::error::ErrorKind::AlphaNumeric, + ))); + } + + let meta = if type_name == "name" { + MetaType::Name(id.to_string()) + } else { + MetaType::Use(id.to_string()) + }; + let (input, _) = multispace0(tail)?; + Ok((input, (TypeKind::Meta(meta), None))) +} + /// Parse a type specification with an optional attached bitwise-AND mask operator /// (e.g., `lelong&0xf0000000`). 
/// @@ -357,6 +433,15 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option bool { /// assert_eq!(rule.message, "32-bit"); /// ``` /// +/// Consume a leading `x` (`AnyValue`) operator with surrounding whitespace, +/// if present. Used by the Meta-type short-circuit so that +/// `>>&0 offset x at_offset %lld` does not emit `x\tat_offset %lld` as +/// the message. A bare `x` with no following whitespace (e.g. `xylophone`) +/// is left untouched -- we require the `x` to be a standalone token. +fn strip_optional_x_operator(input: &str) -> &str { + let trimmed = input.trim_start_matches([' ', '\t']); + if let Some(rest) = trimmed.strip_prefix('x') { + // Require whitespace or end-of-line after `x` so we don't eat + // the first character of a message that happens to start with x. + if rest.is_empty() || rest.starts_with([' ', '\t', '\n', '\r']) { + return rest.trim_start_matches([' ', '\t']); + } + } + input +} + /// # Errors /// /// Returns a nom parsing error if: @@ -692,13 +794,62 @@ pub fn parse_magic_rule(input: &str) -> IResult<&str, MagicRule> { // Parse the type and any attached operator let (input, (typ, attached_op)) = parse_type_and_operator(input)?; + // Meta-type directives (default, clear, name, use, indirect, offset) + // conceptually have no operator/value operand, but magic(5) source + // files (including GNU `file`'s own `searchbug.magic`) often write + // them with an `x` (AnyValue) placeholder between the type and the + // message, e.g. `>>&0 offset x at_offset %lld`. Consume an optional + // leading `x` token here so it does not leak into the rendered + // message. + // + // `name`/`use` are handled earlier in parse_type_and_operator and + // already consumed their identifier operand, so the `x` stripping + // is a no-op for them. 
+ if matches!(typ, TypeKind::Meta(_)) { + let input = strip_optional_x_operator(input); + let (input, message) = if input.trim().is_empty() { + (input, String::new()) + } else { + parse_message(input)? + }; + let rule = MagicRule { + offset, + typ, + op: Operator::AnyValue, + value: Value::Uint(0), + message, + children: vec![], + level, + strength_modifier: None, + }; + return Ok((input, rule)); + } + // Try to parse a separate operator (optional - use attached operator if present) let (input, separate_op) = opt(parse_operator).parse(input)?; let op = attached_op.or(separate_op).unwrap_or(Operator::Equal); - // For AnyValue (`x`), no operand is needed -- treat remaining text as message + // For AnyValue (`x`), no operand is needed -- treat remaining text as message. + // For string-family types, fall back to a bare (unquoted) single-token + // literal if the strict `parse_value` alternatives all fail. magic(5) + // syntax permits writing `string TEST` or `search/12 ABC` without + // surrounding quotes, and this fallback supports that form without + // relaxing value parsing for non-string types (where `xyz` must + // still be rejected -- see `test_parse_value_invalid_input`). + let is_string_family_type = matches!( + typ, + TypeKind::String { .. } + | TypeKind::PString { .. } + | TypeKind::Regex { .. } + | TypeKind::Search { .. } + ); let (input, value) = if op == Operator::AnyValue { (input, Value::Uint(0)) + } else if is_string_family_type { + match parse_value(input) { + Ok(ok) => ok, + Err(orig_err) => parse_bare_string_value(input).map_err(|_| orig_err)?, + } } else { parse_value(input)? }; @@ -724,6 +875,30 @@ pub fn parse_magic_rule(input: &str) -> IResult<&str, MagicRule> { Ok((input, rule)) } +/// Parse a bare (unquoted) single-token string literal as a `Value::String`. +/// +/// Used only as a fallback for string-family types (`string`, `pstring`, +/// `regex`, `search`) when the strict [`parse_value`] alternatives all +/// fail. 
Consumes leading whitespace, then reads a run of non-whitespace +/// characters as the literal value. This supports the magic(5) syntax +/// `string TEST` where the value is not surrounded by quotes. +/// +/// # Errors +/// Returns a nom parsing error if the input contains no non-whitespace +/// token (e.g. it is empty or consists entirely of whitespace). +fn parse_bare_string_value(input: &str) -> IResult<&str, Value> { + let (input, _) = multispace0(input)?; + let (input, token) = + take_while(|c: char| !c.is_whitespace() && c != '\n' && c != '\r').parse(input)?; + if token.is_empty() { + return Err(nom::Err::Error(NomError::new( + input, + nom::error::ErrorKind::TakeWhile1, + ))); + } + Ok((input, Value::String(token.to_string()))) +} + /// Parse a comment line (starts with #) /// /// Comments in magic files start with '#' and continue to the end of the line. diff --git a/src/parser/grammar/tests/mod.rs b/src/parser/grammar/tests/mod.rs index 368a63d..bdc2040 100644 --- a/src/parser/grammar/tests/mod.rs +++ b/src/parser/grammar/tests/mod.rs @@ -5,6 +5,7 @@ mod indirect_offset; use super::*; use crate::parser::ast::Endianness; +use crate::parser::ast::MetaType; use crate::parser::ast::PStringLengthWidth; /// Helper function to test parsing with various whitespace patterns @@ -283,6 +284,30 @@ fn test_parse_offset_boundary_values() { ); } +#[test] +fn test_parse_offset_relative() { + // `&N` -- relative offset from the GNU `file` previous-match anchor. + // Bare (`&0`), explicit-positive (`&+4`), and negative (`&-4`) forms + // all decode to `OffsetSpec::Relative(N)`. 
+ assert_eq!(parse_offset("&0"), Ok(("", OffsetSpec::Relative(0)))); + assert_eq!(parse_offset("&4"), Ok(("", OffsetSpec::Relative(4)))); + assert_eq!(parse_offset("&+4"), Ok(("", OffsetSpec::Relative(4)))); + assert_eq!(parse_offset("&-4"), Ok(("", OffsetSpec::Relative(-4)))); + assert_eq!(parse_offset("&0x10"), Ok(("", OffsetSpec::Relative(16)))); + assert_eq!(parse_offset("&-0x10"), Ok(("", OffsetSpec::Relative(-16)))); + + // Whitespace handling around the relative offset. + assert_eq!(parse_offset(" &0 "), Ok(("", OffsetSpec::Relative(0)))); + assert_eq!( + parse_offset("&0 ubyte"), + Ok(("ubyte", OffsetSpec::Relative(0))) + ); + + // Bare `&` with no number must fail. + assert!(parse_offset("&").is_err(), "bare `&` must fail"); + assert!(parse_offset("& ").is_err(), "`&` with only space must fail"); +} + #[test] fn test_parse_rule_offset_indirect_child() { // Level 1 child with indirect offset: >(0x3c.l) @@ -2476,3 +2501,179 @@ fn test_parse_magic_rule_regex_and_search() { ); assert_eq!(rule.message, "version line"); } + +#[test] +fn test_parse_magic_rule_meta_types() { + // Table: (input, expected_level, expected_typ, expected_message) + let cases: &[(&str, u32, TypeKind, &str)] = &[ + // `x` is the AnyValue operator; for meta types the parser strips + // it (with surrounding whitespace) before taking the rest of the + // line as the message. See `strip_optional_x_operator` in + // `parser/grammar/mod.rs`. Without that strip, rules like + // `>>&0 offset x at_offset %lld` would render as + // `x\tat_offset 11` and diverge from GNU `file` output. + ( + "0 default x msg", + 0, + TypeKind::Meta(MetaType::Default), + "msg", + ), + // And a message without a leading `x` passes through unchanged. 
+ ("0 default msg", 0, TypeKind::Meta(MetaType::Default), "msg"), + ("0 clear", 0, TypeKind::Meta(MetaType::Clear), ""), + ( + "0 offset x pos=%lld", + 0, + TypeKind::Meta(MetaType::Offset), + "pos=%lld", + ), + ("0 indirect x", 0, TypeKind::Meta(MetaType::Indirect), ""), + ( + "0 name part2", + 0, + TypeKind::Meta(MetaType::Name("part2".to_string())), + "", + ), + ( + "0 use part2", + 0, + TypeKind::Meta(MetaType::Use("part2".to_string())), + "", + ), + ("0 indirect", 0, TypeKind::Meta(MetaType::Indirect), ""), + ( + ">0 use part2", + 1, + TypeKind::Meta(MetaType::Use("part2".to_string())), + "", + ), + ]; + + for (input, expected_level, expected_typ, expected_message) in cases { + let (remaining, rule) = + parse_magic_rule(input).unwrap_or_else(|e| panic!("parse failed for {input:?}: {e:?}")); + assert_eq!(remaining, "", "remaining mismatch for {input:?}"); + assert_eq!(rule.level, *expected_level, "level mismatch for {input:?}"); + assert_eq!(rule.typ, *expected_typ, "typ mismatch for {input:?}"); + assert_eq!( + rule.message, *expected_message, + "message mismatch for {input:?}" + ); + } + + // Bare `name` / `use` with no identifier must be a parse error. + assert!( + parse_magic_rule("0 name").is_err(), + "bare `name` with no identifier must fail" + ); + assert!( + parse_magic_rule("0 use").is_err(), + "bare `use` with no identifier must fail" + ); +} + +#[test] +fn test_parse_magic_rule_meta_name_use_reject_malformed_identifiers() { + // Operator-adjacent continuation must reject the truncated identifier + // (`part2=foo`, `part2!bar`, etc.) rather than silently dropping the + // operator text into the message slot. 
+ let operator_cases = [ + "0 use part2=foo", + "0 use part2!=bar", + "0 use partfoo", + "0 name part&foo", + "0 name part^foo", + "0 name part~foo", + "0 name part|foo", + ]; + for input in operator_cases { + assert!( + parse_magic_rule(input).is_err(), + "operator-adjacent identifier must fail: {input:?}" + ); + } + + // Split identifiers with embedded whitespace (`part 2`) must also fail: + // the phase requires that `name`/`use` identifiers are terminated by + // whitespace followed only by EOL/EOF, with no trailing content. + let split_cases = [ + "0 name part 2", + "0 use part2 extra", + "0 name my id", + "0 use foo bar", + ]; + for input in split_cases { + assert!( + parse_magic_rule(input).is_err(), + "split identifier must fail: {input:?}" + ); + } + + // Sanity check: an identifier followed only by trailing whitespace still parses. + let (_, rule) = parse_magic_rule("0 name part2 ").expect("trailing ws is ok"); + assert_eq!( + rule.typ, + TypeKind::Meta(MetaType::Name("part2".to_string())) + ); + let (_, rule) = parse_magic_rule("0 use part2\t").expect("trailing tab is ok"); + assert_eq!(rule.typ, TypeKind::Meta(MetaType::Use("part2".to_string()))); +} + +#[test] +fn test_parse_text_magic_file_meta_roundtrip() { + // Build a small magic file that uses all five meta-types. The `name` + // block is a level-1 subroutine invoked by the top-level `use`, and + // `indirect` / `default` / `clear` appear as sibling directives to + // exercise the parse path for each variant. + // + // NOTE: all rules use the SAME top-level indentation so + // build_rule_hierarchy treats them as siblings. Child rules would + // require a preceding parent match, which meta-types do not produce. 
+ let magic = "\ +0 name subroutine +0 use subroutine +0 default default-msg +0 clear +0 indirect +"; + let parsed = + crate::parser::parse_text_magic_file(magic).expect("meta-type magic file should parse"); + // Only the `name` declaration is hoisted into the name table; the + // other four meta-types remain as top-level rules in document order. + let rules = parsed.rules; + assert_eq!( + rules.len(), + 4, + "expected 4 top-level rules after name hoist, got {rules:?}" + ); + assert!( + parsed.name_table.get("subroutine").is_some(), + "name subroutine should be extracted into the name table" + ); + + assert_eq!( + rules[0].typ, + TypeKind::Meta(MetaType::Use("subroutine".to_string())) + ); + assert_eq!(rules[1].typ, TypeKind::Meta(MetaType::Default)); + assert_eq!(rules[2].typ, TypeKind::Meta(MetaType::Clear)); + assert_eq!(rules[3].typ, TypeKind::Meta(MetaType::Indirect)); +} + +#[test] +fn test_parse_text_magic_file_searchbug_fixture() { + // Regression: the canonical GNU `file` testfile `searchbug.magic` + // exercises the `offset` keyword, `&N` relative-offset syntax, the + // `name`/`use` subroutine machinery, and `search/N` -- every piece of + // this phase's acceptance surface in a single fixture. Previously the + // parser rejected the file on the unknown `offset` type keyword. 
+ let magic = std::fs::read_to_string("third_party/tests/searchbug.magic") + .expect("searchbug.magic fixture must exist"); + let parsed = crate::parser::parse_text_magic_file(&magic) + .expect("searchbug.magic must parse end-to-end"); + assert!( + !parsed.rules.is_empty(), + "searchbug.magic must produce at least one top-level rule" + ); +} diff --git a/src/parser/loader.rs b/src/parser/loader.rs index 486dcb1..45e4216 100644 --- a/src/parser/loader.rs +++ b/src/parser/loader.rs @@ -9,7 +9,8 @@ use log::warn; use crate::error::ParseError; -use crate::parser::ast::MagicRule; +use crate::parser::ParsedMagic; +use crate::parser::name_table::NameTable; use std::path::{Path, PathBuf}; use super::format::{MagicFileFormat, detect_format}; @@ -89,8 +90,8 @@ fn read_magic_file_bounded(path: &Path) -> Result { /// use libmagic_rs::parser::load_magic_directory; /// use std::path::Path; /// -/// let rules = load_magic_directory(Path::new("/usr/share/file/magic.d"))?; -/// println!("Loaded {} rules from directory", rules.len()); +/// let parsed = load_magic_directory(Path::new("/usr/share/file/magic.d"))?; +/// println!("Loaded {} rules from directory", parsed.rules.len()); /// # Ok::<(), libmagic_rs::ParseError>(()) /// ``` /// @@ -106,7 +107,7 @@ fn read_magic_file_bounded(path: &Path) -> Result { /// // ├── 02-archive /// // └── 03-text /// -/// let rules = load_magic_directory(Path::new("./magic.d"))?; +/// let parsed = load_magic_directory(Path::new("./magic.d"))?; /// // Rules from all three files are merged in alphabetical order /// # Ok::<(), libmagic_rs::ParseError>(()) /// ``` @@ -122,7 +123,7 @@ fn read_magic_file_bounded(path: &Path) -> Result { /// # Panics /// /// This function does not panic under normal operation. 
-pub fn load_magic_directory(dir_path: &Path) -> Result, ParseError> { +pub fn load_magic_directory(dir_path: &Path) -> Result { use std::fs; // Read directory entries @@ -164,9 +165,11 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro // Sort by filename for deterministic ordering file_paths.sort_by_key(|path| path.file_name().map(std::ffi::OsStr::to_os_string)); - // Accumulate rules from all files + // Accumulate rules and name tables from all files let mut all_rules = Vec::new(); + let mut merged_table = NameTable::empty(); let mut parse_failures: Vec<(PathBuf, ParseError)> = Vec::new(); + let mut any_success = false; let file_count = file_paths.len(); for path in file_paths { @@ -184,9 +187,10 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro // Parse the file match super::parse_text_magic_file(&contents) { - Ok(rules) => { - // Successfully parsed - merge rules - all_rules.extend(rules); + Ok(parsed) => { + any_success = true; + all_rules.extend(parsed.rules); + merged_table.merge(parsed.name_table); } Err(e) => { // Track parse failures for reporting @@ -195,8 +199,12 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro } } - // If all files failed to parse, return an error - if all_rules.is_empty() && !parse_failures.is_empty() { + // If all files failed to parse, return an error. + // Use `any_success` rather than `all_rules.is_empty()` so that directories + // whose files parse successfully but contain only meta-type definitions + // (e.g. a directory of pure `name`-subroutine files) are not mistaken for + // complete failure. 
+ if !any_success && !parse_failures.is_empty() { use std::fmt::Write; let failure_details: Vec = parse_failures @@ -222,7 +230,10 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro warn!("Failed to parse '{}': {}", path.display(), e); } - Ok(all_rules) + Ok(ParsedMagic { + rules: all_rules, + name_table: merged_table, + }) } /// Loads magic rules from a file or directory, automatically detecting the format. @@ -267,8 +278,8 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro /// use libmagic_rs::parser::load_magic_file; /// use std::path::Path; /// -/// let rules = load_magic_file(Path::new("/usr/share/misc/magic"))?; -/// println!("Loaded {} magic rules", rules.len()); +/// let parsed = load_magic_file(Path::new("/usr/share/misc/magic"))?; +/// println!("Loaded {} magic rules", parsed.rules.len()); /// # Ok::<(), libmagic_rs::ParseError>(()) /// ``` /// @@ -278,8 +289,8 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro /// use libmagic_rs::parser::load_magic_file; /// use std::path::Path; /// -/// let rules = load_magic_file(Path::new("/usr/share/misc/magic.d"))?; -/// println!("Loaded {} rules from directory", rules.len()); +/// let parsed = load_magic_file(Path::new("/usr/share/misc/magic.d"))?; +/// println!("Loaded {} rules from directory", parsed.rules.len()); /// # Ok::<(), libmagic_rs::ParseError>(()) /// ``` /// @@ -290,7 +301,7 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro /// use std::path::Path; /// /// match load_magic_file(Path::new("/usr/share/misc/magic.mgc")) { -/// Ok(rules) => println!("Loaded {} rules", rules.len()), +/// Ok(parsed) => println!("Loaded {} rules", parsed.rules.len()), /// Err(e) => { /// eprintln!("Error loading magic file: {}", e); /// eprintln!("Hint: Use --use-builtin for binary files"); @@ -318,7 +329,7 @@ pub fn load_magic_directory(dir_path: &Path) -> Result, ParseErro /// - [`detect_format()`] - Format detection logic /// - 
[`super::parse_text_magic_file()`] - Text file parser /// - [`load_magic_directory()`] - Directory loader -pub fn load_magic_file(path: &Path) -> Result, ParseError> { +pub fn load_magic_file(path: &Path) -> Result { // Detect the magic file format let format = detect_format(path)?; @@ -385,10 +396,10 @@ mod tests { fs::write(&invalid_path, "this is invalid syntax\n").expect("Failed to write invalid file"); // Should succeed, loading only the valid file - let rules = load_magic_directory(temp_dir.path()).expect("Should load valid files"); + let parsed = load_magic_directory(temp_dir.path()).expect("Should load valid files"); - assert_eq!(rules.len(), 1, "Should load only valid file"); - assert_eq!(rules[0].message, "valid"); + assert_eq!(parsed.rules.len(), 1, "Should load only valid file"); + assert_eq!(parsed.rules[0].message, "valid"); } #[test] @@ -408,9 +419,13 @@ mod tests { .expect("Failed to write comments file"); // Should succeed with no rules - let rules = load_magic_directory(temp_dir.path()).expect("Should handle empty files"); + let parsed = load_magic_directory(temp_dir.path()).expect("Should handle empty files"); - assert_eq!(rules.len(), 0, "Empty files should contribute no rules"); + assert_eq!( + parsed.rules.len(), + 0, + "Empty files should contribute no rules" + ); } #[test] @@ -459,16 +474,16 @@ mod tests { fs::write(temp_dir.path().join("noext"), "0 string \\x05\\x06 noext\n") .expect("Failed to write no-ext file"); - let rules = load_magic_directory(temp_dir.path()) + let parsed = load_magic_directory(temp_dir.path()) .expect("Should load all files regardless of extension"); assert_eq!( - rules.len(), + parsed.rules.len(), 3, "Should process all files regardless of extension" ); - let messages: Vec<&str> = rules.iter().map(|r| r.message.as_str()).collect(); + let messages: Vec<&str> = parsed.rules.iter().map(|r| r.message.as_str()).collect(); assert!(messages.contains(&"magic")); assert!(messages.contains(&"txt")); 
assert!(messages.contains(&"noext")); @@ -498,13 +513,13 @@ mod tests { ) .expect("Failed to write second file"); - let rules = load_magic_directory(temp_dir.path()).expect("Should load directory in order"); + let parsed = load_magic_directory(temp_dir.path()).expect("Should load directory in order"); - assert_eq!(rules.len(), 3); + assert_eq!(parsed.rules.len(), 3); // Should be sorted alphabetically by filename - assert_eq!(rules[0].message, "first"); - assert_eq!(rules[1].message, "second"); - assert_eq!(rules[2].message, "third"); + assert_eq!(parsed.rules[0].message, "first"); + assert_eq!(parsed.rules[1].message, "second"); + assert_eq!(parsed.rules[2].message, "third"); } // ============================================================ @@ -524,10 +539,10 @@ mod tests { .expect("Failed to write magic file"); // Load using load_magic_file - let rules = load_magic_file(&magic_file).expect("Failed to load text magic file"); + let parsed = load_magic_file(&magic_file).expect("Failed to load text magic file"); - assert_eq!(rules.len(), 1); - assert_eq!(rules[0].message, "ELF executable"); + assert_eq!(parsed.rules.len(), 1); + assert_eq!(parsed.rules[0].message, "ELF executable"); } #[test] @@ -552,11 +567,11 @@ mod tests { .expect("Failed to write zip file"); // Load using load_magic_file - let rules = load_magic_file(&magic_dir).expect("Failed to load directory"); + let parsed = load_magic_file(&magic_dir).expect("Failed to load directory"); - assert_eq!(rules.len(), 2); - assert_eq!(rules[0].message, "ELF executable"); - assert_eq!(rules[1].message, "ZIP archive"); + assert_eq!(parsed.rules.len(), 2); + assert_eq!(parsed.rules[0].message, "ELF executable"); + assert_eq!(parsed.rules[1].message, "ZIP archive"); } #[test] @@ -674,4 +689,32 @@ mod tests { "Error should mention the maximum allowed size, got: {err_msg}" ); } + + #[test] + fn test_load_directory_merges_name_tables() { + use std::fs; + use tempfile::TempDir; + + let temp_dir = 
TempDir::new().expect("Failed to create temp dir"); + + // Each file defines a different named subroutine. + fs::write( + temp_dir.path().join("00_first"), + "0 name sub_a\n>0 byte 1 a-body\n", + ) + .expect("Failed to write sub_a file"); + fs::write( + temp_dir.path().join("01_second"), + "0 name sub_b\n>0 byte 2 b-body\n", + ) + .expect("Failed to write sub_b file"); + + let parsed = + load_magic_directory(temp_dir.path()).expect("Should load both name subroutines"); + + // Both `name` rules are hoisted out, so top-level rules list is empty. + assert_eq!(parsed.rules.len(), 0); + assert!(parsed.name_table.get("sub_a").is_some()); + assert!(parsed.name_table.get("sub_b").is_some()); + } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 3b2198a..4fa0ac9 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -142,6 +142,8 @@ mod format; pub(crate) mod grammar; mod hierarchy; mod loader; +#[allow(dead_code)] +pub(crate) mod name_table; pub(crate) mod preprocessing; pub mod types; @@ -158,6 +160,22 @@ pub(crate) use preprocessing::preprocess_lines; use crate::error::ParseError; +/// Result of parsing a text magic file. +/// +/// Contains the top-level rule list with any `name`-declared subroutines +/// hoisted into a separate [`name_table::NameTable`] keyed by identifier. +/// The rule list preserves the original ordering of all non-`Name` top-level +/// rules, so strength-based sorting and evaluation semantics are unchanged +/// for magic files that do not use the `name`/`use` directive pair. +#[derive(Debug)] +pub struct ParsedMagic { + /// Top-level rules after `Name` subroutines have been removed. + pub rules: Vec, + /// Extracted `name` subroutine definitions, consulted by the evaluator + /// when a rule of type `TypeKind::Meta(MetaType::Use(_))` is reached. + pub(crate) name_table: name_table::NameTable, +} + /// Parses a complete magic file from raw text input. 
/// /// This is the main public-facing parser function that orchestrates the complete @@ -170,7 +188,9 @@ use crate::error::ParseError; /// /// # Returns /// -/// `Result, ParseError>` - A vector of root rules with nested children +/// `Result` - A [`ParsedMagic`] value containing +/// the top-level rules (with `name`-declared subroutines hoisted out) and +/// the resulting name table. /// /// # Errors /// @@ -188,14 +208,16 @@ use crate::error::ParseError; /// >4 byte 1 32-bit /// >4 byte 2 64-bit"#; /// -/// let rules = parse_text_magic_file(magic)?; -/// assert_eq!(rules.len(), 1); -/// assert_eq!(rules[0].message, "ELF file"); +/// let parsed = parse_text_magic_file(magic)?; +/// assert_eq!(parsed.rules.len(), 1); +/// assert_eq!(parsed.rules[0].message, "ELF file"); /// # Ok::<(), Box>(()) /// ``` -pub fn parse_text_magic_file(input: &str) -> Result, ParseError> { +pub fn parse_text_magic_file(input: &str) -> Result { let lines = preprocess_lines(input)?; - build_rule_hierarchy(lines) + let rules = build_rule_hierarchy(lines)?; + let (rules, name_table) = name_table::extract_name_table(rules); + Ok(ParsedMagic { rules, name_table }) } #[cfg(test)] @@ -209,7 +231,7 @@ mod unit_tests { #[test] fn test_parse_text_magic_file_single_rule() { let input = "0 string 0 ZIP archive"; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); assert_eq!(rules[0].message, "ZIP archive"); } @@ -221,7 +243,7 @@ mod unit_tests { >4 byte 1 32-bit >4 byte 2 64-bit "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); assert_eq!(rules[0].children.len(), 2); } @@ -233,7 +255,7 @@ mod unit_tests { 0 string 0 ELF >4 byte 1 32-bit "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. 
} = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); assert_eq!(rules[0].children.len(), 1); } @@ -247,14 +269,14 @@ mod unit_tests { 0 byte 2 PDF >5 byte 1 v1 "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 2); } #[test] fn test_parse_text_magic_file_empty_input() { let input = ""; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 0); } @@ -265,7 +287,7 @@ mod unit_tests { # Comment 2 # Comment 3 "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 0); } @@ -278,14 +300,14 @@ mod unit_tests { "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); } #[test] fn test_parse_text_magic_file_with_message_spaces() { let input = "0 string 0 Long message continued here"; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert!(rules[0].message.contains("continued")); } @@ -300,7 +322,7 @@ mod unit_tests { 0 byte 2 Root2 >4 byte 4 Child3 "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 2); assert_eq!(rules[0].children.len(), 2); assert_eq!(rules[0].children[1].children.len(), 1); @@ -325,7 +347,7 @@ mod unit_tests { >5 byte 0x34 version 1.4 >5 byte 0x32 version 2.0 "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. 
} = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 2); assert_eq!(rules[0].message, "ELF executable"); assert!(rules[0].children.len() > 1); @@ -341,7 +363,7 @@ mod unit_tests { !:strength +10 0 string \\x7fELF ELF executable "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Add(10))); } @@ -353,7 +375,7 @@ mod unit_tests { 0 string \\x7fELF ELF executable 0 string \\x50\\x4b ZIP archive "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 2); // Strength should only apply to the immediately following rule assert_eq!( @@ -371,7 +393,7 @@ mod unit_tests { >4 byte 1 32-bit >4 byte 2 64-bit "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); // Strength applies to root rule assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Set(50))); @@ -388,7 +410,7 @@ mod unit_tests { !:strength -5 0 string \\x50\\x4b ZIP archive "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 2); assert_eq!(rules[0].strength_modifier, Some(StrengthModifier::Add(10))); assert_eq!( @@ -415,7 +437,7 @@ mod unit_tests { ]; for (input, expected_modifier) in inputs { - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!( rules[0].strength_modifier, Some(expected_modifier), @@ -432,7 +454,7 @@ mod unit_tests { fn test_continuation_with_indentation() { let input = r">4 byte 1 Message \ continued"; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. 
} = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 1); } @@ -442,7 +464,7 @@ continued"; 0x100 string 0 At 256 0x200 string 0 At 512 "; - let rules = parse_text_magic_file(input).unwrap(); + let ParsedMagic { rules, .. } = parse_text_magic_file(input).unwrap(); assert_eq!(rules.len(), 2); } @@ -508,7 +530,9 @@ continued"; #[cfg(test)] mod output_test { - use crate::parser::{build_rule_hierarchy, parse_text_magic_file, preprocess_lines}; + use crate::parser::{ + ParsedMagic, build_rule_hierarchy, parse_text_magic_file, preprocess_lines, + }; #[test] fn demo_show_all_parser_outputs() { @@ -544,7 +568,8 @@ mod output_test { // -------------------------------------------------- println!("\n================ PARSED MAGIC RULES ================\n"); - let rules = parse_text_magic_file(input).expect("parse_text_magic_file failed"); + let ParsedMagic { rules, .. } = + parse_text_magic_file(input).expect("parse_text_magic_file failed"); for (i, rule) in rules.iter().enumerate() { println!("ROOT RULE [{i}]:"); diff --git a/src/parser/name_table.rs b/src/parser/name_table.rs new file mode 100644 index 0000000..2c9e562 --- /dev/null +++ b/src/parser/name_table.rs @@ -0,0 +1,255 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Name table for `MetaType::Name` subroutine extraction. +//! +//! When a magic file declares `0 name ` at the top level, its +//! children form a named subroutine that can be invoked later via +//! `use `. This module extracts those definitions out of the +//! flat rule list at load time, so the evaluator can look them up by name +//! without re-walking the AST. + +use std::collections::HashMap; + +use log::warn; + +use crate::parser::ast::{MagicRule, MetaType, TypeKind}; + +/// A lookup table mapping subroutine names to their child rule lists. +/// +/// Built by [`extract_name_table`] from a parsed magic file's top-level +/// rule list. 
The evaluator consults this table when it encounters a +/// `TypeKind::Meta(MetaType::Use(name))` rule to retrieve the rules that +/// should be evaluated as if inlined at the `use` site. +#[derive(Debug, Default, Clone)] +pub(crate) struct NameTable { + inner: HashMap>, +} + +impl NameTable { + /// Create an empty name table. + #[must_use] + pub(crate) fn empty() -> Self { + Self { + inner: HashMap::new(), + } + } + + /// Look up a subroutine's rule list by name. + #[must_use] + pub(crate) fn get(&self, name: &str) -> Option<&Vec> { + self.inner.get(name) + } + + /// Mutable access to the underlying map. + /// + /// Used by `MagicDatabase` after load to sort each subroutine's rules + /// by strength without allocating a new map. + pub(crate) fn values_mut( + &mut self, + ) -> std::collections::hash_map::ValuesMut<'_, String, Vec> { + self.inner.values_mut() + } + + /// Merge another name table into this one. + /// + /// Used when loading a magic directory: each file's extracted name + /// table is merged into the accumulating table. On key collisions, + /// the first-seen definition is kept and a warning is emitted. + pub(crate) fn merge(&mut self, other: Self) { + for (name, rules) in other.inner { + if self.inner.contains_key(&name) { + warn!("duplicate name definition '{name}' across magic files; keeping first"); + continue; + } + self.inner.insert(name, rules); + } + } +} + +/// Partition a top-level rule list, hoisting `name` rules into a +/// [`NameTable`] and returning the remaining non-`Name` rules. +/// +/// - Top-level `Name` rules are removed; their `children` become the +/// subroutine body in the returned table. On duplicate names, the +/// first definition wins and a warning is logged. +/// - `Name` rules appearing below the top level (as a child of another +/// rule) are dropped with a warning; they are not well-defined in +/// magic(5) and would confuse the evaluator's lookup path. 
+/// - Non-`Name` rules are returned unchanged, with their own children +/// also scrubbed of any nested `Name` rules. +pub(crate) fn extract_name_table(rules: Vec) -> (Vec, NameTable) { + let mut table = NameTable::empty(); + let mut kept = Vec::with_capacity(rules.len()); + + for rule in rules { + if let TypeKind::Meta(MetaType::Name(ref name)) = rule.typ { + if table.inner.contains_key(name) { + warn!("duplicate name definition '{name}'; keeping first"); + continue; + } + // Recursively scrub nested Name rules from the subroutine's + // children (shouldn't appear in practice, but be defensive). + let children = scrub_nested_names(rule.children, rule.level); + table.inner.insert(name.clone(), children); + } else { + let scrubbed_children = scrub_nested_names(rule.children, rule.level + 1); + kept.push(MagicRule { + children: scrubbed_children, + ..rule + }); + } + } + + (kept, table) +} + +/// Walk a child list and drop any `Name` rules found below the top level. +fn scrub_nested_names(children: Vec, parent_level: u32) -> Vec { + let mut kept = Vec::with_capacity(children.len()); + for child in children { + if let TypeKind::Meta(MetaType::Name(ref name)) = child.typ { + warn!("name directive '{name}' at level {parent_level} is not top-level; skipping"); + continue; + } + let scrubbed = scrub_nested_names(child.children, child.level + 1); + kept.push(MagicRule { + children: scrubbed, + ..child + }); + } + kept +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::ast::{OffsetSpec, Operator, Value}; + + fn make_rule(level: u32, typ: TypeKind, message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(0), + typ, + op: Operator::Equal, + value: Value::Uint(0), + message: message.to_string(), + children, + level, + strength_modifier: None, + } + } + + #[test] + fn test_extract_empty() { + let (rules, table) = extract_name_table(vec![]); + assert!(rules.is_empty()); + assert!(table.get("anything").is_none()); + } + + 
#[test] + fn test_extract_single_name_rule() { + let child = make_rule(1, TypeKind::Byte { signed: false }, "child", vec![]); + let name_rule = make_rule( + 0, + TypeKind::Meta(MetaType::Name("sub".to_string())), + "", + vec![child], + ); + let (rules, table) = extract_name_table(vec![name_rule]); + assert!(rules.is_empty()); + let subroutine = table.get("sub").expect("sub subroutine"); + assert_eq!(subroutine.len(), 1); + assert_eq!(subroutine[0].message, "child"); + } + + #[test] + fn test_extract_preserves_non_name_rules() { + let byte_rule = make_rule(0, TypeKind::Byte { signed: false }, "hello", vec![]); + let (rules, table) = extract_name_table(vec![byte_rule]); + assert_eq!(rules.len(), 1); + assert_eq!(rules[0].message, "hello"); + assert!(table.get("anything").is_none()); + } + + #[test] + fn test_extract_duplicate_name_keeps_first() { + let first = make_rule( + 0, + TypeKind::Meta(MetaType::Name("dup".to_string())), + "first", + vec![make_rule( + 1, + TypeKind::Byte { signed: false }, + "first-child", + vec![], + )], + ); + let second = make_rule( + 0, + TypeKind::Meta(MetaType::Name("dup".to_string())), + "second", + vec![make_rule( + 1, + TypeKind::Byte { signed: false }, + "second-child", + vec![], + )], + ); + let (_, table) = extract_name_table(vec![first, second]); + let subroutine = table.get("dup").expect("first dup kept"); + assert_eq!(subroutine.len(), 1); + assert_eq!(subroutine[0].message, "first-child"); + } + + #[test] + fn test_merge_combines_tables() { + let sub_a = make_rule( + 0, + TypeKind::Meta(MetaType::Name("a".to_string())), + "", + vec![], + ); + let sub_b = make_rule( + 0, + TypeKind::Meta(MetaType::Name("b".to_string())), + "", + vec![], + ); + let (_, mut table_a) = extract_name_table(vec![sub_a]); + let (_, table_b) = extract_name_table(vec![sub_b]); + table_a.merge(table_b); + assert!(table_a.get("a").is_some()); + assert!(table_a.get("b").is_some()); + } + + #[test] + fn test_merge_duplicate_keeps_existing() { + let first = 
make_rule( + 0, + TypeKind::Meta(MetaType::Name("dup".to_string())), + "", + vec![make_rule( + 1, + TypeKind::Byte { signed: false }, + "first-child", + vec![], + )], + ); + let second = make_rule( + 0, + TypeKind::Meta(MetaType::Name("dup".to_string())), + "", + vec![make_rule( + 1, + TypeKind::Byte { signed: false }, + "second-child", + vec![], + )], + ); + let (_, mut table_a) = extract_name_table(vec![first]); + let (_, table_b) = extract_name_table(vec![second]); + table_a.merge(table_b); + let subroutine = table_a.get("dup").expect("dup kept from first table"); + assert_eq!(subroutine[0].message, "first-child"); + } +} diff --git a/src/parser/types.rs b/src/parser/types.rs index 5683b96..3040b94 100644 --- a/src/parser/types.rs +++ b/src/parser/types.rs @@ -11,7 +11,7 @@ use nom::{IResult, Parser, branch::alt, bytes::complete::tag}; -use crate::parser::ast::{Endianness, PStringLengthWidth, TypeKind}; +use crate::parser::ast::{Endianness, MetaType, PStringLengthWidth, TypeKind}; /// Error returned by [`type_keyword_to_kind`] when the supplied keyword is /// not a recognized magic type keyword. @@ -123,6 +123,22 @@ pub fn parse_type_keyword(input: &str) -> IResult<&str, &str> { )), // String types (and regex/search, which share the string-type family) alt((tag("pstring"), tag("search"), tag("regex"), tag("string"))), + // Meta / control-flow directives. `indirect` is listed first so the + // longest match is tried before `default`, `clear`, `name`, `use`; + // none of these collide with other supported keywords. + // + // `offset` is recognized here so the parser can accept magic files + // that use it (e.g. `searchbug.magic`). In this phase it is + // evaluated as a silent no-op via `TypeKind::Meta(MetaType::Offset)`; + // full offset-reporting semantics are deferred. 
+ alt(( + tag("indirect"), + tag("default"), + tag("offset"), + tag("clear"), + tag("name"), + tag("use"), + )), )) .parse(input) } @@ -195,10 +211,26 @@ pub fn type_keyword_to_kind(type_name: &str) -> Result, Unknown // here makes the "keyword alone isn't enough" invariant // type-enforced instead of relying on a placeholder that the // grammar layer is expected to overwrite. - if matches!(type_name, "regex" | "search") { + // + // `name` and `use` also return `Ok(None)` because their identifier + // suffix is parsed in the grammar layer, following the same + // "keyword alone isn't enough" pattern. + if matches!(type_name, "regex" | "search" | "name" | "use") { return Ok(None); } + // Meta / control-flow directives with no trailing operand are fully + // specified by the keyword alone. `offset` is included here because + // parser-only support for it lands it in the AST as a silent no-op + // during this phase; full offset-reporting semantics are deferred. + match type_name { + "default" => return Ok(Some(TypeKind::Meta(MetaType::Default))), + "clear" => return Ok(Some(TypeKind::Meta(MetaType::Clear))), + "indirect" => return Ok(Some(TypeKind::Meta(MetaType::Indirect))), + "offset" => return Ok(Some(TypeKind::Meta(MetaType::Offset))), + _ => {} + } + if let Some(kind) = byte_family(type_name) .or_else(|| short_family(type_name)) .or_else(|| long_family(type_name)) @@ -605,7 +637,8 @@ mod tests { "long", "ulong", "lelong", "ulelong", "belong", "ubelong", "quad", "uquad", "lequad", "ulequad", "bequad", "ubequad", "float", "befloat", "lefloat", "double", "bedouble", "ledouble", "date", "ldate", "bedate", "beldate", "ledate", "leldate", "qdate", - "qldate", "beqdate", "beqldate", "leqdate", "leqldate", "pstring", "string", + "qldate", "beqdate", "beqldate", "leqdate", "leqldate", "pstring", "string", "default", + "clear", "indirect", "offset", ]; for keyword in convertible_keywords { let (rest, parsed) = parse_type_keyword(keyword).unwrap(); @@ -615,10 +648,11 @@ mod 
tests { "{keyword} should map to Ok(Some(TypeKind))" ); } - // regex and search are recognized by parse_type_keyword but - // require grammar-layer suffix parsing to construct their - // TypeKind. Verify both sides of this split invariant. - for keyword in ["regex", "search"] { + // regex, search, name, and use are recognized by parse_type_keyword + // but require grammar-layer suffix parsing (flags/count/range or an + // identifier) to construct their TypeKind. Verify both sides of + // this split invariant. + for keyword in ["regex", "search", "name", "use"] { let (rest, parsed) = parse_type_keyword(keyword).unwrap(); assert_eq!(rest, "", "Keyword {keyword} should consume all input"); assert_eq!( diff --git a/tests/compatibility_tests.rs b/tests/compatibility_tests.rs index 2834074..e17448a 100644 --- a/tests/compatibility_tests.rs +++ b/tests/compatibility_tests.rs @@ -364,3 +364,46 @@ fn test_compatibility_files_available() { assert!(!test_files.is_empty(), "No compatibility test files found"); println!("Found {} compatibility test files", test_files.len()); } + +/// Partial-match regression for the canonical GNU `file` `searchbug` fixture. +/// +/// Full byte-for-byte match against `searchbug.result` requires the `offset` +/// pseudo-type (to render `at_offset %lld`) and printf-style format-specifier +/// substitution (to render `0x%02x` as `0x31`/`0x32`); both are tracked in +/// follow-up issues. This test exercises the `name`/`use` subroutine dispatch +/// shipped in v0.5.x by asserting that the description carries the +/// recognizable (unsubstituted) fragments from +/// `third_party/tests/searchbug.result`. 
+#[test] +fn test_searchbug_partial_match() { + let magic_path = Path::new("third_party/tests/searchbug.magic"); + let testfile_path = Path::new("third_party/tests/searchbug.testfile"); + if !magic_path.exists() || !testfile_path.exists() { + println!("Skipping searchbug partial-match test: fixtures not found"); + return; + } + + let db = + MagicDatabase::load_from_file(magic_path).expect("searchbug.magic must load end-to-end"); + let bytes = fs::read(testfile_path).expect("searchbug.testfile fixture must be readable"); + + let result = db + .evaluate_buffer(&bytes) + .expect("evaluate_buffer on searchbug.testfile"); + + assert!( + result.description.starts_with("Testfmt"), + "description should start with \"Testfmt\", got: {}", + result.description + ); + assert!( + result.description.contains("found_ABC"), + "description should contain \"found_ABC\" (subroutine match), got: {}", + result.description + ); + assert!( + result.description.contains("followed_by"), + "description should contain \"followed_by\" (subroutine child rule), got: {}", + result.description + ); +} diff --git a/tests/directory_loading_tests.rs b/tests/directory_loading_tests.rs index 57afbe7..6426bd2 100644 --- a/tests/directory_loading_tests.rs +++ b/tests/directory_loading_tests.rs @@ -6,7 +6,7 @@ //! These tests validate the `load_magic_directory()` function's behavior //! with various directory structures and content scenarios. -use libmagic_rs::parser::load_magic_directory; +use libmagic_rs::parser::{ParsedMagic, load_magic_directory}; use std::fs; use std::path::{Path, PathBuf}; use tempfile::TempDir; @@ -52,7 +52,8 @@ fn create_magdir_structure(dir: &Path) -> Vec { #[test] fn test_load_empty_directory() { let temp_dir = TempDir::new().expect("Failed to create temp dir"); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load empty directory"); + let ParsedMagic { rules, .. 
} = + load_magic_directory(temp_dir.path()).expect("Failed to load empty directory"); assert_eq!(rules.len(), 0, "Empty directory should return no rules"); } @@ -69,7 +70,8 @@ fn test_load_directory_single_file() { >4 byte 2 64-bit\n", ); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); assert_eq!(rules.len(), 1, "Should load one top-level rule"); assert_eq!(rules[0].message, "ELF executable"); @@ -99,7 +101,8 @@ fn test_load_directory_multiple_files() { "0 string \\x23\\x21 shell script\n", ); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); assert_eq!(rules.len(), 4, "Should load all rules from all files"); @@ -134,7 +137,8 @@ fn test_load_directory_preserves_order() { "0 string \\x07\\x08\\x09 third file\n", ); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); assert_eq!(rules.len(), 3); // Files should be processed in alphabetical order @@ -159,7 +163,8 @@ fn test_load_directory_skips_subdirectories() { fs::create_dir(&subdir).expect("Failed to create subdirectory"); create_test_magic_file(&subdir, "sub.magic", "0 string \\x03\\x04 sub file\n"); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. 
} = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); // Should only load the main file, not the one in subdirectory assert_eq!(rules.len(), 1); @@ -192,7 +197,8 @@ fn test_load_directory_skips_symlinks() { let symlink_path = temp_dir.path().join("symlink.magic"); symlink(&external_file, &symlink_path).expect("Failed to create symlink"); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); // Should only load the regular file, not the symlinked one assert_eq!(rules.len(), 1, "Should skip symlinks"); @@ -226,7 +232,8 @@ fn test_load_directory_with_parse_errors() { ); // Should succeed and load only the valid files - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); assert_eq!( rules.len(), @@ -271,7 +278,8 @@ fn test_load_directory_with_comments() { # Empty lines above\n", ); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); assert_eq!(rules.len(), 1); assert_eq!(rules[0].message, "test file"); @@ -292,7 +300,8 @@ fn test_load_directory_with_nested_rules() { >4 byte 2 64-bit\n", ); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. 
} = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); assert_eq!(rules.len(), 1, "Should have one top-level rule"); assert_eq!(rules[0].children.len(), 2, "Should have two child rules"); @@ -311,7 +320,8 @@ fn test_load_directory_rule_count() { create_magdir_structure(temp_dir.path()); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); // Count total rules from create_magdir_structure: // 01-elf: 1 top-level (ELF executable) with 2 children = 1 top-level rule @@ -342,7 +352,8 @@ fn test_load_directory_empty_files() { "0 string \\x01\\x02 valid file\n", ); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); // Empty files should be handled gracefully assert_eq!( @@ -368,7 +379,8 @@ fn test_load_directory_mixed_extensions() { create_test_magic_file(temp_dir.path(), "noext", "0 string \\x05\\x06 no ext\n"); - let rules = load_magic_directory(temp_dir.path()).expect("Failed to load directory"); + let ParsedMagic { rules, .. } = + load_magic_directory(temp_dir.path()).expect("Failed to load directory"); // All files should be processed regardless of extension assert_eq!( @@ -423,7 +435,8 @@ fn test_load_directory_partial_failure_succeeds() { create_test_magic_file(temp_dir.path(), "bad", "not valid magic syntax"); // Should succeed because at least one file parsed - let rules = load_magic_directory(temp_dir.path()).expect("Should succeed with partial failure"); + let ParsedMagic { rules, .. 
} = + load_magic_directory(temp_dir.path()).expect("Should succeed with partial failure"); assert_eq!(rules.len(), 1, "Should have one rule from the valid file"); assert_eq!(rules[0].message, "valid rule"); diff --git a/tests/meta_types_integration.rs b/tests/meta_types_integration.rs new file mode 100644 index 0000000..c7867ab --- /dev/null +++ b/tests/meta_types_integration.rs @@ -0,0 +1,218 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! End-to-end smoke tests for meta-type directives (name/use/default/clear/indirect). +//! +//! Uses the canonical GNU `file` `searchbug.magic` fixture, which exercises +//! the `name`/`use` subroutine machinery together with `offset`, `search/N`, +//! and relative-offset (`&N`) semantics. These tests verify the acceptance +//! surface shipped in this phase and are intentionally loose about the +//! exact result string -- the full byte-for-byte match is deferred to a +//! later phase that wires up the `offset` pseudo-type. + +use std::fs; +use std::io::Write; + +use libmagic_rs::{EvaluationConfig, MagicDatabase}; +use tempfile::TempDir; + +#[test] +fn test_searchbug_magic_loads_end_to_end() { + // Regression: the canonical GNU `file` testfile `searchbug.magic` + // exercises the `name`/`use` subroutine machinery together with + // `offset`, `search/N`, and relative-offset (`&N`) semantics. Before + // meta-type parsing was wired through, this file failed to load at + // all (the parser rejected the `offset` and `name`/`use` keywords). + // + // The assertion is intentionally loose: evaluation of the top-level + // `string TEST` rule today returns "data" on buffers that contain no + // NUL bytes (see GOTCHAS S6.4 -- unanchored string rules without an + // explicit `/N` length cap read the entire remaining buffer). That is + // orthogonal to meta-type handling and is tracked separately. 
The + // point of this smoke test is to prove that the fixture parses and + // can be evaluated without panicking or erroring. + let db = MagicDatabase::load_from_file("third_party/tests/searchbug.magic") + .expect("searchbug.magic must load end-to-end"); + let bytes = std::fs::read("third_party/tests/searchbug.testfile") + .expect("searchbug.testfile fixture must exist"); + + let result = db + .evaluate_buffer(&bytes) + .expect("evaluate_buffer on searchbug.testfile"); + + // A non-empty description is the minimum smoke-test bar. + assert!( + !result.description.is_empty(), + "evaluation should produce some description" + ); + + // The top-level `string TEST` rule carries the "Testfmt" message, so + // any correctly-evaluated run must produce a description that starts + // with "Testfmt". This prefix guards the primary regression target + // of this fixture (name/use subroutine dispatch plus continuation + // rules) -- the weaker non-empty check alone can pass even when + // `use`-site children are silently skipped. + assert!( + result.description.starts_with("Testfmt"), + "description should start with \"Testfmt\", got: {}", + result.description + ); +} + +/// Synthetic end-to-end coverage of the `default` and `clear` directives: +/// +/// - When no sibling rule has matched at the current level, a `default` +/// rule must fire and contribute its message to the description. +/// - When a sibling has matched, a `default` rule must remain silent. +/// - A `clear` directive resets the per-level "sibling matched" flag, so a +/// subsequent `default` sibling at the same level can fire again even +/// after an earlier sibling matched. +/// +/// The combined scenario walks the sequence +/// `[match-A, default-skipped, clear, default-fires]` to prove `clear` +/// changes runtime sibling-matched state end-to-end through the full +/// `MagicDatabase` load/evaluate flow. 
+#[test] +fn test_default_clear_synthetic_scenario() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("default.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + // Real rule fires when first byte is 0xAA. The default fires when + // nothing else matched at this level. Trailing message fields show up + // in the concatenated description. + writeln!(f, r#"0 byte 0xAA Real-Match"#).unwrap(); + writeln!(f, r#"0 default x DEFAULT-FALLBACK"#).unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + // Buffer that does NOT trigger the byte rule -> default must fire. + let buf_no_match = [0x00u8, 0x01, 0x02, 0x03]; + let result_default = db.evaluate_buffer(&buf_no_match).unwrap(); + assert!( + result_default.description.contains("DEFAULT-FALLBACK"), + "default should fire when no sibling matched, got: {}", + result_default.description + ); + + // Buffer that DOES trigger the byte rule -> default must remain silent. + let buf_match = [0xAAu8, 0x01, 0x02, 0x03]; + let result_real = db.evaluate_buffer(&buf_match).unwrap(); + assert!( + !result_real.description.contains("DEFAULT-FALLBACK"), + "default must not fire when a sibling matched, got: {}", + result_real.description + ); + assert!( + result_real.description.contains("Real-Match"), + "real byte rule should still match, got: {}", + result_real.description + ); + + // Now exercise `clear` end-to-end: after a sibling matches (Match-A), + // the first `default` sibling (DEFAULT-SKIPPED) must stay silent, then + // `clear` resets the sibling-matched flag so the second `default` + // sibling (DEFAULT-FIRES) fires despite the earlier match. + // + // This walks all top-level siblings, so we must disable + // `stop_at_first_match` (the default config stops after the first + // top-level match, which would prevent the later `clear`/`default` + // siblings from executing). 
+ let clear_path = temp_dir.path().join("clear.magic"); + let mut cf = fs::File::create(&clear_path).unwrap(); + writeln!(cf, r#"0 byte 0xAA Match-A"#).unwrap(); + writeln!(cf, r#"0 default x DEFAULT-SKIPPED"#).unwrap(); + writeln!(cf, r#"0 clear"#).unwrap(); + writeln!(cf, r#"0 default x DEFAULT-FIRES"#).unwrap(); + + let all_matches_config = EvaluationConfig::default().with_stop_at_first_match(false); + let clear_db = + MagicDatabase::load_from_file_with_config(&clear_path, all_matches_config).unwrap(); + + // Buffer that triggers Match-A. Without `clear`, only Match-A fires + // and the DEFAULT-SKIPPED is correctly suppressed. With `clear`, + // Match-A fires, DEFAULT-SKIPPED is suppressed, the clear directive + // resets sibling_matched, and DEFAULT-FIRES then fires. + let buf_clear = [0xAAu8, 0x01, 0x02, 0x03]; + let result_clear = clear_db.evaluate_buffer(&buf_clear).unwrap(); + + assert!( + result_clear.description.contains("Match-A"), + "byte rule should still match before clear, got: {}", + result_clear.description + ); + assert!( + !result_clear.description.contains("DEFAULT-SKIPPED"), + "default immediately after a sibling match must remain silent, got: {}", + result_clear.description + ); + assert!( + result_clear.description.contains("DEFAULT-FIRES"), + "clear must reset sibling-matched so a later default can fire, got: {}", + result_clear.description + ); +} + +/// Synthetic end-to-end coverage of the `indirect` directive: a rule with +/// `TypeKind::Meta(MetaType::Indirect)` re-applies the loaded magic +/// database starting at the resolved offset. The dispatch is wired +/// through `RuleEnvironment::root_rules`, which `MagicDatabase` populates +/// with the same rule list used at the top level. +#[test] +fn test_indirect_synthetic_scenario() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("indirect.magic"); + + // Two rules at the top level: + // - At offset 0: byte 0x7F triggers an indirect re-entry at offset 8. 
+ // The indirect re-entry then re-applies the root rules against the + // sub-buffer starting at byte 8. + // - At offset 0: byte 0x42 produces "Inner-Match". When the indirect + // fires, the sub-buffer's offset 0 is the outer buffer's offset 8, + // so 0x42 there triggers the same rule recursively. + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, r#"0 byte 0x42 Inner-Match"#).unwrap(); + writeln!(f, r#"8 indirect x"#).unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + // Build a buffer where: + // buf[0] = 0x00 (no Inner-Match at top level) + // buf[8] = 0x42 (after indirect dispatch, sub-buffer[0] = 0x42) + let mut buf = vec![0u8; 16]; + buf[8] = 0x42; + + let result = db.evaluate_buffer(&buf).unwrap(); + // The indirect re-entry should produce an Inner-Match for the sub-buffer. + assert!( + result.description.contains("Inner-Match"), + "indirect must dispatch root rules at the resolved offset; got: {}", + result.description + ); +} + +#[test] +fn test_searchbug_matches_full_result_string() { + // The `searchbug.result` fixture expects the concatenation of every + // match produced by walking the full rule tree. libmagic-rs's + // `stop_at_first_match` default is `true`, which causes the + // evaluator to short-circuit after the first sibling in every + // nested rule list -- that's the right default for file-type + // classification but the wrong default for round-tripping magic(5) + // fixtures that expect every successful rule to surface its + // message. Disable it here so the fixture's full expected + // description is produced; GNU `file`'s behavior on this fixture + // is equivalent to evaluating every branch. 
+ let config = EvaluationConfig::default().with_stop_at_first_match(false); + let db = MagicDatabase::load_from_file_with_config("third_party/tests/searchbug.magic", config) + .expect("searchbug.magic must load end-to-end"); + let bytes = std::fs::read("third_party/tests/searchbug.testfile") + .expect("searchbug.testfile fixture must exist"); + let expected = std::fs::read_to_string("third_party/tests/searchbug.result") + .expect("searchbug.result fixture must exist"); + + let result = db + .evaluate_buffer(&bytes) + .expect("evaluate_buffer on searchbug.testfile"); + assert_eq!(result.description.trim(), expected.trim()); +} diff --git a/tests/parser_integration_tests.rs b/tests/parser_integration_tests.rs index a784db7..fca4459 100644 --- a/tests/parser_integration_tests.rs +++ b/tests/parser_integration_tests.rs @@ -7,7 +7,7 @@ //! rule evaluation, ensuring all components work together correctly. use libmagic_rs::MagicDatabase; -use libmagic_rs::parser::load_magic_file; +use libmagic_rs::parser::{ParsedMagic, load_magic_file}; use std::fs; use std::io::Write; use std::path::{Path, PathBuf}; @@ -66,7 +66,8 @@ fn test_load_text_magic_file_success() { "; let magic_file = create_test_magic_file(temp_dir.path(), "magic", magic_content); - let rules = load_magic_file(&magic_file).expect("Failed to load magic file"); + let ParsedMagic { rules, .. } = + load_magic_file(&magic_file).expect("Failed to load magic file"); // Verify rules loaded correctly - should have 2 top-level rules assert_eq!(rules.len(), 2, "Should have 2 top-level rules"); @@ -109,7 +110,7 @@ fn test_load_directory_magic_file_success() { ); create_test_magic_file(&magic_dir, "02_pdf", "0 string \\x25PDF- PDF document\n"); - let rules = load_magic_file(&magic_dir).expect("Failed to load directory"); + let ParsedMagic { rules, .. 
} = load_magic_file(&magic_dir).expect("Failed to load directory"); // Verify all files merged correctly in alphabetical order assert_eq!(rules.len(), 3, "Should have 3 rules from 3 files"); @@ -159,12 +160,58 @@ fn test_load_empty_directory() { let empty_dir = temp_dir.path().join("empty_magic.d"); fs::create_dir(&empty_dir).expect("Failed to create empty directory"); - let rules = load_magic_file(&empty_dir).expect("Failed to load empty directory"); + let ParsedMagic { rules, .. } = + load_magic_file(&empty_dir).expect("Failed to load empty directory"); // Should return empty rules vector (not error) assert_eq!(rules.len(), 0, "Empty directory should return empty rules"); } +// ============================================================ +// Tests for name/use subroutine round-trip +// ============================================================ + +#[test] +fn test_name_use_round_trip() { + use libmagic_rs::parser::ast::{MetaType, TypeKind}; + + // A `name` declaration + a `use` invocation at the top level. The + // name rule should be hoisted into the name table; the use rule + // should survive in the rules list. Evaluating the file against a + // matching buffer should surface the subroutine's message. + let magic = "\ +0 name part2 +>3 byte 0x42 sub-match + +0 use part2 +"; + let parsed = libmagic_rs::parser::parse_text_magic_file(magic).expect("parse meta round-trip"); + + // The name rule should be hoisted; only the `use` remains at the top. + assert_eq!(parsed.rules.len(), 1, "name rule must be hoisted out"); + assert!( + matches!( + parsed.rules[0].typ, + TypeKind::Meta(MetaType::Use(ref n)) if n == "part2" + ), + "remaining top-level rule must be the use invocation" + ); + + // End-to-end evaluation via MagicDatabase. 
+ let temp_dir = TempDir::new().expect("Failed to create temp dir"); + let magic_file = create_test_magic_file(temp_dir.path(), "meta.magic", magic); + let db = MagicDatabase::load_from_file(&magic_file) + .expect("load meta-type magic file into MagicDatabase"); + + let buffer = b"\x00\x00\x00\x42\x00"; + let result = db.evaluate_buffer(buffer).expect("evaluate meta buffer"); + assert!( + result.description.contains("sub-match"), + "description should contain subroutine message, got '{}'", + result.description + ); +} + // ============================================================ // Tests for MagicDatabase Integration // ============================================================ diff --git a/tests/property_tests.proptest-regressions b/tests/property_tests.proptest-regressions new file mode 100644 index 0000000..4d459dc --- /dev/null +++ b/tests/property_tests.proptest-regressions @@ -0,0 +1,9 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc f7a5c24cef58c44127f4600b0bbe65f42a90f4ac5917b1aaaec7441dd2544f52 # shrinks to rule = MagicRule { offset: Absolute(0), typ: Meta(Name("A")), op: Equal, value: Uint(0), message: "A", children: [], level: 0, strength_modifier: None }, buffer = [71, 195, 44, 100, 222, 230, 41, 133, 93, 35, 173, 30, 114, 46, 108, 185, 61, 123, 245, 13, 158, 187, 174, 211, 147, 133, 164, 5, 150, 185, 108, 124, 125, 199, 52, 155, 178, 215, 15, 213, 33, 116, 180, 122, 90, 166, 70, 85, 238, 89, 150, 107, 156, 250, 59, 237, 125, 180, 209, 174, 23, 204, 0, 132, 16, 115, 59, 180, 107, 223, 208, 101, 222, 39, 79, 24, 63, 141, 172, 233, 3, 132, 62, 189, 181, 74, 22, 79, 29, 168, 173] +cc 6d92e58bff36af4467c8702355f734ddd1b7d08dd98464f2792a11905810c754 # shrinks to meta_rules = [MagicRule { offset: Relative(0), typ: Meta(Indirect), op: Equal, value: Uint(0), message: "-", children: [], level: 0, strength_modifier: None }], buffer = [0] +cc fcced6ceb9872349e8c3e8d796e5c65c1e36faa3bc8b22b63b00050bc1474cae # shrinks to rule = MagicRule { offset: Relative(0), typ: Meta(Indirect), op: Equal, value: Uint(0), message: " ", children: [], level: 0, strength_modifier: None }, buffer = [0] diff --git a/tests/property_tests.rs b/tests/property_tests.rs index b24b9e8..43ff3eb 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -11,7 +11,7 @@ use proptest::prelude::*; -use libmagic_rs::parser::ast::PStringLengthWidth; +use libmagic_rs::parser::ast::{MetaType, PStringLengthWidth}; use libmagic_rs::{ Endianness, EvaluationConfig, MagicDatabase, MagicRule, OffsetSpec, Operator, TypeKind, Value, }; @@ -96,6 +96,12 @@ fn arb_type_kind() -> impl Strategy { (1usize..=4096usize).prop_map(|range| TypeKind::Search { range: ::std::num::NonZeroUsize::new(range).unwrap(), }), + Just(TypeKind::Meta(MetaType::Default)), + Just(TypeKind::Meta(MetaType::Clear)), + Just(TypeKind::Meta(MetaType::Indirect)), + Just(TypeKind::Meta(MetaType::Offset)), + "[a-zA-Z_][a-zA-Z0-9_-]{0,16}".prop_map(|id| 
TypeKind::Meta(MetaType::Name(id))), + "[a-zA-Z_][a-zA-Z0-9_-]{0,16}".prop_map(|id| TypeKind::Meta(MetaType::Use(id))), ] } @@ -153,6 +159,40 @@ fn arb_buffer() -> impl Strategy> { prop::collection::vec(any::(), 0..1024) } +/// Generate a `MagicRule` whose `TypeKind` is one of the `Meta` variants. +/// +/// Reuses [`arb_magic_rule`]-style construction but overrides `typ` with a +/// random `MetaType` choice so the property test exercises the inline +/// dispatch branches for `Default`/`Clear`/`Indirect`/`Use`/`Name`/`Offset` +/// without diluting the sample with non-Meta variants. +fn arb_meta_rule() -> impl Strategy { + let meta_kind = prop_oneof![ + Just(TypeKind::Meta(MetaType::Default)), + Just(TypeKind::Meta(MetaType::Clear)), + Just(TypeKind::Meta(MetaType::Indirect)), + Just(TypeKind::Meta(MetaType::Offset)), + "[a-zA-Z_][a-zA-Z0-9_-]{0,16}".prop_map(|id| TypeKind::Meta(MetaType::Name(id))), + "[a-zA-Z_][a-zA-Z0-9_-]{0,16}".prop_map(|id| TypeKind::Meta(MetaType::Use(id))), + ]; + ( + arb_offset_spec(), + meta_kind, + arb_operator(), + arb_value(), + "[a-zA-Z0-9 _-]{1,64}", + ) + .prop_map(|(offset, typ, op, value, message)| MagicRule { + offset, + typ, + op, + value, + message, + children: vec![], + level: 0, + strength_modifier: None, + }) +} + // ============================================================================= // Property Tests // ============================================================================= @@ -317,6 +357,24 @@ proptest! { let _ = evaluate_rules(&[rule], &buf, &mut context); } + /// Property: meta-type rule evaluation never panics for any + /// `TypeKind::Meta(...)` variant. Exercises the inline branches added + /// for `Default`, `Clear`, and `Indirect` together with the `Use` + /// fast-path and `Name` leaked-rule no-op. The 1-second timeout guard + /// keeps the property test bounded even when an arbitrarily-generated + /// rule fires the indirect-recursion path against a small buffer. 
+ #[test] + fn prop_meta_type_evaluation_never_panics( + meta_rules in prop::collection::vec(arb_meta_rule(), 1..8), + buffer in arb_buffer(), + ) { + use libmagic_rs::evaluator::{EvaluationContext, evaluate_rules}; + let config = EvaluationConfig::default().with_timeout_ms(Some(1000)); + let mut context = EvaluationContext::new(config); + // Must never panic, regardless of variant or buffer contents. + let _ = evaluate_rules(&meta_rules, &buffer, &mut context); + } + /// Property: regex evaluation stays bounded for adversarial /// patterns with large bounded repetitions. Combined with /// `build_regex`'s `size_limit` + `dfa_size_limit` (S-M2 fix), diff --git a/tests/regex_search_corpus_tests.rs b/tests/regex_search_corpus_tests.rs index 69987eb..d0abc07 100644 --- a/tests/regex_search_corpus_tests.rs +++ b/tests/regex_search_corpus_tests.rs @@ -19,7 +19,7 @@ use libmagic_rs::evaluator::evaluate_rules; use libmagic_rs::parser::ast::{RegexCount, RegexFlags}; -use libmagic_rs::parser::parse_text_magic_file; +use libmagic_rs::parser::{ParsedMagic, parse_text_magic_file}; use libmagic_rs::{ EvaluationConfig, EvaluationContext, MagicRule, OffsetSpec, Operator, TypeKind, Value, }; @@ -247,7 +247,7 @@ fn test_json1_corpus_parser_roundtrip() { // escapes are interpreted by the regex compiler, not by the magic // parser, so the double-backslashes are correct. let magic = r#"0 regex "^\\s*[\\{\\[]" JSON text data"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. } = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); @@ -270,7 +270,7 @@ fn test_regex_flag_parser_roundtrip_case_insensitive() { // and evaluates, confirming the flag is wired through. 
let buffer = load_corpus_file("json1.testfile"); let magic = r#"0 regex/c "^\\s*[\\{\\[]" JSON text data"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. } = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); @@ -284,7 +284,7 @@ fn test_regex_flag_parser_roundtrip_case_insensitive() { fn test_search_parser_roundtrip_with_range() { let buffer = load_corpus_file("searchbug.testfile"); let magic = r#"0 search/32 "ABC" found ABC"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. } = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); @@ -312,7 +312,7 @@ fn test_regex_bytes_count_parser_roundtrip() { // Match the first JSON opener byte within a 64-byte window. let buffer = load_corpus_file("json1.testfile"); let magic = r#"0 regex/64 "^\\s*[\\{\\[]" JSON in first 64 bytes"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. } = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); @@ -330,7 +330,7 @@ fn test_regex_bytes_count_parser_roundtrip() { fn test_regex_lines_count_parser_roundtrip() { let buffer = load_corpus_file("gedcom.testfile"); let magic = r#"0 regex/1l "^0 HEAD" GEDCOM head on first line"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. 
} = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); @@ -351,7 +351,7 @@ fn test_regex_lines_count_parser_roundtrip() { fn test_regex_lines_none_parser_roundtrip() { let buffer = load_corpus_file("json1.testfile"); let magic = r#"0 regex/l "^\\s*[\\{\\[]" JSON text data"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. } = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); @@ -368,7 +368,7 @@ fn test_regex_lines_none_parser_roundtrip() { fn test_regex_start_offset_and_line_flag_parser_roundtrip() { let buffer = load_corpus_file("json1.testfile"); let magic = r#"0 regex/ls "^\\s*[\\{\\[]" JSON opener with /s anchor"#; - let rules = parse_text_magic_file(magic).expect("parse_text_magic_file"); + let ParsedMagic { rules, .. } = parse_text_magic_file(magic).expect("parse_text_magic_file"); assert_eq!(rules.len(), 1); let matches = run_rules(&rules, &buffer); From f2647f02a91f30bac30829a09751678e76e1c697 Mon Sep 17 00:00:00 2001 From: "dosubot[bot]" <131922026+dosubot[bot]@users.noreply.github.com> Date: Wed, 22 Apr 2026 23:39:45 +0000 Subject: [PATCH 07/16] docs: Dosu updates for PR #230 --- docs/API_REFERENCE.md | 101 +++++++++++++++++++++++++++++++++++++- docs/src/api-reference.md | 24 ++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md index b6b42e2..08de458 100644 --- a/docs/API_REFERENCE.md +++ b/docs/API_REFERENCE.md @@ -25,6 +25,17 @@ The main interface for loading magic rules and evaluating files. 
use libmagic_rs::MagicDatabase; ``` +The struct contains internal fields: + +| Field (Internal) | Type | Description | +| ----------------- | -------------------- | ---------------------------------------------------------------- | +| `rules` | `Vec<MagicRule>` | Top-level magic rules | +| `name_table` | `Arc<NameTable>` | Named subroutine definitions extracted from `name` rules | +| `root_rules` | `Arc<[MagicRule]>` | Shared immutable slice of top-level rules for `indirect` re-entry | +| `config` | `EvaluationConfig` | Evaluation configuration | +| `source_path` | `Option<PathBuf>` | Optional path to the source magic file or directory | +| `mime_mapper` | `MimeMapper` | MIME type mapper | + #### Constructor Methods | Method | Description | @@ -252,8 +263,65 @@ match MagicDatabase::load_from_file("invalid.magic") { ## Parser Module +### Parser Functions + +#### parse_text_magic_file + +Parses a complete magic file from raw text input. + +```rust +use libmagic_rs::parser::parse_text_magic_file; + +let magic = "0 string \\x7fELF ELF file"; +let parsed = parse_text_magic_file(magic)?; +assert_eq!(parsed.rules.len(), 1); +``` + +Returns `Result<ParsedMagic, ParseError>` where `ParsedMagic` contains the top-level rules and the name table. + +#### load_magic_file + +Loads magic rules from a file or directory, automatically detecting the format. + +```rust +use libmagic_rs::parser::load_magic_file; + +let parsed = load_magic_file("/usr/share/misc/magic")?; +println!("Loaded {} magic rules", parsed.rules.len()); +``` + +Returns `Result<ParsedMagic, ParseError>`. + +#### load_magic_directory + +Loads and merges magic rules from all files in a directory. + +```rust +use libmagic_rs::parser::load_magic_directory; + +let parsed = load_magic_directory("/usr/share/file/magic.d")?; +println!("Loaded {} rules from directory", parsed.rules.len()); +``` + +Returns `Result<ParsedMagic, ParseError>`. + ### AST Types +#### ParsedMagic + +Result of parsing a text magic file.
+ +```rust +use libmagic_rs::parser::ParsedMagic; +``` + +Contains the top-level rule list with any `name`-declared subroutines hoisted into a separate name table keyed by identifier. + +| Field | Type | Description | +| ------------ | ------------- | ------------------------------------------------------------ | +| `rules` | `Vec<MagicRule>` | Top-level rules after `Name` subroutines have been removed | +| `name_table` | `NameTable` (internal) | Extracted `name` subroutine definitions, consulted by the evaluator when a rule of type `TypeKind::Meta(MetaType::Use(_))` is reached | + #### MagicRule Represents a parsed magic rule. @@ -307,6 +375,24 @@ use libmagic_rs::TypeKind; | `Date { endian, utc }` | 32-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. Date values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | | `QDate { endian, utc }` | 64-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. QDate values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | | `String { max_length }` | String data | +| `Meta(MetaType)` | Meta-type directives for control flow, conditional execution, and named subroutines. Variants: `Default`, `Clear`, `Name`, `Use`, `Indirect`, `Offset`. See [`MetaType`](#metatype) for details. | + +##### MetaType + +Control-flow directive variants carried by `TypeKind::Meta`.
+ +```rust +use libmagic_rs::parser::ast::MetaType; +``` + +| Variant | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| `Default` | Fires when no sibling at the same indentation level has matched | +| `Clear` | Resets the sibling-matched flag so a later `default` sibling can fire even if an earlier sibling matched | +| `Name(id)` | Declares a named subroutine with identifier `id` that can be invoked later via `Use` | +| `Use(id)` | Invokes a named subroutine previously declared via `Name` | +| `Indirect` | Re-applies the entire magic database at the resolved offset | +| `Offset` | Reports the current file offset as `Value::Uint(pos)` rather than reading a typed value from the buffer; operator must be `AnyValue` (`x`) | ##### 64-bit Integer Types @@ -443,7 +529,7 @@ use libmagic_rs::evaluator::MatchResult; | Field | Type | Description | | ------------ | -------- | ----------------- | -| `message` | `String` | Match description | +| `message` | `String` | Match description (printf-style format specifiers like `%d`, `%x`, `%s` are substituted with the matched value) | | `offset` | `usize` | Match offset | | `level` | `u32` | Rule level | | `value` | `Value` | Matched value | @@ -558,3 +644,16 @@ Currently, libmagic-rs does not have optional feature flags. All functionality i - **Minimum Rust Version**: 1.89 - **Edition**: 2024 - **License**: Apache-2.0 + +--- + +## Breaking Changes + +### v0.5.0 + +**Meta-type directives and format substitution** (PR #230): + +- Parser functions `parse_text_magic_file`, `load_magic_file`, and `load_magic_directory` return `Result<ParsedMagic, ParseError>` instead of `Result<Vec<MagicRule>, ParseError>`. `ParsedMagic` is a struct with fields `rules: Vec<MagicRule>` and `name_table: NameTable`. +- `MagicDatabase` struct now includes internal fields `root_rules: Arc<[MagicRule]>` and `name_table: Arc<NameTable>` to support meta-type evaluation.
+- Printf-style format substitution (`%d`, `%x`, `%s`, etc.) is applied to the `message` field in `MatchResult`. Messages containing literal `%` characters that were previously passed through verbatim will now be interpreted as format specifiers. Escape literal `%` as `%%`. +- `TypeKind::Meta(MetaType)` enum added with variants `Default`, `Clear`, `Name`, `Use`, `Indirect`, `Offset`. diff --git a/docs/src/api-reference.md b/docs/src/api-reference.md index f3c2e95..5ae9480 100644 --- a/docs/src/api-reference.md +++ b/docs/src/api-reference.md @@ -230,6 +230,24 @@ use libmagic_rs::TypeKind; | `Double { endian }` | 64-bit IEEE 754 double-precision floating-point (added in v0.5.0) | | `String { max_length }` | String data (discriminant changed from 4 to 6 in v0.5.0) | | `PString { max_length }` | Pascal string - length-prefixed byte followed by string data (returns `Value::String`) | +| `Meta(MetaType)` | Control flow and subroutine directives for conditional execution and code reuse | + +### MetaType + +Control-flow directives carried by `TypeKind::Meta`. 
+ +```rust +use libmagic_rs::MetaType; +``` + +| Variant | Description | +| ----------------- | ----------------------------------------------------------------------------------------------------- | +| `Default` | Fires when no sibling at the same indentation level matched at the current offset | +| `Clear` | Resets the sibling-matched flag so a later `default` sibling can fire even if an earlier sibling matched | +| `Name(String)` | Declares a named subroutine that can be invoked later via `Use` | +| `Use(String)` | Invokes a named subroutine previously declared via `Name` | +| `Indirect` | Re-applies the entire magic database at the resolved offset | +| `Offset` | Reports the current file offset as `Value::Uint(position)` rather than reading a typed value | ### Operator @@ -414,7 +432,7 @@ use libmagic_rs::evaluator::MatchResult; | Field | Type | Description | | ------------ | ---------- | ----------------------------------------- | -| `message` | `String` | Match description | +| `message` | `String` | Match description (printf-style format specifiers like `%d`, `%x`, `%s` are substituted at output time) | | `offset` | `usize` | Match offset | | `level` | `u32` | Rule level | | `value` | `Value` | Matched value | @@ -524,6 +542,10 @@ pub use error::{EvaluationError, LibmagicError, ParseError}; - `Value` enum: No longer derives `Eq` trait (only `PartialEq` is available due to floating-point values) - `RuleMatch` struct: Added `type_kind: TypeKind` field to indicate the type used for matching +### Breaking Changes (post-0.5.0) + +- Parser functions (`parse_text_magic_file`, `load_magic_file`, `load_magic_directory`) now return `ParsedMagic { rules, name_table }` instead of `Vec<MagicRule>`.
Code must destructure: `let ParsedMagic { rules, name_table } = parse_text_magic_file(...)?;` + ### Breaking Changes in v0.2.0 - `TypeKind::Byte` changed from a unit variant to a struct variant `Byte { signed }` to support explicit signedness From d510188da5d6ea02bb5acdaa358053ecdebd55c1 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 21:16:22 -0400 Subject: [PATCH 08/16] fix(meta-types): address PR-review findings for subroutine scope and format substitution Critical: SubroutineScope RAII replaces manual save/restore in evaluate_use_rule so last_match_end and base_offset are restored on all error exit paths (RecursionLimitExceeded and Timeout previously leaked corrupted state). format_magic_message no longer mangles non-ASCII template bytes -- replaced byte-by-byte push with plain-run slice flush. Hygiene: removed stale allow(dead_code) on AnchorScope::context(). Tests: 9 new cases -- 5 for resolve_offset_with_base bias invariants (positive Absolute biased, negative Absolute / FromEnd / Relative / Indirect not biased), 2 for Use subroutine with non-zero use-site (proves bias is active and Relative is not double-biased), 1 continuation-sibling reset regression guard with bytes-consumed first sibling, 1 non-ASCII template format test. 1357/1357 pass. Signed-off-by: UncleSp1d3r --- src/evaluator/engine/mod.rs | 71 +++++++++++---- src/evaluator/engine/tests.rs | 157 +++++++++++++++++++++++++++++++++- src/evaluator/offset/mod.rs | 82 ++++++++++++++++++ src/output/format.rs | 48 +++++++++-- 4 files changed, 334 insertions(+), 24 deletions(-) diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 47da1bc..90f4090 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -44,7 +44,6 @@ impl<'a> AnchorScope<'a> { } /// Access the underlying context for the duration of the guard. 
- #[allow(dead_code)] fn context(&mut self) -> &mut EvaluationContext { self.context } @@ -56,6 +55,51 @@ impl Drop for AnchorScope<'_> { } } +/// RAII guard for `MetaType::Use` subroutine dispatch. +/// +/// Saves `last_match_end` and `base_offset` on entry, seeds the context +/// with the use-site offset (for both fields so that a subroutine's +/// `&0` relative offset resolves to the use-site and its positive +/// absolute offsets bias against the use-site per magic(5)), and +/// restores both on drop. +/// +/// This is the safety net for early-return paths inside +/// `evaluate_use_rule`: a `RecursionGuard::enter` failure or a +/// `Timeout`/`RecursionLimitExceeded` inside the subroutine body would +/// otherwise leave the caller's context with corrupted anchor and +/// base-offset state. The guard's `Drop` impl restores both fields on +/// every exit path, error or success. +struct SubroutineScope<'a> { + context: &'a mut EvaluationContext, + saved_anchor: usize, + saved_base: usize, +} + +impl<'a> SubroutineScope<'a> { + fn enter(context: &'a mut EvaluationContext, use_site: usize) -> Self { + let saved_anchor = context.last_match_end(); + let saved_base = context.base_offset(); + context.set_last_match_end(use_site); + context.set_base_offset(use_site); + Self { + context, + saved_anchor, + saved_base, + } + } + + fn context(&mut self) -> &mut EvaluationContext { + self.context + } +} + +impl Drop for SubroutineScope<'_> { + fn drop(&mut self) { + self.context.set_last_match_end(self.saved_anchor); + self.context.set_base_offset(self.saved_base); + } +} + /// Process-local once guard for the "use directive without rule environment" /// warning. 
Ensures we surface the misconfiguration exactly once per process /// so low-level programmatic consumers of [`evaluate_rules`] (tests, fuzz @@ -261,25 +305,20 @@ fn evaluate_use_rule( context.base_offset(), )?; - // Save the anchor and base offset, seed the subroutine body with the - // use-site offset for both, and restore on exit. This gives the - // subroutine: - // * `&N` offsets resolving from the use-site (via last_match_end) - // * `>N` / absolute offsets in the subroutine resolving as - // `use_site + N` (via base_offset), matching magic(5) semantics - let saved_anchor = context.last_match_end(); - let saved_base = context.base_offset(); - context.set_last_match_end(absolute_offset); - context.set_base_offset(absolute_offset); - + // `SubroutineScope` seeds `last_match_end` and `base_offset` with + // the use-site offset and restores both on drop. This is the + // safety net for early-return paths below -- if + // `RecursionGuard::enter` or the inner `evaluate_rules` returns + // `Err(Timeout)` / `Err(RecursionLimitExceeded)`, the `?` unwinds + // through the guard's `Drop` impl and the caller's context + // returns to its pre-use state. Without the RAII wrapper a manual + // save/restore pair would be bypassed on every error path. let subroutine_matches = { - let mut guard = RecursionGuard::enter(context)?; + let mut scope = SubroutineScope::enter(context, absolute_offset); + let mut guard = RecursionGuard::enter(scope.context())?; evaluate_rules(&subroutine_rules, buffer, guard.context())? 
}; - context.set_last_match_end(saved_anchor); - context.set_base_offset(saved_base); - Ok((Some(absolute_offset), subroutine_matches)) } diff --git a/src/evaluator/engine/tests.rs b/src/evaluator/engine/tests.rs index 60a66f1..78d18b5 100644 --- a/src/evaluator/engine/tests.rs +++ b/src/evaluator/engine/tests.rs @@ -2654,8 +2654,15 @@ fn make_context_with_env(name_table: NameTable, root_rules: &[MagicRule]) -> Eva /// Minimal helper: wrap a `TypeKind::Meta(MetaType::Use(name))` rule at /// offset 0 with the given `message` and empty child list. fn use_rule(name: &str) -> MagicRule { + use_rule_at(name, 0) +} + +/// Build a `Use` rule at a specific use-site offset. Used by tests +/// that need to prove subroutine `base_offset` biasing actually +/// depends on the use-site value. +fn use_rule_at(name: &str, offset: i64) -> MagicRule { MagicRule { - offset: OffsetSpec::Absolute(0), + offset: OffsetSpec::Absolute(offset), typ: TypeKind::Meta(MetaType::Use(name.to_string())), op: Operator::Equal, value: Value::Uint(0), @@ -3466,3 +3473,151 @@ fn test_offset_sets_sibling_matched() { "default must be suppressed when offset sibling matched; got {matches:?}" ); } + +// ======================================================================= +// Subroutine base_offset biasing (issue #42 -- use-site offset +// propagation). Critical coverage per post-PR code review. +// ======================================================================= + +#[test] +fn test_use_subroutine_absolute_offset_biased_by_use_site() { + // Regression guard: if `SubroutineScope::enter` fails to seed + // `base_offset` with the use-site offset, a subroutine rule at + // `Absolute(0)` will read from buffer[0] instead of + // buffer[use_site]. This test proves the bias is active by + // placing distinct magic bytes at two different positions and + // verifying that the subroutine reads the use-site one. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Subroutine body: a single rule reading at Absolute(0). Without + // base_offset biasing this resolves to file position 0. With + // biasing it resolves to the use-site (position 8 in this test). + let subroutine_body = vec![byte_eq_rule(0, 0x42, "sub-match-at-base")]; + let name_table = build_name_table(vec![("sub", subroutine_body)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(name_table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + + // Use-site at offset 8. buffer[0] = 0x00 (would fail with bias + // missing); buffer[8] = 0x42 (required for bias-active success). + let mut buffer = vec![0u8; 16]; + buffer[8] = 0x42; + + let mut context = EvaluationContext::new(config).with_rule_env(env); + let rules = vec![use_rule_at("sub", 8)]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.iter().any(|m| m.message == "sub-match-at-base"), + "subroutine rule at Absolute(0) must be biased by use-site offset 8 \ + -- reading buffer[8] = 0x42. If bias missing, reads buffer[0] = 0x00 \ + and the test fails. got {matches:?}" + ); +} + +#[test] +fn test_use_subroutine_relative_offset_unaffected_by_use_site() { + // Companion to the bias test above: `Relative(N)` is resolved + // against `last_match_end`, which `SubroutineScope` also seeds + // to the use-site. We verify the Relative rule reads at the + // use-site + N, NOT at use-site + base + N (which would be a + // double-bias bug). + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Subroutine body: a Relative(0) rule that reads at the + // use-site (seeded via last_match_end). 
+ let mut rel_rule = byte_eq_rule(0, 0x42, "rel-sub-match"); + rel_rule.offset = OffsetSpec::Relative(0); + let subroutine_body = vec![rel_rule]; + let name_table = build_name_table(vec![("rsub", subroutine_body)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(name_table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + + let mut buffer = vec![0u8; 16]; + buffer[5] = 0x42; + + let mut context = EvaluationContext::new(config).with_rule_env(env); + let rules = vec![use_rule_at("rsub", 5)]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.iter().any(|m| m.message == "rel-sub-match"), + "subroutine Relative(0) rule must read at use-site (5) via last_match_end, \ + not at use-site+base (10). got {matches:?}" + ); +} + +#[test] +fn test_continuation_sibling_reset_after_bytes_consumed() { + // Stronger regression guard than + // `test_offset_does_not_advance_anchor_for_continuation_siblings`, + // which used Relative(0) on both siblings and was trivially + // non-advancing. Here the first sibling consumes actual bytes, + // so if the `is_child_sibling_list` reset is removed the second + // sibling would read from a shifted anchor. + // + // Parent byte at 0 matches 0x01 -> anchor = 1. + // Sibling-1: Long at &0 (resolves to 1, reads 4 bytes, + // advances anchor to 5 WITHOUT the reset). + // Sibling-2: Byte at &0 (must resolve to 1, not 5). 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + let long_sibling = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Long { + endian: crate::parser::ast::Endianness::Little, + signed: false, + }, + op: Operator::Equal, + value: Value::Uint(0x0403_0201), + message: "long-sibling".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let byte_sibling = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + // buffer[1] = 0x01 -- if reset is removed, sibling-2 reads + // buffer[5] instead and matches 0x42 (wrong!). + value: Value::Uint(0x01), + message: "byte-sibling-sees-parent-anchor".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x01), + message: "parent".to_string(), + children: vec![long_sibling, byte_sibling], + level: 0, + strength_modifier: None, + }; + + // buffer[0]=0x01 parent; buffer[1..5]=0x01,0x02,0x03,0x04 long + // match; buffer[5]=0x42 bait for missing-reset failure. + let buffer = [0x01u8, 0x01, 0x02, 0x03, 0x04, 0x42, 0x00]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["parent", "long-sibling", "byte-sibling-sees-parent-anchor"], + "byte-sibling must read buffer[1]=0x01 via parent-level anchor reset; \ + if reset is missing it reads buffer[5]=0x42 and test fails. 
got {matches:?}" + ); +} diff --git a/src/evaluator/offset/mod.rs b/src/evaluator/offset/mod.rs index 40c42c8..88c7d62 100644 --- a/src/evaluator/offset/mod.rs +++ b/src/evaluator/offset/mod.rs @@ -272,6 +272,88 @@ mod tests { assert_eq!(resolve_offset_with_context(&spec, buffer, 42).unwrap(), 5); } + #[test] + fn test_resolve_offset_with_base_biases_positive_absolute() { + // Positive Absolute inside a subroutine body is biased by + // `base_offset`. This is the load-bearing invariant of + // `MetaType::Use` subroutine semantics. + let buffer = b"0123456789ABCDEF"; + let spec = OffsetSpec::Absolute(4); + // base_offset = 10 -> resolves to 14 (not 4). + assert_eq!( + resolve_offset_with_base(&spec, buffer, 0, 10).unwrap(), + 14, + "positive Absolute must be biased by base_offset inside a subroutine" + ); + } + + #[test] + fn test_resolve_offset_with_base_does_not_bias_negative_absolute() { + // Negative Absolute means "from-end" semantics (magic(5) + // allows either explicit `FromEnd` or negative `Absolute`). + // The subroutine base_offset is relative to the file start + // and has no meaning for from-end positions. + let buffer = b"0123456789ABCDEF"; + let spec = OffsetSpec::Absolute(-4); + // Without bias: resolves to len - 4 = 12. + // Buggy with-bias would give: 10 + (len - 4) or similar. + assert_eq!( + resolve_offset_with_base(&spec, buffer, 0, 10).unwrap(), + 12, + "negative Absolute must NOT be biased" + ); + } + + #[test] + fn test_resolve_offset_with_base_does_not_bias_from_end() { + // `FromEnd` is always relative to the buffer, not the + // subroutine's use-site. + let buffer = b"0123456789ABCDEF"; + let spec = OffsetSpec::FromEnd(-4); + assert_eq!( + resolve_offset_with_base(&spec, buffer, 0, 10).unwrap(), + 12, + "FromEnd must NOT be biased" + ); + } + + #[test] + fn test_resolve_offset_with_base_does_not_bias_relative() { + // `Relative(N)` resolves against the previous-match anchor, + // not the subroutine base. 
Inside a subroutine body, + // `last_match_end` is seeded to the use-site by + // `SubroutineScope::enter`, so this already has the correct + // frame of reference without additional bias. + let buffer = b"0123456789ABCDEF"; + let spec = OffsetSpec::Relative(3); + // last_match_end = 2, base_offset = 10. + // Expected: 2 + 3 = 5 (bias does NOT apply). + assert_eq!( + resolve_offset_with_base(&spec, buffer, 2, 10).unwrap(), + 5, + "Relative must NOT be biased (already resolved against last_match_end)" + ); + } + + #[test] + fn test_resolve_offset_with_base_does_not_bias_indirect() { + // `Indirect` reads a pointer from the buffer; the pointer's + // value is an absolute file position, not a subroutine- + // relative one. + let buffer = b"\x05TestXdata"; + let spec = OffsetSpec::Indirect { + base_offset: 0, + pointer_type: crate::parser::ast::TypeKind::Byte { signed: false }, + adjustment: 0, + endian: crate::parser::ast::Endianness::Little, + }; + assert_eq!( + resolve_offset_with_base(&spec, buffer, 0, 10).unwrap(), + 5, + "Indirect must NOT be biased" + ); + } + #[test] fn test_resolve_offset_comprehensive() { let buffer = b"0123456789ABCDEF"; diff --git a/src/output/format.rs b/src/output/format.rs index 4c9decc..6b89c83 100644 --- a/src/output/format.rs +++ b/src/output/format.rs @@ -74,19 +74,24 @@ pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) let mut out = String::with_capacity(template.len()); let bytes = template.as_bytes(); let mut i = 0; + // Start of the most recent run of non-`%` bytes. We copy the run + // as a string slice rather than byte-by-byte so non-ASCII UTF-8 + // code points survive intact. Scanning still happens at the byte + // level (safe because `%` is ASCII 0x25 and cannot appear as a + // UTF-8 continuation byte, which is always >= 0x80). 
+ let mut plain_start = 0; while i < bytes.len() { - let b = bytes[i]; - if b != b'%' { - // SAFETY: iterating by byte but template is valid UTF-8; any - // non-ASCII multi-byte character has all continuation bytes - // > 0x7f which cannot equal b'%' (0x25), so we never split - // a UTF-8 codepoint here. Push as char. - out.push(b as char); + if bytes[i] != b'%' { i += 1; continue; } + // Flush any pending plain-text run as a single UTF-8 slice. + if plain_start < i { + out.push_str(&template[plain_start..i]); + } + // Start of a format specifier at position i. let spec_start = i; let Some(parsed_spec) = parse_spec(bytes, i + 1) else { @@ -97,6 +102,9 @@ pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) "format_magic_message: malformed specifier at byte {i} in template {template:?}; passing through remainder literally", ); out.push_str(&template[i..]); + // Skip the trailing flush -- we have already emitted the + // remainder above. + plain_start = bytes.len(); break; }; let next_i = parsed_spec.end; @@ -112,6 +120,12 @@ pub fn format_magic_message(template: &str, value: &Value, type_kind: &TypeKind) out.push_str(literal); } i = next_i; + plain_start = i; + } + + // Flush any trailing plain-text run. + if plain_start < bytes.len() { + out.push_str(&template[plain_start..]); } out @@ -522,6 +536,26 @@ mod tests { assert_eq!(out, "100% sure"); } + #[test] + fn test_non_ascii_template_preserved() { + // Regression guard: earlier revisions iterated by byte and + // pushed each `b as char`, which re-encoded non-ASCII UTF-8 + // continuation bytes as Latin-1 code points and mangled the + // output (e.g., "café" -> "cafÃ©"). The plain-run flush path + // must copy slices of the original template to preserve the + // original UTF-8 byte sequences. + let out = format_magic_message("café %d", &Value::Int(42), &long_t()); + assert_eq!(out, "café 42"); + + // Non-ASCII around a specifier on both sides.
+ let out = format_magic_message("→ %s ←", &Value::String("ok".into()), &byte_t()); + assert_eq!(out, "→ ok ←"); + + // Non-ASCII only, no specifiers. + let out = format_magic_message("über", &Value::Uint(0), &byte_t()); + assert_eq!(out, "über"); + } + #[test] fn test_multiple_specifiers_in_one_template() { // Note: current implementation binds every specifier to the single From 7a5add9439fd0f6b91cbbc9926f2c1c9c10c2f3b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 22:40:37 -0400 Subject: [PATCH 09/16] docs(solutions): add RAII scope guard learning and refresh meta-type dispatch doc New learning: logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md captures the bug class where manual save/restore of EvaluationContext state is bypassable by ? on fallible operations, and documents SubroutineScope as the RAII fix. Includes secondary coverage of the non-ASCII UTF-8 template fix in format_magic_message and a false-positive postmortem on AtomicBool::swap semantics. Refresh: integration-issues/meta-type-subroutine-dispatch-architecture.md updated to reflect PR #230 Phase 2 landing. All six MetaType variants shipped, indirect and default/clear reclassified from 'next phase' to shipped, default/clear implementation note corrected (frame-local sibling_matched, not MatchStateTracker), Layer 3 now reflects base_offset and SubroutineScope, Use-dispatch example references the RAII guard, related links cross-reference the new learning. Marked last_refreshed: 2026-04-22. 
Signed-off-by: UncleSp1d3r --- ...a-type-subroutine-dispatch-architecture.md | 38 ++-- ...ards-for-evaluator-context-save-restore.md | 201 ++++++++++++++++++ 2 files changed, 226 insertions(+), 13 deletions(-) create mode 100644 docs/solutions/logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md diff --git a/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md b/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md index ef91194..ba9ad5d 100644 --- a/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md +++ b/docs/solutions/integration-issues/meta-type-subroutine-dispatch-architecture.md @@ -1,6 +1,7 @@ --- title: Parse-time name table extraction and context-threaded RuleEnvironment for meta-type subroutines date: 2026-04-22 +last_refreshed: 2026-04-22 status: resolved severity: medium category: integration-issues @@ -10,6 +11,8 @@ components: - parser/loader - evaluator/mod - evaluator/engine + - evaluator/offset + - output/format - MagicDatabase tags: - rust @@ -23,9 +26,10 @@ tags: - control-flow - architecture-pattern issue: '#42' +pr: '#230' branch: 42-parser-implement-default-clear-name-use-and-indirect-meta-types applies_when: - - Implementing a new magic(5) control-flow directive (e.g. 
indirect, default, clear) + - Implementing a new magic(5) control-flow directive (all six -- default, clear, name, use, indirect, offset -- are now wired through; reference this pattern when adding a seventh or when refactoring existing dispatch) - Adding any whole-database state that evaluation needs to consult outside the current rule - Considering breaking changes to evaluate_rules / evaluate_rules_with_config root_cause: Control-flow directives do not fit the evaluator's "resolve offset -> read typed value -> apply operator" pipeline; whole-database state (name tables, root rule re-entry) must live somewhere @@ -35,12 +39,16 @@ solution_files: - src/parser/loader.rs - src/evaluator/mod.rs - src/evaluator/engine/mod.rs + - src/evaluator/offset/mod.rs + - src/output/format.rs - src/error.rs - src/lib.rs - tests/meta_types_integration.rs related_gotchas: - - S2.1 TypeKind exhaustive-match discipline still applies; the new Meta(Use) arm is dispatched from evaluate_rules, not evaluate_single_rule_with_anchor + - S2.1 TypeKind exhaustive-match discipline still applies; the Meta(Use) / Meta(Indirect) / Meta(Offset) arms are dispatched from evaluate_rules, not evaluate_single_rule_with_anchor - S3 parser architecture now produces ParsedMagic { rules, name_table }, not Vec + - S3.8 top-level sibling anchor chaining; S3.10 subroutine base_offset semantics + - S14.2 printf-style format substitution (wired into concatenate_messages via src/output/format.rs) - Property tests synthesize arbitrary TypeKind values; evaluator arms for Meta must debug!-log rather than debug_assert!-panic --- @@ -88,14 +96,15 @@ Whole-database state lives in: ```rust pub(crate) struct RuleEnvironment { name_table: Arc, + root_rules: Arc<[MagicRule]>, } ``` -`EvaluationContext` gained a `rule_env: Option>` field. 
`MagicDatabase::evaluate_file` attaches the environment before calling `evaluate_rules`; programmatic consumers (`evaluate_rules_with_config`, property tests, fuzz harnesses) default to `None`, and `Use` rules then become silent no-ops. +`EvaluationContext` gained a `rule_env: Option>` field. `MagicDatabase::evaluate_file` attaches the environment before calling `evaluate_rules`; programmatic consumers (`evaluate_rules_with_config`, property tests, fuzz harnesses) default to `None`, and `Use` / `Indirect` rules then become silent no-ops. -`Arc` (not `&`) because the context already outlives individual rule borrows, and property tests construct contexts without a lifetime parameter on `EvaluationContext`. +`Arc` (not `&`) because the context already outlives individual rule borrows, and property tests construct contexts without a lifetime parameter on `EvaluationContext`. `root_rules` was initially staged speculatively for `indirect` and is now live — `MetaType::Indirect` dispatch in `evaluate_rules` reads `root_rules` and re-enters the full ruleset at the resolved offset, bounded by the existing `max_recursion_depth` via `RecursionGuard`. -A second field -- `root_rules: Arc<[MagicRule]>` -- is carried on the real struct to serve `indirect` when it lands. That is a deliberate YAGNI exception: `MagicDatabase` already holds an `Arc<[MagicRule]>` at construction time, so adding the re-entry point now costs one field-copy and zero future parser work. Do not extrapolate from it -- add environment state when the consuming directive is in the same phase, not speculatively. +`EvaluationContext` also grew a companion field — `base_offset: usize` — that is not on `RuleEnvironment` because it is per-evaluation-frame state rather than per-database state. `base_offset` biases positive `OffsetSpec::Absolute(n)` resolution inside a `MetaType::Use` subroutine body so that `>N` rules resolve relative to the use-site (magic(5) semantics). 
See GOTCHAS S3.10 and the companion learning in `logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md` for why `base_offset` is save/restored via a `SubroutineScope` RAII guard rather than manually. ## Why this matters @@ -111,11 +120,13 @@ Four alternatives were considered and rejected. Each rejection is load-bearing f ## When to apply -The three-layer pattern is the template for every remaining magic(5) control-flow directive: +The three-layer pattern is the template every shipped magic(5) control-flow directive follows, and the template for future ones: -- **`indirect`** (next phase): resolve an offset, reinterpret the bytes there as the beginning of a rule stream, and evaluate `env.root_rules` (already staged on `RuleEnvironment`) at that offset. Layer 1 is trivial (no hoist -- `indirect` is a value-position directive, not a top-level declaration); Layer 3 provides `root_rules` as the re-entry point. Note the anchor semantics differ from `use`: `indirect` starts fresh at the resolved offset and does **not** save/restore the caller's `last_match_end`, whereas `use` is a scoped subroutine that saves, seeds, and restores. (session history) -- **`default`/`clear`**: sibling-chain predicates. These need a new `MatchStateTracker` threaded alongside `last_match_end` in `EvaluationContext` (tracks "did any prior sibling at this level match"). The same "optional per-evaluation state field on the context, programmatic consumers default to off" pattern applies directly. -- **Future `!:mime` / `!:ext` / `!:apple` directive evaluation** (tracked under v0.6.0's `Directive` extension point): same shape -- extracted at parse time into a per-rule directive table, threaded via `RuleEnvironment`, consulted only by the match-accumulation path, not the hot read loop. +- **`indirect`** (shipped in PR #230): resolves an offset, re-enters `env.root_rules` against a sub-slice of the buffer via `AnchorScope`. 
Layer 1 is trivial (no hoist — `indirect` is a value-position directive, not a top-level declaration); Layer 3 provides `root_rules` as the re-entry point. The anchor semantics differ from `use`: `indirect` starts fresh at the resolved offset and does **not** save/restore the caller's `last_match_end` across sibling evaluation, whereas `use` is a scoped subroutine that saves and restores via `SubroutineScope` (which also covers `base_offset`). +- **`default`/`clear`** (shipped in PR #230): sibling-chain predicates, implemented via a **frame-local `sibling_matched: bool`** inside `evaluate_rules` — explicitly NOT a new field on `EvaluationContext`, because the state's lifetime is the single recursion frame rather than the whole evaluation. `clear` resets the flag, `default` fires only when the flag is still false. The earlier speculation in this doc about a `MatchStateTracker` context field was rejected in favor of the simpler frame-local approach. +- **`offset`** (shipped in PR #230): a value-position directive that reports the resolved file offset as `Value::Uint(pos)` so printf-style format specifiers (`%lld`, `%d`) can substitute it in the rule message. Layer 3 is not involved; the dispatch reads nothing from `RuleEnvironment`. What it does need is the companion printf substitution path in `src/output/format.rs::format_magic_message`, wired into `MagicDatabase::concatenate_messages`. +- **Continuation-sibling anchor reset** (shipped in PR #230): at `recursion_depth > 0`, each sibling's `&N` offset resolves against the parent-level entry anchor rather than the previous sibling's advance. Top-level siblings (depth 0) keep chaining per GOTCHAS S3.8. This is the mechanism that makes `searchbug.magic`-style continuation chains match GNU `file` byte-for-byte. 
+- **Future `!:mime` / `!:ext` / `!:apple` directive evaluation** (tracked under v0.6.0's `Directive` extension point): same shape — extracted at parse time into a per-rule directive table, threaded via `RuleEnvironment`, consulted only by the match-accumulation path, not the hot read loop. The general rule: **if a directive's meaning depends on state outside the single rule being evaluated, hoist it at parse time into an environment that rides alongside the context. Never reach for the whole rule tree from inside the evaluation loop.** @@ -141,7 +152,7 @@ if let TypeKind::Meta(MetaType::Use(name)) = &rule.typ { } ``` -The anchor save/restore inside `evaluate_use_rule` seeds the subroutine with the use-site offset, then restores the caller's anchor; after returning, the outer loop re-advances to the use-site offset so sibling rules see the `use` as having "consumed" the use-site position. Mutual recursion (`a use b; b use a`) is caught by `RecursionGuard::enter(context)?` and surfaced as `EvaluationError::RecursionLimitExceeded`. +The anchor save/restore inside `evaluate_use_rule` is implemented via `SubroutineScope<'a>`, a Drop-based RAII guard that saves both `last_match_end` and `base_offset` on entry, seeds them with the use-site offset, and restores both on every exit path — including panic unwind and `?` short-circuits from inner `RecursionGuard::enter(context)?` or inner `evaluate_rules(...)?`. After `evaluate_use_rule` returns, the outer loop re-advances the anchor to the use-site offset so sibling rules see the `use` as having "consumed" the use-site position. Mutual recursion (`a use b; b use a`) is caught by `RecursionGuard::enter(context)?` and surfaced as `EvaluationError::RecursionLimitExceeded`; the `SubroutineScope` guarantees the caller's anchor and base_offset are restored even when that error propagates. See `logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md` for the full rationale and the anti-pattern it replaced. 
One subtlety the first Phase 3 attempt got wrong: the `Use` rule's own *children* (continuation rules at deeper indentation following the `use` directive) must still be evaluated after the subroutine returns. The initial implementation skipped them, silently breaking valid libmagic chains. The fix evaluates the `use` rule's children after the named rule body completes. (session history) @@ -195,9 +206,10 @@ The asymmetry between `debug!` (production-safe and test-safe) and `debug_assert ## Related -- [`integration-issues/indirect-offset-parser-evaluator-sync.md`](indirect-offset-parser-evaluator-sync.md) -- closest sibling pattern: AST variant existed but was unreachable from `MagicDatabase::load_from_file()` until parser and evaluator were wired together. Different surface (offset syntax vs. directive dispatch) but same "parser-evaluator sync" shape. Consolidation review may be worthwhile once `indirect` meta-type lands. +- [`logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md`](../logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md) -- the companion learning from PR #230's post-commit review pass. Documents the `SubroutineScope` RAII guard pattern that replaced the manual save/restore originally shipped in this doc's Use dispatch, plus the secondary UTF-8 byte-preservation fix in `format_magic_message` and a false-positive postmortem on `AtomicBool::swap` semantics. +- [`integration-issues/indirect-offset-parser-evaluator-sync.md`](indirect-offset-parser-evaluator-sync.md) -- closest sibling pattern: AST variant existed but was unreachable from `MagicDatabase::load_from_file()` until parser and evaluator were wired together. Different surface (offset syntax vs. directive dispatch) but same "parser-evaluator sync" shape. The earlier consolidation-review note has been resolved now that `indirect` has shipped: the two docs remain distinct (this doc covers dispatch architecture; that doc covers offset-resolution semantics). 
- [`integration-issues/implementing-variable-width-typekind-variant.md`](implementing-variable-width-typekind-variant.md) -- same discipline around "adding a TypeKind variant that does not fit the fixed-shape `read_typed_value` pipeline"; relevant precedent for dispatch threading. - [`logic-errors/indirect-offset-gnu-file-semantics.md`](../logic-errors/indirect-offset-gnu-file-semantics.md) -- precedent for honoring GNU `file` semantics in a meta-directive. - [`developer-experience/rust-test-visibility-boundary.md`](../developer-experience/rust-test-visibility-boundary.md) -- the `pub(crate)` accessor pattern used for `RuleEnvironment` and `NameTable`. -- GOTCHAS.md S2.1 (TypeKind exhaustive matches), S3 (parser architecture -- now yields `ParsedMagic { rules, name_table }`), S13 (evaluation configuration -- `use` recursion bounded by the existing recursion-depth guard). -- GitHub issues: #42 (driving), #54 (parent epic: Type System Expansion), #48 (third_party/tests compatibility baseline). +- GOTCHAS.md S2.1 (TypeKind exhaustive matches), S3 (parser architecture -- now yields `ParsedMagic { rules, name_table }`), S3.8 (top-level sibling anchor chaining), S3.10 (subroutine base_offset semantics), S13 (evaluation configuration -- `use` recursion bounded by the existing recursion-depth guard), S14.2 (printf-style format substitution via `format_magic_message`). +- GitHub issues: #42 (driving), #54 (parent epic: Type System Expansion), #48 (third_party/tests compatibility baseline). PR: #230 (the landing PR; all six MetaType variants shipped). 
diff --git a/docs/solutions/logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md b/docs/solutions/logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md new file mode 100644 index 0000000..67b6436 --- /dev/null +++ b/docs/solutions/logic-errors/raii-scope-guards-for-evaluator-context-save-restore.md @@ -0,0 +1,201 @@ +--- +title: RAII scope guards for error-safe save/restore of evaluator context state +date: 2026-04-22 +status: resolved +severity: high +category: logic-errors +problem_type: logic_error +root_cause: scope_issue +resolution_type: code_fix +components: + - evaluator/engine + - evaluator/mod + - output/format +tags: + - rust + - evaluator + - raii + - drop-guard + - error-safety + - save-restore + - meta-types + - issue-42 +issue: '#42' +pr: '#230' +branch: 42-parser-implement-default-clear-name-use-and-indirect-meta-types +applies_when: + - Adding a new `EvaluationContext` field that needs scoped save/restore around a subroutine, indirect re-entry, or other nested evaluation + - Introducing fallible (`?`-returning) operations inside a block that has previously mutated context state + - Reviewing any `let saved_x = context.x(); ... context.set_x(new); ...?; ... 
context.set_x(saved_x);` sequence + - Extending an existing RAII guard (`AnchorScope`, `RecursionGuard`) to cover additional fields +solution_files: + - src/evaluator/engine/mod.rs + - src/evaluator/mod.rs + - src/output/format.rs + - src/evaluator/offset/mod.rs +related_gotchas: + - S2.1 TypeKind exhaustive-match discipline (analogous "every site must be updated" pattern) + - S3.8 Relative offsets global-anchor discipline (the anchor field this guard restores) + - S3.10 Subroutine base_offset (the second field this guard restores) + - S14.2 Printf-style format specifiers (the adjacent UTF-8 byte-preservation fix) +--- + +# RAII scope guards for error-safe save/restore of evaluator context state + +## Context + +During the close-out of issue #42 (libmagic meta-type directives, PR #230), a post-commit code review pass surfaced three findings. The first two were real bugs with the same structural shape; the third was a false positive worth documenting because it recurred independently across two reviewers. All three surfaced only after `just ci-check` was green and the full 1348-test suite passed — they were invisible to the test harness because no existing test case exercised the trigger conditions. + +The bug class at the heart of this learning is **manual save/restore of shared mutable state that silently becomes a no-op when `?` short-circuits the restore**. The codebase already had a working fix pattern (`AnchorScope` for `Indirect` dispatch), but it had not been applied to a newer code path that grew organically from one saved field to two. The learning is less "we shipped a bug" and more "when a RAII pattern exists, extending state requires extending the RAII guard, not adding a parallel manual save/restore pair." 
+ +## Symptoms + +A future developer debugging this class of bug would see behavior that looks like non-determinism or misidentification of file types when `use` directives are present in magic rules: + +- After a `RecursionLimitExceeded` or `Timeout` error returned from `evaluate_use_rule`, subsequent calls to `evaluate_rules` on the same `EvaluationContext` (without an intervening `context.reset()`) would resolve relative offsets against the use-site offset rather than the caller-level anchor. Rules using `&+N` / `&-N` would resolve at wrong file positions. +- Base-offset-biased rules inside the next evaluation would silently compute offsets relative to the stale use-site rather than zero (or the caller's correct base). This produced matches at wrong byte positions, missed matches, or `BufferOverrun` errors on otherwise-valid files. +- The corruption was intermittent — it only manifested when an error occurred during `use` rule evaluation, and only affected *subsequent* evaluations on a reused context. Tests that reset between calls, or that never exercised the timeout / recursion-limit paths on subroutine bodies, would not catch it. +- The `EvaluationContext::base_offset` doc comment **referenced a `BaseOffsetScope` RAII guard that did not exist in the codebase** — a ghost reference from a planned-but-not-implemented design. This is a symptom worth searching for directly: any `// ... restored via FooScope` comment where `FooScope` does not grep is a latent manual-restore bug. 
+ +## What Didn't Work: Manual Save/Restore + +The original pattern in `evaluate_use_rule` saved the anchor and base offset at the top, modified both, then restored them at the bottom: + +```rust +let saved_anchor = context.last_match_end(); +let saved_base = context.base_offset(); +context.set_last_match_end(absolute_offset); +context.set_base_offset(absolute_offset); + +let subroutine_matches = { + let mut guard = RecursionGuard::enter(context)?; // <- can return Err + evaluate_rules(&subroutine_rules, buffer, guard.context())? // <- can return Err +}; + +context.set_last_match_end(saved_anchor); // <- skipped on any Err above +context.set_base_offset(saved_base); // <- skipped on any Err above +``` + +This passed review, passed `just ci-check`, and passed all 1348 tests — because no test had a `Use` rule whose subroutine body exceeded `max_recursion_depth` or exceeded the configured timeout. The error-path corruption was data-dependent on a condition the test corpus never triggered. + +The reason the restore is bypassable is structural. Rust's `?` operator is syntactic sugar for early return on `Err`: execution jumps immediately out of the function's stack frame, skipping any remaining lines. The manual restore lines sit below the `?` operators, so they are unreachable on any error path. This is not a logic mistake — it is a fundamental property of using `?` without RAII. + +The session history on this branch (session history) adds a telling detail: `AnchorScope` was introduced earlier in a prior session specifically to guard `MetaType::Indirect` dispatch, which needed to save and restore only `last_match_end`. `AnchorScope` was not reused for `Use` when that path was built. Later, when `base_offset` was added to `EvaluationContext` to implement subroutine-relative absolute offsets (magic(5) semantics — see GOTCHAS S3.10), its doc comment named a `BaseOffsetScope` RAII guard as the intended implementation. 
The actual implementation shipped a manual save/restore pair. The ghost guard name in the doc comment was written as future-tense design intent, and was never reconciled when the real guard (named `SubroutineScope`) was finally built during PR review. The two-field save/restore problem thus grew organically from a one-field fix with no re-examination of the pattern at each step. + +## Solution: RAII Guard + +Introduce `SubroutineScope<'a>` in `src/evaluator/engine/mod.rs` — a struct that holds a mutable reference to the context along with the two saved values, and restores both in its `Drop` implementation: + +```rust +struct SubroutineScope<'a> { + context: &'a mut EvaluationContext, + saved_anchor: usize, + saved_base: usize, +} + +impl<'a> SubroutineScope<'a> { + fn enter(context: &'a mut EvaluationContext, use_site: usize) -> Self { + let saved_anchor = context.last_match_end(); + let saved_base = context.base_offset(); + context.set_last_match_end(use_site); + context.set_base_offset(use_site); + Self { + context, + saved_anchor, + saved_base, + } + } + + fn context(&mut self) -> &mut EvaluationContext { + self.context + } +} + +impl Drop for SubroutineScope<'_> { + fn drop(&mut self) { + self.context.set_last_match_end(self.saved_anchor); + self.context.set_base_offset(self.saved_base); + } +} +``` + +The call site becomes: + +```rust +let subroutine_matches = { + let mut scope = SubroutineScope::enter(context, absolute_offset); + let mut guard = RecursionGuard::enter(scope.context())?; + evaluate_rules(&subroutine_rules, buffer, guard.context())? +}; +``` + +If `RecursionGuard::enter` returns `Err`, the `?` exits the block; `guard` is not yet constructed, but `scope` has been, and `scope` drops — restoring both fields. If `evaluate_rules` returns `Err`, `guard` drops first (decrementing recursion depth), then `scope` drops (restoring anchor and base). If both succeed, the block completes and the temporary bindings drop at the closing brace. 
In all three cases the restore happens. + +## Why This Works + +Rust's `Drop` trait is invoked unconditionally when a value goes out of scope, regardless of whether the exit is normal, an early return, a `?` propagation, or a panic unwind. (Project policy separately forbids panics in library code, and `unsafe_code = "forbid"` is a workspace lint, but the Drop guarantee does not depend on either project-level rule.) This means the RAII guard eliminates the entire category of "forgot to restore" bugs: there is no code path through which the fields can be left modified, because the restore is tied to the object's lifetime rather than to a specific line of code. The compiler enforces that `scope` lives exactly as long as its enclosing block; it cannot be moved past the block, dropped early, or accidentally omitted by a refactor. + +The same principle is why `RecursionGuard` was already implemented as RAII. Once the pattern is established for one piece of scoped state, each additional piece that participates in the same save/restore discipline needs to be either folded into an existing guard or given its own guard. The maintenance burden is not "write a Drop impl for each field" — it is "recognize the state-mutation pattern and reach for the existing tool." + +## Prevention + +The canonical smell is a three-part sequence: `let saved_x = context.x()` together with `context.set_x(new_value)`, followed by any `?` operator (directly or via a nested block that returns via `?`), followed by `context.set_x(saved_x)` at a later position. When reviewing code for this pattern, the single checklist question is: **"Is mutable shared state modified before a fallible operation, and restored manually afterward?"** If the answer is yes, the code is already buggy or one refactor away from being buggy. + +Process habits that catch this class: + +1.
**When adding a new field to a context type that participates in scoped evaluation** — where a callee should see a modified value but the caller must see the original — the first question is "does a RAII guard already exist for this kind of state?" If yes, the new field goes into the existing guard. If no, a new guard is built before the manual save/restore is written. +2. **Treat ghost references in doc comments as red flags.** If a doc comment names a type (`...restored via BaseOffsetScope`), grep for that type. If it does not exist, the doc was written as design intent and the implementation shipped the weaker form. Reconcile immediately. +3. **Asymmetry between neighboring save/restore sites is a planning failure.** `AnchorScope` (one field, RAII) and `evaluate_use_rule` (two fields, manual) are the same problem at different scales. Any time a reviewer sees one site using RAII and an adjacent site using manual save/restore, the question is whether the manual site is about to break or has already broken. + +Mechanical detection is possible via a custom Semgrep rule matching the three-part sequence `let saved_$X = ...; ...; ...?; ...; ..set_$X(saved_$X)`. Clippy does not have a built-in lint for this shape. The project's `just ci-check` pipeline does not catch it either; detection relies on review judgment or targeted tests that exercise the error paths. + +Direct regression guards added in PR #230 (`test_use_subroutine_absolute_offset_biased_by_use_site`, `test_use_subroutine_relative_offset_unaffected_by_use_site`) cover the happy path for `base_offset` propagation but intentionally do not exercise `RecursionLimitExceeded` inside a `Use` body, because that would require a fixture with deep nesting and would be fragile. A future reviewer who changes `SubroutineScope` should verify the Drop semantics manually or write a `max_recursion_depth = 1` regression test against a mutually-recursive `use` chain. 
+## Secondary Fix: Non-ASCII Template Bytes in `format_magic_message` + +The same PR-review pass caught an independent bug in `src/output/format.rs::format_magic_message`. The original implementation iterated `template.as_bytes()` and pushed each non-`%` byte with `out.push(b as char)`. In Rust, casting a `u8` to `char` produces the Unicode scalar value with that code point — which for bytes in the range 0x80–0xFF yields the corresponding Latin-1 character (U+0080–U+00FF) rather than the original UTF-8 lead or continuation byte. A two-byte UTF-8 sequence like `é` (0xC3 0xA9) was emitted as two separate Latin-1 characters (`Ã` and `©`), corrupting any template containing non-ASCII text. + +The fix tracks a `plain_start` index and copies plain-text runs as string slices (`&template[plain_start..i]`) rather than byte by byte. This is safe because `%` is ASCII (0x25) and cannot appear inside a multi-byte UTF-8 sequence (every lead and continuation byte is 0x80 or above), so scanning for `%` at byte granularity cannot split a multi-byte code point. The slice copy preserves the original UTF-8 byte sequences verbatim. A regression guard (`test_non_ascii_template_preserved`) pins the fix with `café`, `→ ok ←`, and `über`. + +The relationship to the primary learning is the same: this bug shipped in an implementation that was green in unit tests and on `just ci-check`, because nothing in the test corpus exercised it. The fix pattern is structurally analogous — replace ad-hoc byte-by-byte work with a higher-level primitive (here, string slicing) that preserves the invariant by construction rather than by discipline. + +## False Positive Postmortem: `AtomicBool::swap` Return Value + +Two independent reviewers (the `correctness-reviewer` and the `silent-failure-hunter`) both flagged `USE_WITHOUT_RULE_ENV_WARNED.swap(true, Ordering::Relaxed)` as having inverted logic, and both recommended negating the condition. Both tracings were accurate up to the penultimate step and then drew the opposite conclusion.
+ +The code is: + +```rust +if USE_WITHOUT_RULE_ENV_WARNED.swap(true, Ordering::Relaxed) { + debug!("use directive '{name}' evaluated without a rule environment; no-op"); +} else { + warn!( + "use directive '{name}' evaluated without a rule environment; treating as no-op (subsequent occurrences suppressed)" + ); +} +``` + +`swap` returns the **previous** value. On the first call the previous value is `false` (the `AtomicBool::new(false)` initialization), so the condition takes the `else` branch and emits the `warn!`. On subsequent calls the previous value is `true`, so the condition takes the `if` branch and emits the `debug!`. The code is correct — first call warns, subsequent calls debug. + +The shared mental-model error among the reviewers was reading `swap(true)` as "set to true and return true." The session history shows this finding recurred multiple times across independent review passes. The lesson for future reviews is that `AtomicBool::swap` is subtly different from `fetch_or` or `compare_exchange` in how its return value relates to intent. When the intent is "do X only on the first call," the idiomatic reading is "was it already true before I set it?" — `false` means no (first call), `true` means yes (subsequent calls). + +Writing the branch against a named variable makes the intent hard to misread: + +```rust +let already_warned = USE_WITHOUT_RULE_ENV_WARNED.swap(true, Ordering::Relaxed); +if already_warned { + debug!(...) +} else { + warn!(...) +} +``` + +The production code preserves the inline form because it is correct, the comment above it documents the first-call-vs-subsequent semantics, and rewriting every subsequent review flag would be noise. But new uses of `AtomicBool::swap` as a once-guard in this codebase should prefer the named-variable form to eliminate the inversion risk at the source. 
+ +## Related + +- [`integration-issues/meta-type-subroutine-dispatch-architecture.md`](../integration-issues/meta-type-subroutine-dispatch-architecture.md) — sibling doc covering the three-layer parse-time / `ParsedMagic` / optional `RuleEnvironment` pattern that this RAII guard sits inside. That doc describes the `use` dispatch's save/restore contract in prose but predates the `SubroutineScope` fix; after this learning ships, that doc should be updated to point readers at `SubroutineScope` as the canonical implementation. +- [`security-issues/pstring-anchor-poisoning.md`](../security-issues/pstring-anchor-poisoning.md) — a different failure mode of the same `EvaluationContext::last_match_end` field (attacker-controlled length prefixes poisoning the anchor). Shares the anchor-state-as-shared-mutable-concern framing. +- [`integration-issues/implementing-variable-width-typekind-variant.md`](../integration-issues/implementing-variable-width-typekind-variant.md) — `bytes_consumed` as the source of truth for advancing the anchor; the precondition that makes a corrupted anchor from a `?`-bypassed restore consequential. +- GOTCHAS.md S3.8 (relative-offset anchor discipline), S3.10 (subroutine base_offset semantics), S14.2 (printf format substitution — relevant to the non-ASCII template fix). +- GitHub: issue #42 (parent), PR #230 (where this landed). From 840ac7dc3b8f8bac6d6c44da6762caf7631024ac Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 22:58:27 -0400 Subject: [PATCH 10/16] fix: Address PR #230 review feedback (34 threads) Cluster A (docs drift for Offset, 13 threads): updated MetaType doc comments in src/parser/ast.rs and src/parser/types.rs; updated docs/src/ast-structures.md, docs/src/parser.md, docs/src/magic-format.md to describe all six MetaType variants as fully evaluated; corrected ParsedMagic visibility in docs/src/parser.md to match pub(crate); refreshed tests/meta_types_integration.rs module doc. 
Cluster B (src/output/format.rs correctness, 8 threads): fixed sign-aware zero-padding (%05d with -7 now yields -0007, not 000-7); fixed %#o prefix to use C convention (leading 0) not Rust 0o; fixed %#X to emit uppercase 0X prefix; added MAX_FORMAT_WIDTH=4096 cap on parsed width to prevent unbounded allocation DoS; added regression tests for each fix. Deferred the file-size split and declined the checked-access suggestion, with rationale (direct indexing is safe-by-construction here). Cluster C (src/evaluator/engine/mod.rs, 4 threads): replaced Vec clone in evaluate_use_rule with Arc clone by changing NameTable to store Arc<[MagicRule]> per entry (added NameTable::sort_subroutines for load-time strength sorting); extracted the duplicated child-evaluation block across Default/Indirect/Offset/Use arms into evaluate_children_or_warn helper (~120 lines deduplicated); replaced the debug_assert! panic in evaluate_rules_with_config with a debug! log to preserve the evaluator-never-panics invariant. Individual threads: added MetaType::Use injection regression test in codegen.rs; removed #[allow(dead_code)] from parser::name_table module; expanded format-substitution note in api-reference.md with escaping requirement; added MetaType examples to api-reference.md; fixed ParsedMagic destructuring example that used non-public name_table field. Test results: 1359/1359 pass (gained 11 tests from new regression guards). just ci-check green. 
Signed-off-by: UncleSp1d3r --- docs/API_REFERENCE.md | 50 +++---- docs/src/api-reference.md | 56 +++++--- docs/src/ast-structures.md | 6 + docs/src/magic-format.md | 15 +- docs/src/parser.md | 24 ++-- src/evaluator/engine/mod.rs | 245 +++++++++++++------------------- src/lib.rs | 6 +- src/output/format.rs | 77 +++++++++- src/parser/ast.rs | 12 +- src/parser/codegen.rs | 30 ++++ src/parser/mod.rs | 1 - src/parser/name_table.rs | 38 +++-- src/parser/types.rs | 13 +- tests/meta_types_integration.rs | 11 +- 14 files changed, 335 insertions(+), 249 deletions(-) diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md index 08de458..fe26924 100644 --- a/docs/API_REFERENCE.md +++ b/docs/API_REFERENCE.md @@ -27,14 +27,14 @@ use libmagic_rs::MagicDatabase; The struct contains internal fields: -| Field (Internal) | Type | Description | -| ----------------- | -------------------- | ---------------------------------------------------------------- | -| `rules` | `Vec` | Top-level magic rules | -| `name_table` | `Arc` | Named subroutine definitions extracted from `name` rules | -| `root_rules` | `Arc<[MagicRule]>` | Shared immutable slice of top-level rules for `indirect` re-entry | -| `config` | `EvaluationConfig` | Evaluation configuration | -| `source_path` | `Option` | Optional path to the source magic file or directory | -| `mime_mapper` | `MimeMapper` | MIME type mapper | +| Field (Internal) | Type | Description | +| ---------------- | ------------------ | ----------------------------------------------------------------- | +| `rules` | `Vec` | Top-level magic rules | +| `name_table` | `Arc` | Named subroutine definitions extracted from `name` rules | +| `root_rules` | `Arc<[MagicRule]>` | Shared immutable slice of top-level rules for `indirect` re-entry | +| `config` | `EvaluationConfig` | Evaluation configuration | +| `source_path` | `Option` | Optional path to the source magic file or directory | +| `mime_mapper` | `MimeMapper` | MIME type mapper | #### Constructor 
Methods @@ -317,9 +317,9 @@ use libmagic_rs::parser::ParsedMagic; Contains the top-level rule list with any `name`-declared subroutines hoisted into a separate name table keyed by identifier. -| Field | Type | Description | -| ------------ | ------------- | ------------------------------------------------------------ | -| `rules` | `Vec` | Top-level rules after `Name` subroutines have been removed | +| Field | Type | Description | +| ------------ | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `rules` | `Vec` | Top-level rules after `Name` subroutines have been removed | | `name_table` | `NameTable` (internal) | Extracted `name` subroutine definitions, consulted by the evaluator when a rule of type `TypeKind::Meta(MetaType::Use(_))` is reached | #### MagicRule @@ -375,7 +375,7 @@ use libmagic_rs::TypeKind; | `Date { endian, utc }` | 32-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. Date values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | | `QDate { endian, utc }` | 64-bit Unix timestamp (signed seconds since epoch). The `endian` parameter specifies byte order (LittleEndian or BigEndian), and `utc` is a boolean indicating whether to format as UTC or local time. QDate values are formatted as "Www Mmm DD HH:MM:SS YYYY" strings to match GNU file output. | | `String { max_length }` | String data | -| `Meta(MetaType)` | Meta-type directives for control flow, conditional execution, and named subroutines. Variants: `Default`, `Clear`, `Name`, `Use`, `Indirect`, `Offset`. See [`MetaType`](#metatype) for details. | +| `Meta(MetaType)` | Meta-type directives for control flow, conditional execution, and named subroutines. 
Variants: `Default`, `Clear`, `Name`, `Use`, `Indirect`, `Offset`. See [`MetaType`](#metatype) for details. | ##### MetaType @@ -385,13 +385,13 @@ Control-flow directive variants carried by `TypeKind::Meta`. use libmagic_rs::parser::ast::MetaType; ``` -| Variant | Description | -| ---------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `Default` | Fires when no sibling at the same indentation level has matched | -| `Clear` | Resets the sibling-matched flag so a later `default` sibling can fire even if an earlier sibling matched | -| `Name(id)` | Declares a named subroutine with identifier `id` that can be invoked later via `Use` | -| `Use(id)` | Invokes a named subroutine previously declared via `Name` | -| `Indirect` | Re-applies the entire magic database at the resolved offset | +| Variant | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `Default` | Fires when no sibling at the same indentation level has matched | +| `Clear` | Resets the sibling-matched flag so a later `default` sibling can fire even if an earlier sibling matched | +| `Name(id)` | Declares a named subroutine with identifier `id` that can be invoked later via `Use` | +| `Use(id)` | Invokes a named subroutine previously declared via `Name` | +| `Indirect` | Re-applies the entire magic database at the resolved offset | | `Offset` | Reports the current file offset as `Value::Uint(pos)` rather than reading a typed value from the buffer; operator must be `AnyValue` (`x`) | ##### 64-bit Integer Types @@ -527,13 +527,13 @@ Result from internal evaluation. 
use libmagic_rs::evaluator::MatchResult; ``` -| Field | Type | Description | -| ------------ | -------- | ----------------- | +| Field | Type | Description | +| ------------ | -------- | --------------------------------------------------------------------------------------------------------------- | | `message` | `String` | Match description (printf-style format specifiers like `%d`, `%x`, `%s` are substituted with the matched value) | -| `offset` | `usize` | Match offset | -| `level` | `u32` | Rule level | -| `value` | `Value` | Matched value | -| `confidence` | `f64` | Confidence score | +| `offset` | `usize` | Match offset | +| `level` | `u32` | Rule level | +| `value` | `Value` | Matched value | +| `confidence` | `f64` | Confidence score | --- diff --git a/docs/src/api-reference.md b/docs/src/api-reference.md index 5ae9480..1a7fd02 100644 --- a/docs/src/api-reference.md +++ b/docs/src/api-reference.md @@ -240,14 +240,35 @@ Control-flow directives carried by `TypeKind::Meta`. use libmagic_rs::MetaType; ``` -| Variant | Description | -| ----------------- | ----------------------------------------------------------------------------------------------------- | -| `Default` | Fires when no sibling at the same indentation level matched at the current offset | -| `Clear` | Resets the sibling-matched flag so a later `default` sibling can fire even if an earlier sibling matched | -| `Name(String)` | Declares a named subroutine that can be invoked later via `Use` | -| `Use(String)` | Invokes a named subroutine previously declared via `Name` | -| `Indirect` | Re-applies the entire magic database at the resolved offset | -| `Offset` | Reports the current file offset as `Value::Uint(position)` rather than reading a typed value | +| Variant | Description | +| -------------- | -------------------------------------------------------------------------------------------------------- | +| `Default` | Fires when no sibling at the same indentation level matched at the current 
offset | +| `Clear` | Resets the sibling-matched flag so a later `default` sibling can fire even if an earlier sibling matched | +| `Name(String)` | Declares a named subroutine that can be invoked later via `Use` | +| `Use(String)` | Invokes a named subroutine previously declared via `Name` | +| `Indirect` | Re-applies the entire magic database at the resolved offset | +| `Offset` | Reports the current file offset as `Value::Uint(position)` rather than reading a typed value | + +#### Examples + +```rust +use libmagic_rs::{TypeKind, parser::ast::MetaType}; + +// A default fallback rule (fires when no sibling matched) +let default_type = TypeKind::Meta(MetaType::Default); + +// Define a named subroutine +let name_type = TypeKind::Meta(MetaType::Name("riff_header".to_string())); + +// Invoke that subroutine at a given offset +let use_type = TypeKind::Meta(MetaType::Use("riff_header".to_string())); + +// Re-enter the root rule set at a resolved offset (ZIP-in-DOCX etc.) +let indirect_type = TypeKind::Meta(MetaType::Indirect); + +// Emit the current file offset as a match value for printf substitution +let offset_type = TypeKind::Meta(MetaType::Offset); +``` ### Operator @@ -430,14 +451,14 @@ Result from internal evaluation. 
use libmagic_rs::evaluator::MatchResult; ``` -| Field | Type | Description | -| ------------ | ---------- | ----------------------------------------- | -| `message` | `String` | Match description (printf-style format specifiers like `%d`, `%x`, `%s` are substituted at output time) | -| `offset` | `usize` | Match offset | -| `level` | `u32` | Rule level | -| `value` | `Value` | Matched value | -| `type_kind` | `TypeKind` | Type used to read value (added in v0.5.0) | -| `confidence` | `f64` | Confidence score | +| Field | Type | Description | +| ------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `message` | `String` | Match description. Printf-style format specifiers (`%d`, `%i`, `%u`, `%x`, `%X`, `%o`, `%s`, `%c`, plus width/padding modifiers) are substituted with the rule's read value at output time. **Literal `%` must be escaped as `%%`** -- unescaped `%` is interpreted as a format specifier (breaking change since v0.5.0). | +| `offset` | `usize` | Match offset | +| `level` | `u32` | Rule level | +| `value` | `Value` | Matched value | +| `type_kind` | `TypeKind` | Type used to read value (added in v0.5.0) | +| `confidence` | `f64` | Confidence score | ## Output Module @@ -544,7 +565,8 @@ pub use error::{EvaluationError, LibmagicError, ParseError}; ### Breaking Changes (post-0.5.0) -- Parser functions (`parse_text_magic_file`, `load_magic_file`, `load_magic_directory`) now return `ParsedMagic { rules, name_table }` instead of `Vec`. 
Code must destructure: `let ParsedMagic { rules, name_table } = parse_text_magic_file(...)?;` +- Parser functions (`parse_text_magic_file`, `load_magic_file`, `load_magic_directory`) now return `ParsedMagic { rules, name_table }` instead of `Vec`. External consumers can only access the public `rules` field — `name_table` is `pub(crate)` and managed internally by `MagicDatabase`. Typical usage: `let parsed = parse_text_magic_file(&source)?; /* use parsed.rules */`. The library wires `name_table` through `MagicDatabase::load_from_file` automatically; direct access is not required (or supported) for external code. +- Rule messages are now rendered through printf-style format substitution: specifiers like `%d`, `%x`, `%02x`, `%s`, `%lld` are replaced with the rule's read value at output time. **Literal `%` in rule messages must be escaped as `%%`.** Messages that were previously emitted verbatim with bare `%` characters will now be interpreted as format specifiers — this is a visible behavior change for existing magic files that used `%` for non-formatting purposes. ### Breaking Changes in v0.2.0 diff --git a/docs/src/ast-structures.md b/docs/src/ast-structures.md index 916b66e..5897d4d 100644 --- a/docs/src/ast-structures.md +++ b/docs/src/ast-structures.md @@ -466,6 +466,9 @@ pub enum MetaType { Use(String), /// `indirect` — re-applies the full rule database at the resolved offset. Indirect, + /// `offset` — emits the resolved file position as `Value::Uint` for + /// printf-style format substitution (e.g. `%lld`). 
+ Offset, } ``` @@ -488,6 +491,9 @@ let use_rule = TypeKind::Meta(MetaType::Use("part2".to_string())); // Re-entry into root rules let indirect_rule = TypeKind::Meta(MetaType::Indirect); + +// Report the resolved file offset for format substitution +let offset_rule = TypeKind::Meta(MetaType::Offset); ``` **Parse-time Name Extraction:** diff --git a/docs/src/magic-format.md b/docs/src/magic-format.md index e60ed2d..2508b19 100644 --- a/docs/src/magic-format.md +++ b/docs/src/magic-format.md @@ -518,13 +518,14 @@ Output: `GIF image data, version 89a` Meta-types are pseudo-types that do not read bytes from the buffer. Instead, they control the evaluation flow: defining named subroutines, invoking them, providing fallbacks when no sibling matched, resetting per-level match state, or re-applying the entire rule database at a resolved offset. -| Keyword | Syntax | Description | -| ----------- | ---------------------- | ------------------------------------------------------------------ | -| `name ` | `0 name part2` | Defines a named subroutine block; children are the subroutine body | -| `use ` | `>0 use part2` | Invokes a named subroutine at the resolved offset | -| `default` | `0 default x Fallback` | Fires only when no sibling at the same level has matched | -| `clear` | `0 clear` | Resets the per-level sibling-matched flag | -| `indirect` | `8 indirect x` | Re-applies the full rule database at the resolved offset | +| Keyword | Syntax | Description | +| ----------- | --------------------------- | --------------------------------------------------------------------------------- | +| `name ` | `0 name part2` | Defines a named subroutine block; children are the subroutine body | +| `use ` | `>0 use part2` | Invokes a named subroutine at the resolved offset | +| `default` | `0 default x Fallback` | Fires only when no sibling at the same level has matched | +| `clear` | `0 clear` | Resets the per-level sibling-matched flag | +| `indirect` | `8 indirect x` | Re-applies the 
full rule database at the resolved offset | +| `offset` | `0 offset x at_offset %lld` | Emits the resolved file position as a `Value::Uint` for printf-style substitution | ### `name` and `use` — Named Subroutines diff --git a/docs/src/parser.md b/docs/src/parser.md index 1160869..5e4bd22 100644 --- a/docs/src/parser.md +++ b/docs/src/parser.md @@ -429,19 +429,20 @@ parse_type_and_operator("search/0") - ✅ Bare `search` and `search/0` rejected at parse time - ✅ Binary-safe literal matching via `memchr::memmem::find` -### Meta-type Directives (`name`, `use`, `default`, `clear`, `indirect`) +### Meta-type Directives (`name`, `use`, `default`, `clear`, `indirect`, `offset`) -The parser supports five meta-type directives that represent control-flow rather than buffer reads. They all parse into the `TypeKind::Meta(MetaType)` AST variant and carry no endianness or width. +The parser supports six meta-type directives that represent control-flow rather than buffer reads. They all parse into the `TypeKind::Meta(MetaType)` AST variant and carry no endianness or width. 
**Type Keywords and `MetaType` Variants:** -| Keyword | `MetaType` Variant | Role | -| ----------- | ------------------------ | -------------------------------------------------------------- | -| `name ` | `MetaType::Name(String)` | Declares a named subroutine; children form the subroutine body | -| `use ` | `MetaType::Use(String)` | Invokes a named subroutine at the resolved offset | -| `default` | `MetaType::Default` | Fires only when no sibling at the same level has matched | -| `clear` | `MetaType::Clear` | Resets the per-level sibling-matched flag | -| `indirect` | `MetaType::Indirect` | Re-applies the root rule set at the resolved offset | +| Keyword | `MetaType` Variant | Role | +| ----------- | ------------------------ | ----------------------------------------------------------------------------- | +| `name ` | `MetaType::Name(String)` | Declares a named subroutine; children form the subroutine body | +| `use ` | `MetaType::Use(String)` | Invokes a named subroutine at the resolved offset | +| `default` | `MetaType::Default` | Fires only when no sibling at the same level has matched | +| `clear` | `MetaType::Clear` | Resets the per-level sibling-matched flag | +| `indirect` | `MetaType::Indirect` | Re-applies the root rule set at the resolved offset | +| `offset` | `MetaType::Offset` | Emits the resolved file position as `Value::Uint` for printf-style formatting | Meta-types have `bit_width() == None` because they consume zero on-disk bytes. @@ -452,7 +453,7 @@ Meta-types have `bit_width() == None` because they consume zero on-disk bytes. 
```rust pub struct ParsedMagic { pub rules: Vec, - pub name_table: NameTable, + pub(crate) name_table: NameTable, } ``` @@ -480,7 +481,7 @@ Top-level `name ` rules are hoisted *out* of `ParsedMagic::rules` by `parser **Features:** -- ✅ All five keywords recognized by `parse_type_keyword` + `type_keyword_to_kind` +- ✅ All six keywords recognized by `parse_type_keyword` + `type_keyword_to_kind` - ✅ Round-trip through `serialize_type_kind` in `codegen.rs` - ✅ Top-level `name` extraction into `NameTable` - ✅ Defensive scrubbing of misplaced nested `name` rules @@ -597,7 +598,6 @@ match detect_format(path)? { ### Not Yet Implemented - **Binary .mgc Format**: Compiled magic database format -- **`offset` pseudo-type**: The `offset` keyword used in `searchbug.magic` for `at_offset %lld` output ### Planned Enhancements diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 90f4090..af5ee07 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -291,10 +291,10 @@ fn evaluate_use_rule( warn!("use directive references unknown name '{name}'"); return Ok((None, Vec::new())); }; - - // Clone the Arc reference to detach from the immutable borrow of - // `context`, so we can mutably borrow the context below. - let subroutine_rules: Vec = subroutine_rules.clone(); + // `NameTable::get` returns an `Arc<[MagicRule]>`, so this clone is a + // reference-count increment rather than a deep copy of the rule tree. + // The Arc is cloned here to release the immutable borrow of `context` + // (via `env`) before we mutably borrow the context below. // Resolve the use-site offset under the *caller's* base, not the // subroutine's -- the use rule itself is in the caller's scope. @@ -394,6 +394,75 @@ fn evaluate_value_rule( Ok((matched, read_value)) } +/// Evaluate a rule's children under the standard recursion-guard/graceful-skip discipline. 
+/// +/// This helper centralises the `RecursionGuard` + `evaluate_rules` + error-dispatch +/// pattern that is identical across the `Default`, `Indirect`, `Offset`, and `Use` +/// meta-type arms in [`evaluate_rules`]. Extracting it prevents the four copies +/// from drifting apart during future maintenance. +/// +/// # Behaviour +/// +/// * If `rule.children` is empty the function is a no-op (returns `Ok(())`). +/// * Child matches are appended to `matches` in document order. +/// * `LibmagicError::Timeout` and `LibmagicError::EvaluationError(RecursionLimitExceeded)` +/// propagate immediately as `Err` so the caller can bail out. +/// * Data-dependent errors (`BufferOverrun`, `InvalidOffset`, +/// `TypeReadError::BufferOverrun`, `TypeReadError::InvalidPStringLength`, +/// `IoError`) are logged at `warn!` and swallowed; the parent match +/// already in `matches` is left intact. This mirrors the defensive +/// comment in each arm: the inner `evaluate_rules` already catches and +/// logs individual child failures, so this arm only fires if that +/// strategy changes. +/// +/// # Arguments +/// +/// * `rule` – The parent rule whose children will be evaluated. +/// * `rule_kind` – A short label for the rule kind used in the `warn!` +/// message (e.g. `"default"`, `"indirect"`, `"offset"`, `"use"`). +/// * `buffer` – The file buffer passed to the recursive call. +/// * `context` – Mutable evaluation context; the recursion depth is +/// incremented on entry and decremented on drop via [`RecursionGuard`]. +/// * `matches` – Output vector; child matches are appended here. 
+fn evaluate_children_or_warn( + rule: &MagicRule, + rule_kind: &str, + buffer: &[u8], + context: &mut EvaluationContext, + matches: &mut Vec, +) -> Result<(), LibmagicError> { + if rule.children.is_empty() { + return Ok(()); + } + let mut guard = RecursionGuard::enter(context)?; + match evaluate_rules(&rule.children, buffer, guard.context()) { + Ok(child_matches) => { + matches.extend(child_matches); + } + Err(LibmagicError::Timeout { timeout_ms }) => { + return Err(LibmagicError::Timeout { timeout_ms }); + } + Err( + e @ (LibmagicError::EvaluationError( + crate::error::EvaluationError::BufferOverrun { .. } + | crate::error::EvaluationError::InvalidOffset { .. } + | crate::error::EvaluationError::TypeReadError( + crate::evaluator::types::TypeReadError::BufferOverrun { .. } + | crate::evaluator::types::TypeReadError::InvalidPStringLength { .. }, + ), + ) + | LibmagicError::IoError(_)), + ) => { + warn!( + "Discarding child evaluation under {} rule '{}' due to unexpected error: {} -- parent match is still emitted", + rule_kind, rule.message, e + ); + } + Err(e) => return Err(e), + } + Ok(()) +} + /// Evaluate a list of magic rules against a file buffer with hierarchical processing /// /// This function implements the core hierarchical rule evaluation algorithm with graceful @@ -582,38 +651,7 @@ pub fn evaluate_rules( // `default` is treated as a successful match at this // level, so its children are evaluated under the same // recursion-guard pattern as every other successful rule. - if !rule.children.is_empty() { - let mut guard = RecursionGuard::enter(context)?; - match evaluate_rules(&rule.children, buffer, guard.context()) { - Ok(child_matches) => { - matches.extend(child_matches); - } - Err(LibmagicError::Timeout { timeout_ms }) => { - return Err(LibmagicError::Timeout { timeout_ms }); - } - Err( - e @ (LibmagicError::EvaluationError( - crate::error::EvaluationError::BufferOverrun { .. } - | crate::error::EvaluationError::InvalidOffset { .. 
} - | crate::error::EvaluationError::TypeReadError( - crate::evaluator::types::TypeReadError::BufferOverrun { - .. - } - | crate::evaluator::types::TypeReadError::InvalidPStringLength { - .. - }, - ), - ) - | LibmagicError::IoError(_)), - ) => { - warn!( - "Discarding child evaluation under default rule '{}' due to unexpected error: {} -- default match is still emitted", - rule.message, e - ); - } - Err(e) => return Err(e), - } - } + evaluate_children_or_warn(rule, "default", buffer, context, &mut matches)?; sibling_matched = true; @@ -734,36 +772,7 @@ pub fn evaluate_rules( // Evaluate the indirect rule's own children under the same // recursion-guard pattern used by every other successful rule. - if !rule.children.is_empty() { - let mut guard = RecursionGuard::enter(context)?; - match evaluate_rules(&rule.children, buffer, guard.context()) { - Ok(child_matches) => { - matches.extend(child_matches); - } - Err(LibmagicError::Timeout { timeout_ms }) => { - return Err(LibmagicError::Timeout { timeout_ms }); - } - Err( - e @ (LibmagicError::EvaluationError( - crate::error::EvaluationError::BufferOverrun { .. } - | crate::error::EvaluationError::InvalidOffset { .. } - | crate::error::EvaluationError::TypeReadError( - crate::evaluator::types::TypeReadError::BufferOverrun { .. } - | crate::evaluator::types::TypeReadError::InvalidPStringLength { - .. - }, - ), - ) - | LibmagicError::IoError(_)), - ) => { - warn!( - "Discarding child evaluation under indirect rule '{}' due to unexpected error: {} -- indirect matches are still emitted", - rule.message, e - ); - } - Err(e) => return Err(e), - } - } + evaluate_children_or_warn(rule, "indirect", buffer, context, &mut matches)?; if matches.len() > matches_before && context.should_stop_at_first_match() { break; @@ -835,36 +844,7 @@ pub fn evaluate_rules( // Evaluate children under the recursion-guard pattern used // by every other successful rule. 
- if !rule.children.is_empty() { - let mut guard = RecursionGuard::enter(context)?; - match evaluate_rules(&rule.children, buffer, guard.context()) { - Ok(child_matches) => { - matches.extend(child_matches); - } - Err(LibmagicError::Timeout { timeout_ms }) => { - return Err(LibmagicError::Timeout { timeout_ms }); - } - Err( - e @ (LibmagicError::EvaluationError( - crate::error::EvaluationError::BufferOverrun { .. } - | crate::error::EvaluationError::InvalidOffset { .. } - | crate::error::EvaluationError::TypeReadError( - crate::evaluator::types::TypeReadError::BufferOverrun { .. } - | crate::evaluator::types::TypeReadError::InvalidPStringLength { - .. - }, - ), - ) - | LibmagicError::IoError(_)), - ) => { - warn!( - "Discarding child evaluation under offset rule '{}' due to unexpected error: {} -- offset match is still emitted", - rule.message, e - ); - } - Err(e) => return Err(e), - } - } + evaluate_children_or_warn(rule, "offset", buffer, context, &mut matches)?; if matches.len() > matches_before && context.should_stop_at_first_match() { break; @@ -923,41 +903,8 @@ pub fn evaluate_rules( // document order. The recursion guard mirrors the non-`Use` // path so a `use`-site chain cannot blow past the configured // recursion limit. - if use_resolved && !rule.children.is_empty() { - let mut guard = RecursionGuard::enter(context)?; - match evaluate_rules(&rule.children, buffer, guard.context()) { - Ok(child_matches) => { - matches.extend(child_matches); - } - Err(LibmagicError::Timeout { timeout_ms }) => { - return Err(LibmagicError::Timeout { timeout_ms }); - } - Err( - e @ (LibmagicError::EvaluationError( - crate::error::EvaluationError::BufferOverrun { .. } - | crate::error::EvaluationError::InvalidOffset { .. } - | crate::error::EvaluationError::TypeReadError( - crate::evaluator::types::TypeReadError::BufferOverrun { .. } - | crate::evaluator::types::TypeReadError::InvalidPStringLength { - .. 
- }, - ), - ) - | LibmagicError::IoError(_)), - ) => { - // Same defensive rationale as the main rule path: - // individual child failures are already handled - // inside the recursive `evaluate_rules`, so this - // arm only fires if that error-handling strategy - // changes. Logged at warn! so the asymmetry is - // visible. - warn!( - "Discarding child evaluation under use rule '{name}' due to unexpected error: {e} -- subroutine matches are still emitted; investigate the recursive evaluate_rules error-handling path" - ); - } - Err(e) => return Err(e), - } - // `guard` drops here, decrementing the recursion depth. + if use_resolved { + evaluate_children_or_warn(rule, "use", buffer, context, &mut matches)?; } // A successful `use` site is treated as a sibling match for @@ -1160,19 +1107,21 @@ pub fn evaluate_rules_with_config( // are rejected at the API boundary rather than triggering subtle // failures during evaluation. config.validate()?; - // Debug-only guard: `evaluate_rules_with_config` builds a context + // Diagnostic guard: `evaluate_rules_with_config` builds a context // without an attached `RuleEnvironment`, which means any // `MetaType::Indirect` rule reached during evaluation is silently - // no-op'd at runtime. That is the intentional release behavior - // (matching the `Use`-without-env contract for low-level callers), - // but in debug builds we surface the misconfiguration eagerly so - // consumer tests catch env-less `indirect` usage before it ships. - // Release behavior is unchanged. - debug_assert!( - !contains_indirect_rule(rules), - "{}", - crate::error::EvaluationError::indirect_without_environment() - ); + // no-op'd at runtime. That is the intentional behavior for low-level + // callers (matching the `Use`-without-env contract), but we log the + // misconfiguration at `debug!` level so consumer tests can detect + // env-less `indirect` usage. 
Using `debug_assert!` would panic in test + // builds and break the "evaluator never panics" invariant documented in + // GOTCHAS S2.4 -- a misconfigured caller should get a no-op, not a crash. + if contains_indirect_rule(rules) { + debug!( + "{}", + crate::error::EvaluationError::indirect_without_environment() + ); + } // Clear the thread-local regex compile cache so it is bounded to // the lifetime of a single top-level evaluation call. Cache // entries from a previous rule set would otherwise persist on the @@ -1186,16 +1135,12 @@ pub fn evaluate_rules_with_config( /// Recursively walk `rules` (including children) looking for any /// [`MetaType::Indirect`] directive. /// -/// Used by the debug-only guard in [`evaluate_rules_with_config`]: the +/// Used by the diagnostic guard in [`evaluate_rules_with_config`]: the /// low-level `_with_config` entry point builds a context without a /// [`crate::evaluator::RuleEnvironment`], so any `indirect` rule is -/// silently no-op'd at runtime. Firing `debug_assert!` here makes that -/// misconfiguration loud in tests without affecting release behavior. -/// -/// Intentionally not gated on `cfg(debug_assertions)` so release builds -/// still compile the `debug_assert!` call site (the macro evaluates its -/// arguments in both modes for type-checking, even though the check -/// itself is stripped in release). +/// silently no-op'd at runtime. The check logs the misconfiguration at +/// `debug!` level so consumer tests can detect it without panicking (see +/// GOTCHAS S2.4 for why `debug_assert!` would be wrong here). 
fn contains_indirect_rule(rules: &[MagicRule]) -> bool { rules.iter().any(|rule| { matches!(rule.typ, TypeKind::Meta(MetaType::Indirect)) diff --git a/src/lib.rs b/src/lib.rs index 7abd98c..746b192 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -320,9 +320,9 @@ impl MagicDatabase { // Each named subroutine body must be sorted by the same strength // ordering so evaluation of a `use` site is deterministic and // matches the ordering applied to top-level rules. - for subroutine in name_table.values_mut() { - crate::evaluator::strength::sort_rules_by_strength_recursive(subroutine); - } + name_table.sort_subroutines(|rules| { + crate::evaluator::strength::sort_rules_by_strength_recursive(rules); + }); let root_rules: std::sync::Arc<[MagicRule]> = std::sync::Arc::from(rules.as_slice()); Ok(Self { diff --git a/src/output/format.rs b/src/output/format.rs index 6b89c83..69f36b3 100644 --- a/src/output/format.rs +++ b/src/output/format.rs @@ -164,6 +164,13 @@ struct Spec { end: usize, } +/// Maximum width value accepted from a format specifier. +/// +/// Caps the field width to prevent crafted magic rules with enormous widths +/// (e.g., `%999999999d`) from driving unbounded `repeat_n` allocations in the +/// padding helpers. 4096 is generous for any real magic-corpus usage. +const MAX_FORMAT_WIDTH: usize = 4096; + /// Parse a format specifier starting at `start` (the first byte after the /// leading `%`). Returns `None` if the sequence does not end in a /// recognized conversion character. @@ -197,11 +204,15 @@ fn parse_spec(bytes: &[u8], start: usize) -> Option { } } - // Width (decimal digits). + // Width (decimal digits). Capped at MAX_FORMAT_WIDTH to prevent + // unbounded allocations from crafted format strings. 
let mut width: usize = 0; while i < bytes.len() && bytes[i].is_ascii_digit() { let digit = (bytes[i] - b'0') as usize; - width = width.saturating_mul(10).saturating_add(digit); + width = width + .saturating_mul(10) + .saturating_add(digit) + .min(MAX_FORMAT_WIDTH); i += 1; } @@ -277,15 +288,16 @@ fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option { let n = coerce_to_u64_masked(value, type_kind)?; Some(render_prefixed_int( &format!("{n:X}"), - if spec.alt_form { "0x" } else { "" }, + if spec.alt_form { "0X" } else { "" }, spec, )) } Conv::Octal => { let n = coerce_to_u64_masked(value, type_kind)?; + // C printf uses a single "0" prefix for %#o (not Rust's "0o"). Some(render_prefixed_int( &format!("{n:o}"), - if spec.alt_form { "0o" } else { "" }, + if spec.alt_form { "0" } else { "" }, spec, )) } @@ -398,10 +410,27 @@ fn render_prefixed_int(digits: &str, prefix: &str, spec: &Spec) -> String { } /// Apply width and padding to an already-rendered numeric body. +/// +/// For zero-padded right-aligned formatting, a leading `-` sign is kept at +/// the front while zeros are inserted between the sign and the magnitude +/// digits -- matching C printf semantics (e.g., `%05d` with `-7` → `-0007`, +/// not `000-7`). fn pad_numeric(body: &str, spec: &Spec) -> String { if body.len() >= spec.width { return body.to_string(); } + // C printf sign-aware zero-padding: sign goes before the zeros. 
+ if spec.zero_pad + && !spec.left_align + && let Some(digits) = body.strip_prefix('-') + { + let needed = spec.width.saturating_sub(1 + digits.len()); + if needed == 0 { + return body.to_string(); + } + let zeros: String = std::iter::repeat_n('0', needed).collect(); + return format!("-{zeros}{digits}"); + } let pad = spec.width - body.len(); let pad_char = if spec.zero_pad && !spec.left_align { '0' @@ -493,9 +522,14 @@ mod tests { let out = format_magic_message("%-#6x|", &Value::Uint(0xab), &byte_t()); assert_eq!(out, "0xab |"); - // %#08o: zero-pad inserts between "0o" prefix and digits. + // %#08o: zero-pad inserts between C-style "0" prefix and digits. + // C printf uses a single "0" prefix for %#o (not Rust's "0o"). let out = format_magic_message("%#08o", &Value::Uint(8), &byte_t()); - assert_eq!(out, "0o000010"); + assert_eq!(out, "00000010"); + + // %#X: uppercase alt-form uses "0X" prefix to match the specifier case. + let out = format_magic_message("%#X", &Value::Uint(0xab), &byte_t()); + assert_eq!(out, "0XAB"); } #[test] @@ -520,8 +554,9 @@ mod tests { fn test_octal_substitution() { let out = format_magic_message("%o", &Value::Uint(8), &byte_t()); assert_eq!(out, "10"); + // C printf %#o uses a single "0" prefix, not Rust's "0o". let out = format_magic_message("%#o", &Value::Uint(8), &byte_t()); - assert_eq!(out, "0o10"); + assert_eq!(out, "010"); } #[test] @@ -568,15 +603,43 @@ mod tests { #[test] fn test_width_padding() { + // Zero-padded width with negative value: sign must precede zeros. + // Regression guard for sign-aware zero-padding (C printf semantics). + let out = format_magic_message("%05d", &Value::Int(-7), &long_t()); + assert_eq!(out, "-0007"); + let out = format_magic_message("%06d", &Value::Int(-42), &long_t()); + assert_eq!(out, "-00042"); // Zero-padded width. let out = format_magic_message("%05d", &Value::Int(42), &long_t()); assert_eq!(out, "00042"); // Space-padded width. 
let out = format_magic_message("%5d", &Value::Int(42), &long_t()); assert_eq!(out, " 42"); + // Negative with space-padding: sign stays in the body, spaces lead. + let out = format_magic_message("%5d", &Value::Int(-7), &long_t()); + assert_eq!(out, " -7"); // Left-aligned (zero flag ignored when `-` is set). let out = format_magic_message("%-5d|", &Value::Int(42), &long_t()); assert_eq!(out, "42 |"); + // Left-aligned negative: body left-aligned, spaces trail. + let out = format_magic_message("%-6d|", &Value::Int(-7), &long_t()); + assert_eq!(out, "-7 |"); + } + + #[test] + fn test_width_cap_prevents_large_allocation() { + // A width larger than MAX_FORMAT_WIDTH must be silently clamped. + // The output should be valid (the value rendered, possibly padded) + // rather than triggering a huge allocation. + let huge_width = format!("%{}d", usize::MAX); + let out = format_magic_message(&huge_width, &Value::Int(1), &long_t()); + // After clamping, the output is at most MAX_FORMAT_WIDTH+1 chars. + assert!( + out.len() <= MAX_FORMAT_WIDTH + 1, + "output too long: {}", + out.len() + ); + assert!(out.ends_with('1'), "rendered value must appear: {out:?}"); } // ---- edge cases -------------------------------------------------- diff --git a/src/parser/ast.rs b/src/parser/ast.rs index f10a384..55a5a45 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -168,11 +168,13 @@ pub enum OffsetSpec { /// Control-flow directive carried by [`TypeKind::Meta`]. /// /// These are not value-reading types -- they correspond to magic(5) -/// control-flow keywords (`default`, `clear`, `name`, `use`, `indirect`) -/// that modify how a rule set is traversed rather than which bytes are -/// read from the buffer. In this phase they are parsed and preserved in -/// the AST but evaluated as silent no-ops; subsequent phases will wire -/// each variant into the evaluator. 
+/// control-flow keywords (`default`, `clear`, `name`, `use`, `indirect`, +/// `offset`) that modify how a rule set is traversed rather than reading +/// bytes from the buffer. All six variants are fully evaluated by the +/// engine: `default`/`clear` manage per-level sibling-matched state; +/// `name`/`use` implement subroutine dispatch; `indirect` re-applies the +/// root rule database at a resolved offset; and `offset` emits the +/// current file position as `Value::Uint` for printf-style formatting. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[non_exhaustive] pub enum MetaType { diff --git a/src/parser/codegen.rs b/src/parser/codegen.rs index 963c978..5e22a12 100644 --- a/src/parser/codegen.rs +++ b/src/parser/codegen.rs @@ -550,4 +550,34 @@ mod tests { "escaped quote missing from serialized MetaType::Name identifier:\n{generated}" ); } + + /// Regression guard: `MetaType::Use` uses a separate match arm from + /// `MetaType::Name` in `serialize_type_kind`, so it needs its own + /// injection test to ensure the identifier is string-literal escaped + /// and cannot leak Rust tokens into the generated `builtin_rules.rs`. 
+ #[test] + fn test_serialize_meta_use_escapes_injection() { + let malicious = r#""; panic!("pwned-from-use"); let _ = ""#; + let rule = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Use(malicious.to_string())), + op: Operator::AnyValue, + value: Value::Uint(0), + message: "meta use rule".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + + let generated = serialize_magic_rule(&rule, 0); + + assert!( + !generated.contains(r#"panic!("pwned-from-use")"#), + "injected Rust tokens leaked through MetaType::Use identifier:\n{generated}" + ); + assert!( + generated.contains(r#"\""#), + "escaped quote missing from serialized MetaType::Use identifier:\n{generated}" + ); + } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 4fa0ac9..ec350c4 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -142,7 +142,6 @@ mod format; pub(crate) mod grammar; mod hierarchy; mod loader; -#[allow(dead_code)] pub(crate) mod name_table; pub(crate) mod preprocessing; pub mod types; diff --git a/src/parser/name_table.rs b/src/parser/name_table.rs index 2c9e562..d2392b5 100644 --- a/src/parser/name_table.rs +++ b/src/parser/name_table.rs @@ -10,6 +10,7 @@ //! without re-walking the AST. use std::collections::HashMap; +use std::sync::Arc; use log::warn; @@ -21,9 +22,14 @@ use crate::parser::ast::{MagicRule, MetaType, TypeKind}; /// rule list. The evaluator consults this table when it encounters a /// `TypeKind::Meta(MetaType::Use(name))` rule to retrieve the rules that /// should be evaluated as if inlined at the `use` site. +/// +/// Subroutine bodies are stored as `Arc<[MagicRule]>` so the evaluator can +/// clone the `Arc` (a reference-count increment) rather than deep-cloning the +/// full rule vector on every `use` dispatch. This is important for large magic +/// corpora where the same subroutine may be invoked many times per evaluation. 
#[derive(Debug, Default, Clone)] pub(crate) struct NameTable { - inner: HashMap>, + inner: HashMap>, } impl NameTable { @@ -36,19 +42,29 @@ impl NameTable { } /// Look up a subroutine's rule list by name. + /// + /// Returns an `Arc` reference so callers can clone it cheaply (reference + /// count increment) and then release the immutable borrow of `self` + /// before mutably borrowing any surrounding context. #[must_use] - pub(crate) fn get(&self, name: &str) -> Option<&Vec> { - self.inner.get(name) + pub(crate) fn get(&self, name: &str) -> Option> { + self.inner.get(name).cloned() } - /// Mutable access to the underlying map. + /// Sort all subroutine rule bodies in place by the provided comparator. /// - /// Used by `MagicDatabase` after load to sort each subroutine's rules - /// by strength without allocating a new map. - pub(crate) fn values_mut( - &mut self, - ) -> std::collections::hash_map::ValuesMut<'_, String, Vec> { - self.inner.values_mut() + /// Used by `MagicDatabase` after load to apply strength-based ordering to + /// each subroutine body, matching the ordering applied to top-level rules. + /// Because subroutine bodies are stored as `Arc<[MagicRule]>` (immutable + /// slices), sorting requires materializing a `Vec`, sorting it, and + /// rebuilding the `Arc`. This is a one-time cost per load, not per + /// evaluation. + pub(crate) fn sort_subroutines(&mut self, mut sort_fn: impl FnMut(&mut Vec)) { + for arc in self.inner.values_mut() { + let mut vec: Vec = arc.iter().cloned().collect(); + sort_fn(&mut vec); + *arc = Arc::from(vec); + } } /// Merge another name table into this one. @@ -91,7 +107,7 @@ pub(crate) fn extract_name_table(rules: Vec) -> (Vec, Name // Recursively scrub nested Name rules from the subroutine's // children (shouldn't appear in practice, but be defensive). 
let children = scrub_nested_names(rule.children, rule.level); - table.inner.insert(name.clone(), children); + table.inner.insert(name.clone(), Arc::from(children)); } else { let scrubbed_children = scrub_nested_names(rule.children, rule.level + 1); kept.push(MagicRule { diff --git a/src/parser/types.rs b/src/parser/types.rs index 3040b94..b07cad0 100644 --- a/src/parser/types.rs +++ b/src/parser/types.rs @@ -128,9 +128,10 @@ pub fn parse_type_keyword(input: &str) -> IResult<&str, &str> { // none of these collide with other supported keywords. // // `offset` is recognized here so the parser can accept magic files - // that use it (e.g. `searchbug.magic`). In this phase it is - // evaluated as a silent no-op via `TypeKind::Meta(MetaType::Offset)`; - // full offset-reporting semantics are deferred. + // that use it (e.g. `searchbug.magic`). It maps to + // `TypeKind::Meta(MetaType::Offset)` and is fully evaluated by the + // engine: the resolved offset is emitted as `Value::Uint(position)` + // and participates in printf-style format substitution. alt(( tag("indirect"), tag("default"), @@ -220,9 +221,9 @@ pub fn type_keyword_to_kind(type_name: &str) -> Result, Unknown } // Meta / control-flow directives with no trailing operand are fully - // specified by the keyword alone. `offset` is included here because - // parser-only support for it lands it in the AST as a silent no-op - // during this phase; full offset-reporting semantics are deferred. + // specified by the keyword alone. `offset` maps to + // `MetaType::Offset` which the engine evaluates by emitting the + // resolved file position as `Value::Uint` for format substitution. 
match type_name { "default" => return Ok(Some(TypeKind::Meta(MetaType::Default))), "clear" => return Ok(Some(TypeKind::Meta(MetaType::Clear))), diff --git a/tests/meta_types_integration.rs b/tests/meta_types_integration.rs index c7867ab..befe35a 100644 --- a/tests/meta_types_integration.rs +++ b/tests/meta_types_integration.rs @@ -1,14 +1,15 @@ // Copyright (c) 2025-2026 the libmagic-rs contributors // SPDX-License-Identifier: Apache-2.0 -//! End-to-end smoke tests for meta-type directives (name/use/default/clear/indirect). +//! End-to-end smoke tests for meta-type directives +//! (name/use/default/clear/indirect/offset). //! //! Uses the canonical GNU `file` `searchbug.magic` fixture, which exercises //! the `name`/`use` subroutine machinery together with `offset`, `search/N`, -//! and relative-offset (`&N`) semantics. These tests verify the acceptance -//! surface shipped in this phase and are intentionally loose about the -//! exact result string -- the full byte-for-byte match is deferred to a -//! later phase that wires up the `offset` pseudo-type. +//! and relative-offset (`&N`) semantics. All six meta-type variants are fully +//! evaluated; `test_searchbug_matches_full_result_string` verifies the +//! byte-for-byte output against `searchbug.result` including the `offset` +//! pseudo-type's printf-style format substitution. use std::fs; use std::io::Write; From d2b58e1e7fd02df06c8535fc4cd4a05125d6d490 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 23:15:31 -0400 Subject: [PATCH 11/16] fix(meta-types): Address second-round PR review findings Correctness: %#o / %#x / %#X with Value::Uint(0) now emit a single '0' matching C printf alt-form semantics. Previous implementation produced '00' / '0x0' / '0X0' because the alt-form prefix was unconditionally concatenated. Regression test added (test_alt_form_prefix_suppressed_on_zero_value). Silent-failure fix: diagnostic for env-less Indirect dispatch promoted from debug! to warn! 
with a once-per-process flag (INDIRECT_WITHOUT_RULE_ENV_WARNED) mirroring USE_WITHOUT_RULE_ENV_WARNED. Misconfigured callers now see the warning at default log levels. The contains_indirect_rule tree walk is gated behind cfg(debug_assertions) so release builds avoid the O(n) scan on every evaluate_rules_with_config invocation. Maintenance: added an explicit RecursionLimitExceeded arm in evaluate_children_or_warn documenting that the variant propagates (previously reached via catch-all only). The explicit arm makes the contract obvious and prevents a future maintainer from accidentally adding it to the swallow list. #[allow(clippy::match_same_arms)] documents the intent -- both arms propagate by design. Test additions: sort_subroutines unit tests (reorders bodies, empty-table no-op, merge-policy preserved) in src/parser/name_table.rs; evaluate_children_or_warn BufferOverrun-swallow regression in src/evaluator/engine/tests.rs. 1364/1364 pass (5 pre-existing skipped); just ci-check green. Signed-off-by: UncleSp1d3r --- src/evaluator/engine/mod.rs | 45 ++++++++++++++++++----- src/evaluator/engine/tests.rs | 43 ++++++++++++++++++++++ src/output/format.rs | 49 ++++++++++++++++--------- src/parser/name_table.rs | 68 +++++++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 24 deletions(-) diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index af5ee07..1e01321 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -107,6 +107,13 @@ impl Drop for SubroutineScope<'_> { /// environment do not flood the log on every `Use` rule they encounter. static USE_WITHOUT_RULE_ENV_WARNED: AtomicBool = AtomicBool::new(false); +/// Process-local once guard for the "`evaluate_rules_with_config` called +/// with an `indirect` rule but without a `RuleEnvironment`" warning. 
+/// Same rationale as `USE_WITHOUT_RULE_ENV_WARNED`: surface the +/// misconfiguration exactly once per process so a large corpus of +/// env-less `indirect` rules does not flood the log. +static INDIRECT_WITHOUT_RULE_ENV_WARNED: AtomicBool = AtomicBool::new(false); + /// Evaluate a single magic rule against a file buffer /// /// This is a thin wrapper around [`evaluate_rules`] that evaluates exactly @@ -442,6 +449,20 @@ fn evaluate_children_or_warn( Err(LibmagicError::Timeout { timeout_ms }) => { return Err(LibmagicError::Timeout { timeout_ms }); } + // `RecursionLimitExceeded` is listed explicitly (rather than + // relying on the catch-all below) so a future maintainer adding + // another swallowed variant cannot accidentally swallow it. + // Both this arm and the catch-all intentionally propagate via + // `return Err(e)`; `match_same_arms` is suppressed because the + // explicit arm's purpose is documentation and future-proofing, + // not different behavior. See GOTCHAS S13 for the recursion- + // depth guard contract. + #[allow(clippy::match_same_arms)] + Err( + e @ LibmagicError::EvaluationError( + crate::error::EvaluationError::RecursionLimitExceeded { .. }, + ), + ) => return Err(e), Err( e @ (LibmagicError::EvaluationError( crate::error::EvaluationError::BufferOverrun { .. } @@ -1111,14 +1132,22 @@ pub fn evaluate_rules_with_config( // without an attached `RuleEnvironment`, which means any // `MetaType::Indirect` rule reached during evaluation is silently // no-op'd at runtime. That is the intentional behavior for low-level - // callers (matching the `Use`-without-env contract), but we log the - // misconfiguration at `debug!` level so consumer tests can detect - // env-less `indirect` usage. Using `debug_assert!` would panic in test - // builds and break the "evaluator never panics" invariant documented in - // GOTCHAS S2.4 -- a misconfigured caller should get a no-op, not a crash. 
- if contains_indirect_rule(rules) { - debug!( - "{}", + // callers (matching the `Use`-without-env contract), but we surface + // the misconfiguration at `warn!` level (once per process) so a + // consumer who wires up env-less `indirect` rules will see the + // diagnostic in default logging rather than only at debug level. + // The tree walk runs only in debug builds -- in release builds the + // `cfg(debug_assertions)` gate prevents the O(n) scan on every + // top-level evaluation. Using `debug_assert!` would panic in test + // builds and break the "evaluator never panics" invariant documented + // in GOTCHAS S2.4 -- a misconfigured caller should get a no-op with + // a log entry, not a crash. + #[cfg(debug_assertions)] + if contains_indirect_rule(rules) + && !INDIRECT_WITHOUT_RULE_ENV_WARNED.swap(true, Ordering::Relaxed) + { + warn!( + "{} (subsequent occurrences suppressed)", crate::error::EvaluationError::indirect_without_environment() ); } diff --git a/src/evaluator/engine/tests.rs b/src/evaluator/engine/tests.rs index 78d18b5..a5b2bff 100644 --- a/src/evaluator/engine/tests.rs +++ b/src/evaluator/engine/tests.rs @@ -3621,3 +3621,46 @@ fn test_continuation_sibling_reset_after_bytes_consumed() { if reset is missing it reads buffer[5]=0x42 and test fails. got {matches:?}" ); } + +// ======================================================================= +// evaluate_children_or_warn graceful-error helper (issue #42 close-out) +// ======================================================================= + +#[test] +fn test_evaluate_children_or_warn_swallows_buffer_overrun_keeps_parent_match() { + // Regression guard for the extracted `evaluate_children_or_warn` + // helper: a child with an absolute offset past the buffer end must + // produce a `BufferOverrun` that is swallowed (warn-logged) rather + // than propagated. The parent match must still appear in the + // results. 
Covers the graceful-skip arm for all four dispatch + // sites (Default/Indirect/Offset/Use) via the Offset arm -- they + // all delegate to the same helper. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Child rule at absolute offset 1000 reads a byte -- far past the + // tiny buffer we supply. The helper should catch the BufferOverrun + // and warn-log, not fail the evaluation. + let child = MagicRule { + offset: OffsetSpec::Absolute(1000), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x00), + message: "unreachable-child".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = offset_rule(0, "parent-offset-match", vec![child]); + + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &[0u8; 4], &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["parent-offset-match"], + "parent match must survive a child's BufferOverrun; child must be silently skipped, got {matches:?}" + ); +} diff --git a/src/output/format.rs b/src/output/format.rs index 69f36b3..c7cdbaa 100644 --- a/src/output/format.rs +++ b/src/output/format.rs @@ -278,28 +278,24 @@ fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option { } Conv::HexLower => { let n = coerce_to_u64_masked(value, type_kind)?; - Some(render_prefixed_int( - &format!("{n:x}"), - if spec.alt_form { "0x" } else { "" }, - spec, - )) + // C printf suppresses the `0x`/`0X` alt-form prefix when the + // value is zero: `printf("%#x", 0)` emits `"0"`, not `"0x0"`. 
+ let prefix = if spec.alt_form && n != 0 { "0x" } else { "" }; + Some(render_prefixed_int(&format!("{n:x}"), prefix, spec)) } Conv::HexUpper => { let n = coerce_to_u64_masked(value, type_kind)?; - Some(render_prefixed_int( - &format!("{n:X}"), - if spec.alt_form { "0X" } else { "" }, - spec, - )) + let prefix = if spec.alt_form && n != 0 { "0X" } else { "" }; + Some(render_prefixed_int(&format!("{n:X}"), prefix, spec)) } Conv::Octal => { let n = coerce_to_u64_masked(value, type_kind)?; - // C printf uses a single "0" prefix for %#o (not Rust's "0o"). - Some(render_prefixed_int( - &format!("{n:o}"), - if spec.alt_form { "0" } else { "" }, - spec, - )) + // C printf uses a single "0" prefix for %#o (not Rust's "0o"), + // and suppresses the prefix when the value itself is zero -- + // the resulting digit `0` already satisfies the "starts with + // 0" invariant that the alt-form is meant to guarantee. + let prefix = if spec.alt_form && n != 0 { "0" } else { "" }; + Some(render_prefixed_int(&format!("{n:o}"), prefix, spec)) } Conv::Char => { let n = coerce_to_u64(value)?; @@ -550,6 +546,27 @@ mod tests { assert_eq!(out, "data=abc"); } + #[test] + fn test_alt_form_prefix_suppressed_on_zero_value() { + // C printf special-cases `%#o`, `%#x`, `%#X` with value 0: the + // alt-form prefix is suppressed because the rendered digit + // already begins with `0`. Regression guard after pr-review + // caught that our implementation emitted `"00"` / `"0x0"` / + // `"0X0"` for zero values. + let out = format_magic_message("%#o", &Value::Uint(0), &byte_t()); + assert_eq!(out, "0", "%#o with 0 must emit single '0', not '00'"); + + let out = format_magic_message("%#x", &Value::Uint(0), &byte_t()); + assert_eq!(out, "0", "%#x with 0 must emit single '0', not '0x0'"); + + let out = format_magic_message("%#X", &Value::Uint(0), &byte_t()); + assert_eq!(out, "0", "%#X with 0 must emit single '0', not '0X0'"); + + // Non-zero values still get the prefix. 
+ let out = format_magic_message("%#x", &Value::Uint(1), &byte_t()); + assert_eq!(out, "0x1"); + } + #[test] fn test_octal_substitution() { let out = format_magic_message("%o", &Value::Uint(8), &byte_t()); diff --git a/src/parser/name_table.rs b/src/parser/name_table.rs index d2392b5..6446223 100644 --- a/src/parser/name_table.rs +++ b/src/parser/name_table.rs @@ -268,4 +268,72 @@ mod tests { let subroutine = table_a.get("dup").expect("dup kept from first table"); assert_eq!(subroutine[0].message, "first-child"); } + + #[test] + fn test_sort_subroutines_reorders_rule_bodies() { + // `sort_subroutines` materializes each Arc body into a mutable + // Vec, invokes the sort closure, and rebuilds the Arc. A bug in + // that rebuild cycle (e.g., swapping Arc pointers instead of + // re-sorting) would leave the order unchanged. + let body = vec![ + make_rule(1, TypeKind::Byte { signed: false }, "c", vec![]), + make_rule(1, TypeKind::Byte { signed: false }, "a", vec![]), + make_rule(1, TypeKind::Byte { signed: false }, "b", vec![]), + ]; + let name_rule = make_rule( + 0, + TypeKind::Meta(MetaType::Name("sorted".to_string())), + "", + body, + ); + let (_, mut table) = extract_name_table(vec![name_rule]); + + table.sort_subroutines(|rules| rules.sort_by(|x, y| x.message.cmp(&y.message))); + + let after = table.get("sorted").expect("subroutine retained"); + let messages: Vec<&str> = after.iter().map(|r| r.message.as_str()).collect(); + assert_eq!(messages, vec!["a", "b", "c"]); + } + + #[test] + fn test_sort_subroutines_on_empty_table_is_noop() { + let (_, mut table) = extract_name_table(vec![]); + // The closure should never fire for an empty table. + table.sort_subroutines(|_| unreachable!("empty table must not invoke sort_fn")); + assert!(table.get("any").is_none()); + } + + #[test] + fn test_sort_subroutines_preserves_merge_policy() { + // After `sort_subroutines`, `merge` must still honor first-wins. 
+ let first = make_rule( + 0, + TypeKind::Meta(MetaType::Name("dup".to_string())), + "", + vec![make_rule( + 1, + TypeKind::Byte { signed: false }, + "first", + vec![], + )], + ); + let second = make_rule( + 0, + TypeKind::Meta(MetaType::Name("dup".to_string())), + "", + vec![make_rule( + 1, + TypeKind::Byte { signed: false }, + "second", + vec![], + )], + ); + let (_, mut table_a) = extract_name_table(vec![first]); + table_a.sort_subroutines(|_| {}); // no-op sort to trigger rebuild + let (_, table_b) = extract_name_table(vec![second]); + table_a.merge(table_b); + + let subroutine = table_a.get("dup").expect("dup kept from first table"); + assert_eq!(subroutine[0].message, "first"); + } } From dbcbdb7ea4b5ea08a1a4404c3cab0bb10aabf913 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 23:24:58 -0400 Subject: [PATCH 12/16] fix: Round-3 PR #230 review -- holistic docs/comment sweep + 3 bugs Third round of reviewer feedback surfaced 11 more threads, mostly recurring stale docs/comments describing Offset as a silent no-op / deferred feature. Rather than whack-a-mole per-thread, did a repo-wide sweep for stale wording (silent no-op, five meta-types, not yet implemented, deferred) and updated every match. 
Doc/comment sync (10 files): src/parser/ast.rs TypeKind::Meta doc comment (rewrote to describe all six fully-evaluated variants); src/evaluator/engine/mod.rs evaluate_single_rule_with_anchor comment (Meta arms are defense-in-depth for programmatic callers, not phase-1 no-ops); src/parser/grammar/tests/mod.rs (five -> six meta-types); docs/src/architecture.md MetaType enum snippet (added Offset); docs/src/evaluator.md dispatch section (added Offset semantics); docs/MAGIC_FORMAT.md meta-types table (added offset row); docs/ARCHITECTURE.md MagicDatabase struct snippet (corrected field types: rules is Vec<MagicRule>, root_rules is Arc<[MagicRule]>, added mime_mapper); docs/src/api-reference.md (use libmagic_rs::MetaType -> use libmagic_rs::parser::ast::MetaType); ROADMAP.md v0.3.0 meta-types entry (added offset); tests/compatibility_tests.rs test doc (full parity is shipped, not deferred). Bug fixes (3): src/parser/name_table.rs scrub_nested_names warn message now reports child.level and parent_level distinctly (was reporting only parent_level, misleading); src/output/format.rs %c specifier accepts full 0x00-0xff range via char::from(u8) mapping to Latin-1 code points (matching C printf / GNU file unsigned-char semantics -- previously rejected >= 0x80); updated regression test accordingly. Dismissed as nitpick (replied-only): src/parser/name_table.rs merge HashMap iteration order -- reviewer acknowledged semantically safe (both colliding definitions emit warnings; first-wins applied consistently); switching to BTreeMap would be over-engineering. Test: 1364/1364 pass; just ci-check green. 
Signed-off-by: UncleSp1d3r --- ROADMAP.md | 2 +- docs/ARCHITECTURE.md | 7 ++++--- docs/MAGIC_FORMAT.md | 15 ++++++++------- docs/src/api-reference.md | 2 +- docs/src/architecture.md | 1 + docs/src/evaluator.md | 1 + src/evaluator/engine/mod.rs | 13 ++++++++----- src/output/format.rs | 29 +++++++++++++++++++++++------ src/parser/ast.rs | 14 +++++++++----- src/parser/grammar/tests/mod.rs | 6 +++--- src/parser/name_table.rs | 5 ++++- tests/compatibility_tests.rs | 16 +++++++++------- 12 files changed, 72 insertions(+), 39 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index fb3ed1d..b42a5a1 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -34,7 +34,7 @@ See [GitHub Milestones](https://github.com/EvilBit-Labs/libmagic-rs/milestones) - [x] Float and double types ([#40](https://github.com/EvilBit-Labs/libmagic-rs/issues/40)) - [x] Date and timestamp types ([#41](https://github.com/EvilBit-Labs/libmagic-rs/issues/41)) - [x] Pascal string type ([#43](https://github.com/EvilBit-Labs/libmagic-rs/issues/43)) -- [x] Meta-types: default, clear, name, use, indirect ([#42](https://github.com/EvilBit-Labs/libmagic-rs/issues/42)) +- [x] Meta-types: default, clear, name, use, indirect, offset ([#42](https://github.com/EvilBit-Labs/libmagic-rs/issues/42)) ## v0.4.0 - API and UX Polish diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 5a06c00..f15adbb 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -234,11 +234,12 @@ The main entry point for users. Manages rule loading and evaluation. 
```rust pub struct MagicDatabase { - rules: Arc<[MagicRule]>, // Parsed magic rules (shared, immutable) - root_rules: Arc<[MagicRule]>, // Full top-level rule list for `indirect` re-entry - name_table: Arc, // `name`/`use` subroutine dispatch table + rules: Vec, // Parsed magic rules (top-level, strength-sorted) + name_table: Arc, // `name`/`use` subroutine dispatch table (Arc for cheap clone across evaluations) + root_rules: Arc<[MagicRule]>, // Shared immutable slice of top-level rules for `indirect` re-entry config: EvaluationConfig, // Evaluation settings source_path: Option, // Where rules came from + mime_mapper: MimeMapper, // Cached MIME-type lookup } ``` diff --git a/docs/MAGIC_FORMAT.md b/docs/MAGIC_FORMAT.md index 349fad9..b40961f 100644 --- a/docs/MAGIC_FORMAT.md +++ b/docs/MAGIC_FORMAT.md @@ -484,13 +484,14 @@ Output: `GIF image data, version 89a` Meta-types are pseudo-types that do not read bytes from the buffer. Instead, they control the evaluation flow: defining named subroutines, invoking them, providing fallbacks when no sibling matched, resetting per-level match state, or re-applying the entire rule database at a resolved offset. 
-| Keyword | Syntax | Description | -| ----------- | ---------------------- | ------------------------------------------------------------------ | -| `name <id>` | `0 name part2` | Defines a named subroutine block; children are the subroutine body | -| `use <id>` | `>0 use part2` | Invokes a named subroutine at the resolved offset | -| `default` | `0 default x Fallback` | Fires only when no sibling at the same level has matched | -| `clear` | `0 clear` | Resets the per-level sibling-matched flag | -| `indirect` | `8 indirect x` | Re-applies the full rule database at the resolved offset | +| Keyword | Syntax | Description | +| ----------- | --------------------------- | --------------------------------------------------------------------------------------- | +| `name <id>` | `0 name part2` | Defines a named subroutine block; children are the subroutine body | +| `use <id>` | `>0 use part2` | Invokes a named subroutine at the resolved offset | +| `default` | `0 default x Fallback` | Fires only when no sibling at the same level has matched | +| `clear` | `0 clear` | Resets the per-level sibling-matched flag | +| `indirect` | `8 indirect x` | Re-applies the full rule database at the resolved offset | +| `offset` | `>>&0 offset x at_off %lld` | Emits the resolved file position as `Value::Uint` for printf-style message substitution | ### `name` and `use` — Named Subroutines diff --git a/docs/src/api-reference.md b/docs/src/api-reference.md index 1a7fd02..d6d677c 100644 --- a/docs/src/api-reference.md +++ b/docs/src/api-reference.md @@ -237,7 +237,7 @@ use libmagic_rs::TypeKind; Control-flow directives carried by `TypeKind::Meta`. 
```rust -use libmagic_rs::MetaType; +use libmagic_rs::parser::ast::MetaType; ``` | Variant | Description | diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 1334d1f..e5d2892 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -114,6 +114,7 @@ pub enum MetaType { Name(String), // `name ` subroutine declaration (hoisted at load time) Use(String), // `use ` subroutine invocation Indirect, // `indirect` re-applies root rules at the resolved offset + Offset, // `offset` emits the resolved file position as Value::Uint for printf-style message substitution } pub enum Operator { diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index 30f2ec9..ba048a2 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -369,6 +369,7 @@ Before calling `evaluate_single_rule_with_anchor` for a value-read rule, `evalua - **`MetaType::Use(name)`**: Looks up `name` in `RuleEnvironment::name_table`. On hit, evaluates the subroutine's child rules at the resolved offset, propagates their matches into the caller's match vector, then also evaluates the `use` rule's own `rule.children`. On miss, logs a `warn!` and returns `Ok(None)` (treated as non-match). - **`MetaType::Indirect`**: Resolves the rule's offset against the buffer, slices the buffer at that point, resets the `EvaluationContext` anchor to 0, calls `evaluate_rules` recursively with `RuleEnvironment::root_rules` (the complete top-level rule list), and then restores the caller's anchor on return. Recursion is bounded by `EvaluationConfig::max_recursion_depth`. - **`MetaType::Name`**: Unreachable after load-time extraction — `name` blocks are hoisted out of the rule list by `parser::name_table::extract_name_table` before the evaluator ever sees them. Defensive arm returns `Ok(None)` and emits a `debug!` rather than `debug_assert!` so that property tests synthesizing arbitrary `TypeKind` values do not break the never-panics invariant. 
+- **`MetaType::Offset`**: Resolves the rule's offset against the buffer and records a `RuleMatch` whose `value` is `Value::Uint(resolved_offset)`. The rule's `message` is then rendered through `format_magic_message`, which substitutes printf-style specifiers (`%lld`, `%d`) with that value. Used by magic fixtures that need to report "matched at offset N" in the output (e.g., GNU `file`'s `searchbug.magic` fixture). ```mermaid sequenceDiagram diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 1e01321..9164988 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -217,11 +217,14 @@ fn evaluate_single_rule_with_anchor( // metacharacters. // // Meta-type directives (`default`, `clear`, `name`, `use`, - // `indirect`) are silent no-ops in this phase -- the parser - // preserves them in the AST but the evaluator does not yet wire - // them into any control-flow behavior. Short-circuiting here with - // `Ok(None)` keeps them out of the value/pattern paths (which - // would otherwise surface `TypeReadError::UnsupportedType`). + // `indirect`, `offset`) are dispatched by `evaluate_rules` at the + // outer loop level (not here) -- this single-rule helper is only + // invoked for non-meta rules. Short-circuiting the Meta arms here + // with `Ok(None)` is defense-in-depth for programmatic callers + // (property tests, fuzz harnesses) that hand-build a Meta rule + // and feed it directly to `evaluate_single_rule`; without the + // guard, the value/pattern paths would surface + // `TypeReadError::UnsupportedType`. 
let (matched, read_value) = match &rule.typ { TypeKind::Meta(MetaType::Name(name)) => { // `Name` rules are normally hoisted into the name table at diff --git a/src/output/format.rs b/src/output/format.rs index c7cdbaa..e0ee3aa 100644 --- a/src/output/format.rs +++ b/src/output/format.rs @@ -300,10 +300,16 @@ fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option { Conv::Char => { let n = coerce_to_u64(value)?; let byte = u8::try_from(n).ok()?; - if byte > 0x7f { - return None; - } - Some(pad_numeric(&(byte as char).to_string(), spec)) + // GNU `file` / C printf `%c` converts the int argument to + // unsigned char and emits it directly for all byte values + // 0x00-0xff. Rust's `String` must be valid UTF-8, so we + // embed bytes >= 0x80 as their Latin-1 code points (U+0080 + // through U+00FF) via `char::from(u8)` which is infallible + // and lossless. Consumers with UTF-8 terminals see the + // 2-byte UTF-8 encoding of that code point; consumers + // iterating the returned bytes directly can recover the + // original byte by re-encoding the code point as Latin-1. + Some(pad_numeric(&char::from(byte).to_string(), spec)) } } } @@ -711,9 +717,20 @@ mod tests { } #[test] - fn test_char_specifier_rejects_non_ascii() { - // Values above 0x7f cannot be rendered as `%c` -> pass through literally. + fn test_char_specifier_accepts_full_byte_range() { + // `%c` emits every byte value 0x00..=0xff directly, matching + // GNU `file` / C printf semantics. Bytes 0x80-0xff are embedded + // as their Latin-1 code points via `char::from(u8)`. + // 0xff maps to U+00FF ('ÿ'); UTF-8 encoding is 0xc3 0xbf. let out = format_magic_message("[%c]", &Value::Uint(0xff), &byte_t()); + assert_eq!(out, "[\u{00ff}]"); + + // ASCII boundary stays unchanged. + let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t()); + assert_eq!(out, "[A]"); + + // Out-of-range (doesn't fit u8) passes through literally. 
+ let out = format_magic_message("[%c]", &Value::Uint(0x1_0000), &byte_t()); assert_eq!(out, "[%c]"); } diff --git a/src/parser/ast.rs b/src/parser/ast.rs index 55a5a45..770d124 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -503,13 +503,17 @@ pub enum TypeKind { /// Scan window width in bytes, starting at the rule's offset. range: NonZeroUsize, }, - /// Control-flow directive (`default`, `clear`, `name`, `use`, `indirect`). + /// Control-flow directive (`default`, `clear`, `name`, `use`, + /// `indirect`, `offset`). /// /// These magic(5) keywords do not read or compare bytes; they modify - /// how a rule set is traversed. In the current phase they are parsed - /// into the AST and preserved through codegen, but the evaluator - /// treats them as silent no-ops. See [`MetaType`] for the individual - /// variants and their intended semantics. + /// how a rule set is traversed. All six variants are fully evaluated: + /// `default` fires as a fallback when no sibling at the same level + /// has matched; `clear` resets that flag; `name`/`use` support + /// subroutine definition and invocation; `indirect` re-enters the + /// rule set at a resolved offset; `offset` emits the resolved file + /// position as `Value::Uint` for printf-style message substitution. + /// See [`MetaType`] for the individual variants. /// /// # Examples /// diff --git a/src/parser/grammar/tests/mod.rs b/src/parser/grammar/tests/mod.rs index bdc2040..3e61e68 100644 --- a/src/parser/grammar/tests/mod.rs +++ b/src/parser/grammar/tests/mod.rs @@ -2622,10 +2622,10 @@ fn test_parse_magic_rule_meta_name_use_reject_malformed_identifiers() { #[test] fn test_parse_text_magic_file_meta_roundtrip() { - // Build a small magic file that uses all five meta-types. The `name` + // Build a small magic file that uses the six meta-types. 
The `name` // block is a level-1 subroutine invoked by the top-level `use`, and - // `indirect` / `default` / `clear` appear as sibling directives to - // exercise the parse path for each variant. + // `indirect` / `default` / `clear` / `offset` appear as sibling + // directives to exercise the parse path for each variant. // // NOTE: all rules use the SAME top-level indentation so // build_rule_hierarchy treats them as siblings. Child rules would diff --git a/src/parser/name_table.rs b/src/parser/name_table.rs index 6446223..4cd27ec 100644 --- a/src/parser/name_table.rs +++ b/src/parser/name_table.rs @@ -125,7 +125,10 @@ fn scrub_nested_names(children: Vec, parent_level: u32) -> Vec Date: Wed, 22 Apr 2026 23:32:28 -0400 Subject: [PATCH 13/16] fix(evaluator): Round-3 semantic fixes -- use terminal anchor + indirect top-level re-entry Bug 1 (PRRT...PrI): evaluate_use_rule now returns the subroutine's TERMINAL anchor (where the subroutine left last_match_end) instead of the use-site offset. Callers propagate that terminal anchor via context.set_last_match_end, so sibling rules after a successful 'use' resolve &N against the subroutine's final match position -- matching GNU file's inlining semantics. Previously siblings resolved against the use-site, silently breaking continuation chains. Bug 2 (PRRT...PrL): MetaType::Indirect root re-entry is evaluated with TOP-LEVEL sibling semantics (anchor chains across siblings per GOTCHAS S3.8), not CONTINUATION semantics (anchor resets). Previously the RecursionGuard wrapping the indirect sub-evaluation forced recursion_depth > 0, which triggered the continuation-reset path and broke top-level siblings inside the re-entered database. 
Fixed via a new one-shot EvaluationContext::indirect_reentry flag: indirect dispatch sets it before evaluate_rules; evaluate_rules consumes it at entry so only the top-level iteration is affected -- children of matched rules inside the re-entry correctly fall back to recursion_depth > 0 for their own continuation semantics. Deferred (PRRT...PrQ refactor suggestion): split engine tests file into focused modules. Out of scope for a correctness-focused review round; tracked as a follow-on maintenance task. Test: 1364/1364 pass (searchbug byte-for-byte parity preserved through both semantic fixes); just ci-check green. Signed-off-by: UncleSp1d3r --- src/evaluator/engine/mod.rs | 59 ++++++++++++++++++++++++++++--------- src/evaluator/mod.rs | 36 ++++++++++++++++++++++ 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 9164988..19d1289 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -323,13 +323,22 @@ fn evaluate_use_rule( // through the guard's `Drop` impl and the caller's context // returns to its pre-use state. Without the RAII wrapper a manual // save/restore pair would be bypassed on every error path. - let subroutine_matches = { + // Capture both the subroutine's matches AND the terminal anchor + // where the subroutine left `last_match_end`. The terminal anchor + // is what GNU `file`-compatible inlining semantics require: sibling + // rules after the `use` site must resolve `&N` against the position + // the subroutine reached, not the use-site offset. Reading the + // anchor INSIDE the scope (before Drop restores the caller's value) + // preserves it for the caller. + let (subroutine_matches, terminal_anchor) = { let mut scope = SubroutineScope::enter(context, absolute_offset); let mut guard = RecursionGuard::enter(scope.context())?; - evaluate_rules(&subroutine_rules, buffer, guard.context())? 
+ let matches = evaluate_rules(&subroutine_rules, buffer, guard.context())?; + let terminal = guard.context().last_match_end(); + (matches, terminal) }; - Ok((Some(absolute_offset), subroutine_matches)) + Ok((Some(terminal_anchor), subroutine_matches)) } /// Evaluate a pattern-bearing rule (`TypeKind::Regex` / `TypeKind::Search`). @@ -611,8 +620,22 @@ pub fn evaluate_rules( // post-match anchor (via the current value of `last_match_end()` at // the point of recursion), so child sibling lists see their parent's // resolved position as their own entry anchor. + // + // INDIRECT RE-ENTRY exception: `MetaType::Indirect` dispatches its + // sub-evaluation via `RecursionGuard::enter` (to bound the recursion + // cycle), which forces `recursion_depth > 0`. But an indirect + // re-entry semantically evaluates the root rule list with TOP-LEVEL + // sibling semantics -- each rule is an independent classification + // attempt against the re-entered sub-buffer, NOT a continuation + // list. The indirect dispatch sets `context.set_indirect_reentry(true)` + // just before this call; `take_indirect_reentry()` consumes it at + // entry so only this iteration treats siblings as top-level. + // Children of matched rules inside the re-entry still see the flag + // as false (consumed) and correctly fall back to continuation + // semantics via `recursion_depth > 0`. let entry_anchor = context.last_match_end(); - let is_child_sibling_list = context.recursion_depth() > 0; + let is_indirect_reentry = context.take_indirect_reentry(); + let is_child_sibling_list = context.recursion_depth() > 0 && !is_indirect_reentry; // Entry-point timeout check: ensures every recursive descent is bounded // and that evaluations of small rule sets (< 16 rules) are still guarded. 
@@ -776,9 +799,16 @@ pub fn evaluate_rules( // Recursion guard + anchor scope: nested indirect / use cycles // surface as `RecursionLimitExceeded` instead of a stack overflow, // and the caller's anchor is restored on every exit path. + // + // Mark the upcoming `evaluate_rules` call as a top-level + // re-entry (consumed at entry) so sibling anchor-reset + // semantics do NOT fire -- root rules in the re-entered + // database chain their anchors across siblings like any + // other top-level evaluation. { let mut guard = RecursionGuard::enter(context)?; let mut anchor_scope = AnchorScope::enter(guard.context(), 0); + anchor_scope.context().set_indirect_reentry(true); match evaluate_rules(&root_rules, sub_buffer, anchor_scope.context()) { Ok(sub_matches) => { matches.extend(sub_matches); @@ -891,18 +921,19 @@ pub fn evaluate_rules( if let TypeKind::Meta(MetaType::Use(name)) = &rule.typ { let matches_before = matches.len(); let use_resolved = match evaluate_use_rule(rule, name, buffer, context) { - Ok((Some(absolute_offset), subroutine_matches)) => { + Ok((Some(terminal_anchor), subroutine_matches)) => { matches.extend(subroutine_matches); - // A `use` rule itself does not produce a surface - // `RuleMatch` in GNU `file` output; the subroutine's - // rules carry the visible messages. We therefore only - // advance the anchor (to the use-site offset, which - // may have been moved by the subroutine; since we - // restored it above, we now re-advance to the - // use-site offset so subsequent sibling rules resolve - // relative offsets from the use-site end). - context.set_last_match_end(absolute_offset); + // A `use` rule does not produce a surface + // `RuleMatch` itself -- the subroutine's rules + // carry the visible messages. Advance the + // caller's anchor to the subroutine's TERMINAL + // anchor (where the subroutine left `last_match_end`), + // not the use-site offset. 
This makes `use` + // behave like inlining the subroutine: sibling + // rules after the `use` see `&N` resolve against + // the subroutine's final match position. + context.set_last_match_end(terminal_anchor); true } Ok((None, _)) => { diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index ef52d3f..0dbc7c5 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -85,6 +85,21 @@ pub struct EvaluationContext { /// Restored to the caller's value on subroutine exit via the /// `BaseOffsetScope` RAII guard in `engine/mod.rs`. base_offset: usize, + /// One-shot flag set by `MetaType::Indirect` dispatch before + /// re-entering the root rule list. When true, the next entry to + /// `evaluate_rules` treats the iteration as a top-level sibling + /// chain (anchor chains across siblings per GOTCHAS S3.8) rather + /// than as a continuation list (anchor resets between siblings). + /// Consumed at entry — children of a matched rule inside the + /// re-entry see the flag cleared, so their own continuation-reset + /// semantics kick in via the `recursion_depth > 0` gate. + /// + /// Without this flag, `indirect` wrapping re-entry under + /// `RecursionGuard` forces `recursion_depth > 0`, which forces + /// continuation-reset semantics on the root rule list — wrong, + /// because top-level rules in the re-entered database should + /// chain sibling anchors like any other top-level evaluation. + indirect_reentry: bool, } impl EvaluationContext { @@ -112,6 +127,7 @@ impl EvaluationContext { config, rule_env: None, base_offset: 0, + indirect_reentry: false, } } @@ -130,6 +146,25 @@ impl EvaluationContext { self.base_offset = offset; } + /// Read-and-clear the indirect-reentry flag. Used by `evaluate_rules` + /// at entry to decide whether the iteration is a top-level re-entry + /// (no anchor reset between siblings) or a continuation list (reset + /// between siblings). 
Cleared on read so children of a matched rule + /// inside the re-entry see the flag as false and fall back to the + /// `recursion_depth > 0` gate for their own continuation semantics. + pub(crate) fn take_indirect_reentry(&mut self) -> bool { + std::mem::take(&mut self.indirect_reentry) + } + + /// Set the indirect-reentry flag. + /// + /// `pub(crate)` and owned by the `MetaType::Indirect` dispatch in + /// `engine/mod.rs`. Callers should set this true exactly once + /// before invoking `evaluate_rules` on the root rule list. + pub(crate) fn set_indirect_reentry(&mut self, flag: bool) { + self.indirect_reentry = flag; + } + /// Attach a rule environment to this context. /// /// The environment carries the name-subroutine table and root rule list @@ -304,6 +339,7 @@ impl EvaluationContext { self.last_match_end = 0; self.recursion_depth = 0; self.base_offset = 0; + self.indirect_reentry = false; } } From 5efb0de5118001c5e4c566a6f34cc8aa6f5e01a8 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Wed, 22 Apr 2026 23:33:45 -0400 Subject: [PATCH 14/16] docs: Fix ghost BaseOffsetScope reference in EvaluationContext::base_offset docs The real guard is named SubroutineScope (saves+restores both last_match_end and base_offset). The base_offset doc comment referenced a BaseOffsetScope that never existed -- leftover from design intent in an earlier session. Updated the two doc-comment sites. Signed-off-by: UncleSp1d3r --- src/evaluator/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index 0dbc7c5..c88f256 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -83,7 +83,8 @@ pub struct EvaluationContext { /// magic(5) / libmagic semantics: subroutines see offsets relative /// to the caller's invocation point, not absolute file positions). /// Restored to the caller's value on subroutine exit via the - /// `BaseOffsetScope` RAII guard in `engine/mod.rs`. 
+ /// `SubroutineScope` RAII guard in `engine/mod.rs`, which saves + /// and restores both `last_match_end` and `base_offset` together. base_offset: usize, /// One-shot flag set by `MetaType::Indirect` dispatch before /// re-entering the root rule list. When true, the next entry to @@ -140,7 +141,7 @@ impl EvaluationContext { /// Set the subroutine base offset. /// - /// `pub(crate)` and owned by the engine's `BaseOffsetScope` RAII + /// `pub(crate)` and owned by the engine's `SubroutineScope` RAII /// guard -- no external caller should set this directly. pub(crate) fn set_base_offset(&mut self, offset: usize) { self.base_offset = offset; From e9d20a0394f6840596c91b58e267c07d70f840cc Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 23 Apr 2026 00:35:51 -0400 Subject: [PATCH 15/16] refactor(tests): Split engine/tests.rs into focused submodules (#230 PR review) Addresses PRRT...PrQ from the round-3 PR review: src/evaluator/engine/tests.rs grew to 3666 lines, well past the 500-600 line project guideline. Split into the three meta-type suites the reviewer called out. Structure: - src/evaluator/engine/tests/mod.rs (2777 lines): core evaluator tests (byte/short/long/string/pstring/relative-offset/recursion) plus centralized meta-type test helpers (make_context_with_env, use_rule, use_rule_at, build_name_table, default_rule, clear_rule, byte_eq_rule, indirect_rule, offset_rule) marked pub(super) so submodules can share them. - src/evaluator/engine/tests/meta_use_tests.rs (405 lines): MetaType::Use dispatch, subroutine name-table resolution, and subroutine base_offset biasing tests. - src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs (275 lines): MetaType::Default fallback, MetaType::Clear state-reset, and MetaType::Indirect root re-entry tests. - src/evaluator/engine/tests/meta_offset_tests.rs (247 lines): MetaType::Offset dispatch and evaluate_children_or_warn helper coverage. 
No test content changed -- each test's body is byte-identical to the pre-split version. The three new submodules each have their own SPDX/copyright header and module doc. Test count unchanged (1364/1364 pass, 5 pre-existing skipped). Signed-off-by: UncleSp1d3r --- .../meta_default_clear_indirect_tests.rs | 274 ++++++ .../engine/tests/meta_offset_tests.rs | 247 +++++ src/evaluator/engine/tests/meta_use_tests.rs | 402 ++++++++ .../engine/{tests.rs => tests/mod.rs} | 927 +----------------- 4 files changed, 943 insertions(+), 907 deletions(-) create mode 100644 src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs create mode 100644 src/evaluator/engine/tests/meta_offset_tests.rs create mode 100644 src/evaluator/engine/tests/meta_use_tests.rs rename src/evaluator/engine/{tests.rs => tests/mod.rs} (71%) diff --git a/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs b/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs new file mode 100644 index 0000000..fb23135 --- /dev/null +++ b/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs @@ -0,0 +1,274 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Tests for `MetaType::Default`, `MetaType::Clear`, and +//! `MetaType::Indirect` dispatch. +//! +//! Shared helpers (`default_rule`, `clear_rule`, `byte_eq_rule`, +//! `indirect_rule`, `make_context_with_env`, `build_name_table`) live in +//! the parent `tests/mod.rs` module. 
+ +use super::*; + +#[test] +fn test_default_fires_when_no_sibling_matched() { + let rules = vec![default_rule("DEFAULT-FIRES", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "default with no prior sibling match should fire" + ); + assert_eq!(matches[0].message, "DEFAULT-FIRES"); +} + +#[test] +fn test_default_skipped_when_sibling_matched() { + // Disable stop-at-first-match so we can see whether the default would + // have fired or not. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + byte_eq_rule(0, 0xAA, "real-match"), + default_rule("DEFAULT-SKIPPED", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "default after a successful sibling should not fire" + ); + assert_eq!(matches[0].message, "real-match"); +} + +#[test] +fn test_default_fires_only_once() { + // Two consecutive default rules: the first sets sibling_matched, so + // the second must not fire. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + default_rule("FIRST-DEFAULT", vec![]), + default_rule("SECOND-DEFAULT", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "only the first default should fire when no real sibling matched" + ); + assert_eq!(matches[0].message, "FIRST-DEFAULT"); +} + +#[test] +fn test_default_children_evaluated() { + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let child = byte_eq_rule(0, 0xAA, "default-child"); + let rules = vec![default_rule("PARENT-DEFAULT", vec![child])]; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "default rule's children must be evaluated when the default fires" + ); + assert_eq!(matches[0].message, "PARENT-DEFAULT"); + assert_eq!(matches[1].message, "default-child"); +} + +#[test] +fn test_clear_resets_sibling_matched() { + // Sequence: byte-match, default-skipped, clear, default-fires. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + byte_eq_rule(0, 0xAA, "byte-match"), + default_rule("DEFAULT-SKIPPED", vec![]), + clear_rule(), + default_rule("DEFAULT-FIRES-AFTER-CLEAR", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "clear must reset sibling_matched so a later default fires" + ); + assert_eq!(matches[0].message, "byte-match"); + assert_eq!(matches[1].message, "DEFAULT-FIRES-AFTER-CLEAR"); +} + +#[test] +fn test_clear_at_top_is_noop() { + let rules = vec![clear_rule(), default_rule("AFTER-CLEAR", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "clear at top of list is a no-op; default after still fires" + ); + assert_eq!(matches[0].message, "AFTER-CLEAR"); +} + +#[test] +fn test_clear_does_not_produce_match() { + let rules = vec![clear_rule()]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert!(matches.is_empty(), "clear alone must produce no match"); +} + +#[test] +fn test_default_clear_per_level_isolation() { + // Parent has its own sibling_matched flag. The child list runs with a + // fresh flag, so a child-level `default` must fire even though the + // parent's flag is true. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "parent-match".to_string(), + children: vec![ + byte_eq_rule(1, 0xBB, "child-byte-match"), + default_rule("CHILD-DEFAULT-SKIPPED", vec![]), + clear_rule(), + default_rule("CHILD-DEFAULT-AFTER-CLEAR", vec![]), + ], + level: 0, + strength_modifier: None, + }; + let mut context = EvaluationContext::new(config); + let buffer = [0xAAu8, 0xBB]; + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + + // Expected order: parent-match, child-byte-match, CHILD-DEFAULT-AFTER-CLEAR + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec![ + "parent-match", + "child-byte-match", + "CHILD-DEFAULT-AFTER-CLEAR" + ], + "child-level sibling_matched must be isolated from parent-level state" + ); +} + +#[test] +fn test_indirect_evaluates_root_rules_at_offset() { + // Root rules: detect a "ZIP-like" header (0x50 0x4b) at offset 0 of the + // sub-buffer. The indirect rule fires at offset 4 of the outer buffer, + // which means the sub-buffer starts at byte 4. Place 0x50 0x4b there. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let root_rule = byte_eq_rule(0, 0x50, "ZIP-like-header"); + let root_rules: Vec<MagicRule> = vec![root_rule]; + + // Build an environment where root_rules is the same as the rules we + // dispatch into. + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(NameTable::empty()), + root_rules: std::sync::Arc::from(root_rules.as_slice()), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + // Buffer: ELF magic at offset 0, ZIP-like at offset 4.
The indirect + // rule is the trigger; the root re-entry detects 0x50 at sub-buffer 0. + let buffer = [0x7fu8, 0x45, 0x4c, 0x46, 0x50, 0x4b, 0x03, 0x04]; + let rules = vec![indirect_rule(4, "indirect-trigger", vec![])]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + + assert!( + matches.iter().any(|m| m.message == "ZIP-like-header"), + "indirect must dispatch root rules against the sub-buffer at offset 4; got {matches:?}" + ); +} + +#[test] +fn test_indirect_out_of_bounds_is_noop() { + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(NameTable::empty()), + root_rules: std::sync::Arc::from(&[byte_eq_rule(0, 0x00, "root")] as &[MagicRule]), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + let buffer = [0u8; 4]; + // Indirect at offset 100, which is well past the 4-byte buffer. + let rules = vec![indirect_rule(100, "indirect-oob", vec![])]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.is_empty(), + "indirect past buffer end must be a graceful no-op" + ); +} + +#[test] +fn test_indirect_without_env_is_noop() { + // Property tests synthesize Indirect rules without an attached + // RuleEnvironment, so this path must be a graceful no-op (matching the + // `Use`-without-env contract). The engine logs at `debug!` rather than + // panicking via `debug_assert!` to preserve the never-panics invariant + // exercised by `prop_arbitrary_rule_evaluation_never_panics`. 
+ let mut context = EvaluationContext::new(EvaluationConfig::default()); + let buffer = [0u8; 4]; + let rules = vec![indirect_rule(0, "indirect-no-env", vec![])]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.is_empty(), + "indirect without env must produce no matches" + ); +} + +#[test] +fn test_indirect_recursion_limit() { + // Root rules contain an indirect rule that points back to offset 0, + // creating an infinite re-entry chain. Must surface as + // `RecursionLimitExceeded`, not stack overflow. + let inner_indirect = indirect_rule(0, "recursive-indirect", vec![]); + let root_rules: Vec<MagicRule> = vec![inner_indirect]; + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(NameTable::empty()), + root_rules: std::sync::Arc::from(root_rules.as_slice()), + }); + let mut context = EvaluationContext::new(EvaluationConfig::default()).with_rule_env(env); + + let buffer = [0u8; 8]; + let rules = vec![indirect_rule(0, "outer-indirect", vec![])]; + let result = evaluate_rules(&rules, &buffer, &mut context); + assert!( + matches!( + result, + Err(LibmagicError::EvaluationError( + crate::error::EvaluationError::RecursionLimitExceeded { .. } + )) + ), + "infinite indirect recursion must surface RecursionLimitExceeded, got {result:?}" + ); +} diff --git a/src/evaluator/engine/tests/meta_offset_tests.rs b/src/evaluator/engine/tests/meta_offset_tests.rs new file mode 100644 index 0000000..a68c790 --- /dev/null +++ b/src/evaluator/engine/tests/meta_offset_tests.rs @@ -0,0 +1,247 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Tests for `MetaType::Offset` dispatch and the related +//! `evaluate_children_or_warn` helper. +//! +//! Shared helpers (`offset_rule`, `byte_eq_rule`) live in the parent +//! `tests/mod.rs` module.
+ +use super::*; + +#[test] +fn test_offset_emits_match_with_resolved_position() { + let rules = vec![offset_rule(5, "pos=%lld", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 10], &mut context).unwrap(); + assert_eq!(matches.len(), 1, "offset rule must emit exactly one match"); + assert_eq!(matches[0].offset, 5, "match.offset is the resolved offset"); + assert_eq!( + matches[0].value, + Value::Uint(5), + "match.value carries the resolved offset for format substitution" + ); + assert_eq!(matches[0].message, "pos=%lld"); +} + +#[test] +fn test_offset_at_zero() { + // Regression guard: offset 0 must still produce a match (not be + // indistinguishable from "no match"). + let rules = vec![offset_rule(0, "top", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].value, Value::Uint(0)); +} + +#[test] +fn test_offset_out_of_bounds_graceful_skip() { + // Offset past the end of the buffer is a data-dependent skip, not an + // error. Matches the Indirect dispatch's graceful-skip discipline. + let rules = vec![offset_rule(1_000_000, "unreachable", vec![])]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert!( + matches.is_empty(), + "offset past buffer end must produce no match" + ); +} + +#[test] +fn test_offset_non_x_operator_is_skipped() { + // magic(5) only allows `x` on an `offset` rule. Anything else is + // semantically undefined -> debug-log + skip. 
+ let mut rule = offset_rule(0, "bogus", vec![]); + rule.op = Operator::Equal; + rule.value = Value::Uint(5); + let rules = vec![rule]; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + assert!( + matches.is_empty(), + "offset rule with non-AnyValue operator must be skipped" + ); +} + +#[test] +fn test_offset_evaluates_children() { + // A child byte rule at offset 0 runs AFTER the parent offset rule + // fires. The child's own offset is resolved independently. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let mut parent = offset_rule( + 0, + "parent-offset", + vec![byte_eq_rule(0, 0x42, "child-byte")], + ); + // Child level must be deeper than parent per MagicRule::validate. + parent.children[0].level = 1; + let buffer = [0x42u8, 0x00, 0x00]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!(messages, vec!["parent-offset", "child-byte"]); +} + +#[test] +fn test_offset_advances_anchor_for_children() { + // An offset rule at position 5 advances `last_match_end` to 5 *for its + // children* -- but NOT for sibling rules at the same level. This + // matches libmagic's continuation-level semantics: each sibling at + // level L resolves `&N` against the parent-level anchor, not against + // the previous sibling's advance. See the `entry_anchor` discipline + // in `evaluate_rules`. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // A child of the offset rule uses &0 to resolve at the offset rule's + // resolved position (5). buffer[5] = 0x42. 
+ let mut child = byte_eq_rule(0, 0x42, "child-at-offset-anchor"); + child.offset = OffsetSpec::Relative(0); + child.level = 1; + + let buffer = [0x00u8, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00]; + let rules = vec![offset_rule(5, "mark", vec![child])]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + + assert!( + matches + .iter() + .any(|m| m.message == "child-at-offset-anchor"), + "child of offset rule must resolve against offset's anchor (5); got {matches:?}" + ); +} + +#[test] +fn test_offset_does_not_advance_anchor_for_continuation_siblings() { + // Regression guard for the libmagic continuation-sibling anchor + // semantic: two CHILD siblings at the same level resolve `&N` + // against the parent-level anchor, not against the previous + // sibling's advance. This is gated on `recursion_depth > 0`; + // top-level siblings still chain (see + // `relative_anchor_can_decrease_...` in the relative-offset + // integration tests). + // + // Parent `byte` at offset 0 matches 0x01 -> anchor = 1. Two + // child siblings at &0 must both read buffer[1] = 0x42. If the + // first child incorrectly advanced the anchor to 2, the second + // would read buffer[2] = 0x00 and miss. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x01), + message: "parent".to_string(), + children: vec![ + MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x42), + message: "sibling-1".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }, + MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x42), + message: "sibling-2".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }, + ], + level: 0, + strength_modifier: None, + }; + + let buffer = [0x01u8, 0x42, 0x00, 0x00]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["parent", "sibling-1", "sibling-2"], + "both continuation siblings must resolve against parent anchor (1); \ + if sibling-1 advanced the anchor to 2, sibling-2 would read \ + buffer[2]=0x00 and fail" + ); +} + +#[test] +fn test_offset_sets_sibling_matched() { + // An offset rule match suppresses a following `default` sibling -- + // same discipline as any other matching rule. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let rules = vec![ + offset_rule(0, "offset-match", vec![]), + default_rule("DEFAULT-SUPPRESSED", vec![]), + ]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["offset-match"], + "default must be suppressed when offset sibling matched; got {matches:?}" + ); +} + +// ======================================================================= +// evaluate_children_or_warn graceful-error helper (issue #42 close-out) +// ======================================================================= + +#[test] +fn test_evaluate_children_or_warn_swallows_buffer_overrun_keeps_parent_match() { + // Regression guard for the extracted `evaluate_children_or_warn` + // helper: a child with an absolute offset past the buffer end must + // produce a `BufferOverrun` that is swallowed (warn-logged) rather + // than propagated. The parent match must still appear in the + // results. Covers the graceful-skip arm for all four dispatch + // sites (Default/Indirect/Offset/Use) via the Offset arm -- they + // all delegate to the same helper. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Child rule at absolute offset 1000 reads a byte -- far past the + // tiny buffer we supply. The helper should catch the BufferOverrun + // and warn-log, not fail the evaluation. 
+ let child = MagicRule { + offset: OffsetSpec::Absolute(1000), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x00), + message: "unreachable-child".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = offset_rule(0, "parent-offset-match", vec![child]); + + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &[0u8; 4], &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["parent-offset-match"], + "parent match must survive a child's BufferOverrun; child must be silently skipped, got {matches:?}" + ); +} diff --git a/src/evaluator/engine/tests/meta_use_tests.rs b/src/evaluator/engine/tests/meta_use_tests.rs new file mode 100644 index 0000000..7dfd73f --- /dev/null +++ b/src/evaluator/engine/tests/meta_use_tests.rs @@ -0,0 +1,402 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Tests for `MetaType::Use` dispatch and the subroutine `base_offset` +//! biasing that `use`-site evaluation depends on. +//! +//! Helpers (`use_rule`, `use_rule_at`, `build_name_table`, `byte_eq_rule`, +//! `make_context_with_env`) live in the parent `tests/mod.rs` module so +//! the companion `meta_default_clear_indirect_tests` and +//! `meta_offset_tests` submodules can share them. + +use super::*; + +#[test] +fn test_use_known_name_evaluates_subroutine() { + // The subroutine `part2` reads byte 3 and expects 0x42. 
+ let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(3), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x42), + message: "sub-match".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("part2", subroutine)]); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0x00u8, 0x00, 0x00, 0x42, 0x00]; + let rules = vec![use_rule("part2")]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "subroutine should produce exactly one match" + ); + assert_eq!(matches[0].message, "sub-match"); +} + +#[test] +fn test_use_unknown_name_returns_no_match() { + // Empty name table so the lookup fails; the evaluator should not panic + // and should produce zero matches. + let table = NameTable::empty(); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0x00u8, 0x42]; + let rules = vec![use_rule("nonexistent")]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!(matches.is_empty(), "unknown name should yield no matches"); +} + +#[test] +fn test_use_without_rule_env_returns_no_match() { + // A default context has no rule_env attached; `use` rules should be + // silent no-ops in that case rather than returning an error or panicking. + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let buffer = [0x00u8, 0x42]; + let rules = vec![use_rule("part2")]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.is_empty(), + "Use with no rule_env should produce no matches" + ); +} + +#[test] +fn test_use_recursion_limit() { + // Build a mutually-recursive pair: subroutine A calls B, B calls A. + // With the default recursion limit, this should surface as + // `RecursionLimitExceeded` rather than a stack overflow. 
+ let a_body = vec![use_rule("b")]; + let b_body = vec![use_rule("a")]; + let table = build_name_table(vec![("a", a_body), ("b", b_body)]); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0u8; 8]; + let rules = vec![use_rule("a")]; + let result = evaluate_rules(&rules, &buffer, &mut context); + assert!( + matches!( + result, + Err(LibmagicError::EvaluationError( + crate::error::EvaluationError::RecursionLimitExceeded { .. } + )) + ), + "mutual recursion through use must surface RecursionLimitExceeded, got {result:?}" + ); +} + +#[test] +fn test_use_child_rules_evaluated_after_subroutine() { + // `Use` itself does not expose a visible RuleMatch today, so we cover + // the "subroutine matches come first" invariant by verifying that the + // subroutine's match appears in the output and is followed by a + // sibling rule's match in the surrounding scope. + // + // `EvaluationConfig::default()` sets `stop_at_first_match = true`, which + // (correctly, after the Comment 2 fix) short-circuits sibling iteration + // once the `use` path produces a match. To exercise the ordering + // invariant between the subroutine and its sibling we opt into the + // "completeness" semantics by disabling first-match short-circuit. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "sub-head".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("sub", subroutine)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + let buffer = [0xAAu8, 0xBB, 0xCC]; + let sibling = MagicRule { + offset: OffsetSpec::Absolute(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xBB), + message: "sibling".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + let rules = vec![use_rule("sub"), sibling]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!(matches.len(), 2); + assert_eq!(matches[0].message, "sub-head"); + assert_eq!(matches[1].message, "sibling"); +} + +#[test] +fn test_use_stop_at_first_match_short_circuits_siblings() { + // Comment 2 regression guard: with the default + // `stop_at_first_match = true` config, a successful `use` subroutine + // must prevent later sibling top-level rules from being evaluated, + // matching the short-circuit semantics every other rule kind obeys. 
+ let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "sub-head".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("sub", subroutine)]); + let mut context = make_context_with_env(table, &[]); + + let buffer = [0xAAu8, 0xBB, 0xCC]; + let sibling = MagicRule { + offset: OffsetSpec::Absolute(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xBB), + message: "sibling".to_string(), + children: vec![], + level: 0, + strength_modifier: None, + }; + let rules = vec![use_rule("sub"), sibling]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 1, + "stop-at-first-match must halt sibling iteration once the use path produces a match" + ); + assert_eq!(matches[0].message, "sub-head"); +} + +#[test] +fn test_use_rule_children_are_evaluated() { + // Comment 1 regression guard: a `use` rule with its own children must + // descend into those children after the subroutine runs, so that + // libmagic chains like `>>0 use part2` followed by continuation rules + // continue producing matches in document order. + let subroutine = vec![MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xAA), + message: "sub-head".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }]; + let table = build_name_table(vec![("sub", subroutine)]); + // Disable stop-at-first-match so both the subroutine and the child + // rule are visible in the match vector. 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + let child = MagicRule { + offset: OffsetSpec::Absolute(1), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0xBB), + message: "use-child".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let mut use_with_child = use_rule("sub"); + use_with_child.children = vec![child]; + + let buffer = [0xAAu8, 0xBB, 0xCC]; + let matches = evaluate_rules(&[use_with_child], &buffer, &mut context).unwrap(); + assert_eq!( + matches.len(), + 2, + "use rule's own children must run after the subroutine" + ); + assert_eq!(matches[0].message, "sub-head"); + assert_eq!(matches[1].message, "use-child"); +} + +#[test] +fn test_name_rule_leaked_is_noop() { + // Programmatic consumers may construct a Name rule directly and pass + // it to the evaluator (e.g. property tests). The evaluator must not + // panic; it should instead treat the rule as a silent no-op. + let leaked = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Name("orphan".to_string())), + op: Operator::Equal, + value: Value::Uint(0), + message: String::new(), + children: vec![], + level: 0, + strength_modifier: None, + }; + let mut context = EvaluationContext::new(EvaluationConfig::default()); + let matches = evaluate_rules(&[leaked], &[0u8; 4], &mut context).unwrap(); + assert!(matches.is_empty(), "leaked Name rule should be a no-op"); +} + +// ======================================================================= +// Subroutine base_offset biasing (issue #42 -- use-site offset +// propagation). Critical coverage per post-PR code review. 
+// ======================================================================= + +#[test] +fn test_use_subroutine_absolute_offset_biased_by_use_site() { + // Regression guard: if `SubroutineScope::enter` fails to seed + // `base_offset` with the use-site offset, a subroutine rule at + // `Absolute(0)` will read from buffer[0] instead of + // buffer[use_site]. This test proves the bias is active by + // placing distinct magic bytes at two different positions and + // verifying that the subroutine reads the use-site one. + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Subroutine body: a single rule reading at Absolute(0). Without + // base_offset biasing this resolves to file position 0. With + // biasing it resolves to the use-site (position 8 in this test). + let subroutine_body = vec![byte_eq_rule(0, 0x42, "sub-match-at-base")]; + let name_table = build_name_table(vec![("sub", subroutine_body)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(name_table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + + // Use-site at offset 8. buffer[0] = 0x00 (would fail with bias + // missing); buffer[8] = 0x42 (required for bias-active success). + let mut buffer = vec![0u8; 16]; + buffer[8] = 0x42; + + let mut context = EvaluationContext::new(config).with_rule_env(env); + let rules = vec![use_rule_at("sub", 8)]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.iter().any(|m| m.message == "sub-match-at-base"), + "subroutine rule at Absolute(0) must be biased by use-site offset 8 \ + -- reading buffer[8] = 0x42. If bias missing, reads buffer[0] = 0x00 \ + and the test fails. got {matches:?}" + ); +} + +#[test] +fn test_use_subroutine_relative_offset_unaffected_by_use_site() { + // Companion to the bias test above: `Relative(N)` is resolved + // against `last_match_end`, which `SubroutineScope` also seeds + // to the use-site. 
We verify the Relative rule reads at the + // use-site + N, NOT at use-site + base + N (which would be a + // double-bias bug). + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Subroutine body: a Relative(0) rule that reads at the + // use-site (seeded via last_match_end). + let mut rel_rule = byte_eq_rule(0, 0x42, "rel-sub-match"); + rel_rule.offset = OffsetSpec::Relative(0); + let subroutine_body = vec![rel_rule]; + let name_table = build_name_table(vec![("rsub", subroutine_body)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(name_table), + root_rules: std::sync::Arc::from(&[] as &[MagicRule]), + }); + + let mut buffer = vec![0u8; 16]; + buffer[5] = 0x42; + + let mut context = EvaluationContext::new(config).with_rule_env(env); + let rules = vec![use_rule_at("rsub", 5)]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + assert!( + matches.iter().any(|m| m.message == "rel-sub-match"), + "subroutine Relative(0) rule must read at use-site (5) via last_match_end, \ + not at use-site+base (10). got {matches:?}" + ); +} + +#[test] +fn test_continuation_sibling_reset_after_bytes_consumed() { + // Stronger regression guard than + // `test_offset_does_not_advance_anchor_for_continuation_siblings`, + // which used Relative(0) on both siblings and was trivially + // non-advancing. Here the first sibling consumes actual bytes, + // so if the `is_child_sibling_list` reset is removed the second + // sibling would read from a shifted anchor. + // + // Parent byte at 0 matches 0x01 -> anchor = 1. + // Sibling-1: Long at &0 (resolves to 1, reads 4 bytes, + // advances anchor to 5 WITHOUT the reset). + // Sibling-2: Byte at &0 (must resolve to 1, not 5). 
+ let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + let long_sibling = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Long { + endian: crate::parser::ast::Endianness::Little, + signed: false, + }, + op: Operator::Equal, + value: Value::Uint(0x0403_0201), + message: "long-sibling".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let byte_sibling = MagicRule { + offset: OffsetSpec::Relative(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + // buffer[1] = 0x01 -- if reset is removed, sibling-2 reads + // buffer[5] instead and matches 0x42 (wrong!). + value: Value::Uint(0x01), + message: "byte-sibling-sees-parent-anchor".to_string(), + children: vec![], + level: 1, + strength_modifier: None, + }; + let parent = MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(0x01), + message: "parent".to_string(), + children: vec![long_sibling, byte_sibling], + level: 0, + strength_modifier: None, + }; + + // buffer[0]=0x01 parent; buffer[1..5]=0x01,0x02,0x03,0x04 long + // match; buffer[5]=0x42 bait for missing-reset failure. + let buffer = [0x01u8, 0x01, 0x02, 0x03, 0x04, 0x42, 0x00]; + let mut context = EvaluationContext::new(config); + let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); + let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); + assert_eq!( + messages, + vec!["parent", "long-sibling", "byte-sibling-sees-parent-anchor"], + "byte-sibling must read buffer[1]=0x01 via parent-level anchor reset; \ + if reset is missing it reads buffer[5]=0x42 and test fails. 
got {matches:?}" + ); +} diff --git a/src/evaluator/engine/tests.rs b/src/evaluator/engine/tests/mod.rs similarity index 71% rename from src/evaluator/engine/tests.rs rename to src/evaluator/engine/tests/mod.rs index a5b2bff..802259a 100644 --- a/src/evaluator/engine/tests.rs +++ b/src/evaluator/engine/tests/mod.rs @@ -2632,7 +2632,7 @@ fn test_search_parent_relative_child_at_positive_offset() { } // ============================================================================= -// Tests for MetaType::Use semantics +// Meta-type test helpers (shared across meta_use_tests, meta_default_clear_indirect_tests, meta_offset_tests) // ============================================================================= use crate::evaluator::RuleEnvironment; @@ -2643,7 +2643,10 @@ use crate::parser::name_table::NameTable; /// root-rules list. The root-rules list is retained for parity with the /// `RuleEnvironment` shape even though `MetaType::Use` itself does not /// consult it. -fn make_context_with_env(name_table: NameTable, root_rules: &[MagicRule]) -> EvaluationContext { +pub(super) fn make_context_with_env( + name_table: NameTable, + root_rules: &[MagicRule], +) -> EvaluationContext { let env = std::sync::Arc::new(RuleEnvironment { name_table: std::sync::Arc::new(name_table), root_rules: std::sync::Arc::from(root_rules), @@ -2653,14 +2656,14 @@ fn make_context_with_env(name_table: NameTable, root_rules: &[MagicRule]) -> Eva /// Minimal helper: wrap a `TypeKind::Meta(MetaType::Use(name))` rule at /// offset 0 with the given `message` and empty child list. -fn use_rule(name: &str) -> MagicRule { +pub(super) fn use_rule(name: &str) -> MagicRule { use_rule_at(name, 0) } /// Build a `Use` rule at a specific use-site offset. Used by tests /// that need to prove subroutine `base_offset` biasing actually /// depends on the use-site value. 
-fn use_rule_at(name: &str, offset: i64) -> MagicRule { +pub(super) fn use_rule_at(name: &str, offset: i64) -> MagicRule { MagicRule { offset: OffsetSpec::Absolute(offset), typ: TypeKind::Meta(MetaType::Use(name.to_string())), @@ -2674,7 +2677,7 @@ fn use_rule_at(name: &str, offset: i64) -> MagicRule { } /// Construct a name table from `(name, subroutine_rules)` pairs. -fn build_name_table(entries: Vec<(&str, Vec)>) -> NameTable { +pub(super) fn build_name_table(entries: Vec<(&str, Vec)>) -> NameTable { // Build via the extraction helper so the table construction matches the // real parser path. Wrap each entry in a Name rule whose `children` are // the subroutine body. @@ -2695,254 +2698,8 @@ fn build_name_table(entries: Vec<(&str, Vec)>) -> NameTable { table } -#[test] -fn test_use_known_name_evaluates_subroutine() { - // The subroutine `part2` reads byte 3 and expects 0x42. - let subroutine = vec![MagicRule { - offset: OffsetSpec::Absolute(3), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "sub-match".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }]; - let table = build_name_table(vec![("part2", subroutine)]); - let mut context = make_context_with_env(table, &[]); - - let buffer = [0x00u8, 0x00, 0x00, 0x42, 0x00]; - let rules = vec![use_rule("part2")]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert_eq!( - matches.len(), - 1, - "subroutine should produce exactly one match" - ); - assert_eq!(matches[0].message, "sub-match"); -} - -#[test] -fn test_use_unknown_name_returns_no_match() { - // Empty name table so the lookup fails; the evaluator should not panic - // and should produce zero matches. 
- let table = NameTable::empty(); - let mut context = make_context_with_env(table, &[]); - - let buffer = [0x00u8, 0x42]; - let rules = vec![use_rule("nonexistent")]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert!(matches.is_empty(), "unknown name should yield no matches"); -} - -#[test] -fn test_use_without_rule_env_returns_no_match() { - // A default context has no rule_env attached; `use` rules should be - // silent no-ops in that case rather than returning an error or panicking. - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let buffer = [0x00u8, 0x42]; - let rules = vec![use_rule("part2")]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert!( - matches.is_empty(), - "Use with no rule_env should produce no matches" - ); -} - -#[test] -fn test_use_recursion_limit() { - // Build a mutually-recursive pair: subroutine A calls B, B calls A. - // With the default recursion limit, this should surface as - // `RecursionLimitExceeded` rather than a stack overflow. - let a_body = vec![use_rule("b")]; - let b_body = vec![use_rule("a")]; - let table = build_name_table(vec![("a", a_body), ("b", b_body)]); - let mut context = make_context_with_env(table, &[]); - - let buffer = [0u8; 8]; - let rules = vec![use_rule("a")]; - let result = evaluate_rules(&rules, &buffer, &mut context); - assert!( - matches!( - result, - Err(LibmagicError::EvaluationError( - crate::error::EvaluationError::RecursionLimitExceeded { .. } - )) - ), - "mutual recursion through use must surface RecursionLimitExceeded, got {result:?}" - ); -} - -#[test] -fn test_use_child_rules_evaluated_after_subroutine() { - // `Use` itself does not expose a visible RuleMatch today, so we cover - // the "subroutine matches come first" invariant by verifying that the - // subroutine's match appears in the output and is followed by a - // sibling rule's match in the surrounding scope. 
- // - // `EvaluationConfig::default()` sets `stop_at_first_match = true`, which - // (correctly, after the Comment 2 fix) short-circuits sibling iteration - // once the `use` path produces a match. To exercise the ordering - // invariant between the subroutine and its sibling we opt into the - // "completeness" semantics by disabling first-match short-circuit. - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let subroutine = vec![MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xAA), - message: "sub-head".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }]; - let table = build_name_table(vec![("sub", subroutine)]); - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(table), - root_rules: std::sync::Arc::from(&[] as &[MagicRule]), - }); - let mut context = EvaluationContext::new(config).with_rule_env(env); - - let buffer = [0xAAu8, 0xBB, 0xCC]; - let sibling = MagicRule { - offset: OffsetSpec::Absolute(1), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xBB), - message: "sibling".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - }; - let rules = vec![use_rule("sub"), sibling]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert_eq!(matches.len(), 2); - assert_eq!(matches[0].message, "sub-head"); - assert_eq!(matches[1].message, "sibling"); -} - -#[test] -fn test_use_stop_at_first_match_short_circuits_siblings() { - // Comment 2 regression guard: with the default - // `stop_at_first_match = true` config, a successful `use` subroutine - // must prevent later sibling top-level rules from being evaluated, - // matching the short-circuit semantics every other rule kind obeys. 
- let subroutine = vec![MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xAA), - message: "sub-head".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }]; - let table = build_name_table(vec![("sub", subroutine)]); - let mut context = make_context_with_env(table, &[]); - - let buffer = [0xAAu8, 0xBB, 0xCC]; - let sibling = MagicRule { - offset: OffsetSpec::Absolute(1), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xBB), - message: "sibling".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - }; - let rules = vec![use_rule("sub"), sibling]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert_eq!( - matches.len(), - 1, - "stop-at-first-match must halt sibling iteration once the use path produces a match" - ); - assert_eq!(matches[0].message, "sub-head"); -} - -#[test] -fn test_use_rule_children_are_evaluated() { - // Comment 1 regression guard: a `use` rule with its own children must - // descend into those children after the subroutine runs, so that - // libmagic chains like `>>0 use part2` followed by continuation rules - // continue producing matches in document order. - let subroutine = vec![MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xAA), - message: "sub-head".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }]; - let table = build_name_table(vec![("sub", subroutine)]); - // Disable stop-at-first-match so both the subroutine and the child - // rule are visible in the match vector. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(table), - root_rules: std::sync::Arc::from(&[] as &[MagicRule]), - }); - let mut context = EvaluationContext::new(config).with_rule_env(env); - - let child = MagicRule { - offset: OffsetSpec::Absolute(1), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xBB), - message: "use-child".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }; - let mut use_with_child = use_rule("sub"); - use_with_child.children = vec![child]; - - let buffer = [0xAAu8, 0xBB, 0xCC]; - let matches = evaluate_rules(&[use_with_child], &buffer, &mut context).unwrap(); - assert_eq!( - matches.len(), - 2, - "use rule's own children must run after the subroutine" - ); - assert_eq!(matches[0].message, "sub-head"); - assert_eq!(matches[1].message, "use-child"); -} - -#[test] -fn test_name_rule_leaked_is_noop() { - // Programmatic consumers may construct a Name rule directly and pass - // it to the evaluator (e.g. property tests). The evaluator must not - // panic; it should instead treat the rule as a silent no-op. 
- let leaked = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Meta(MetaType::Name("orphan".to_string())), - op: Operator::Equal, - value: Value::Uint(0), - message: String::new(), - children: vec![], - level: 0, - strength_modifier: None, - }; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&[leaked], &[0u8; 4], &mut context).unwrap(); - assert!(matches.is_empty(), "leaked Name rule should be a no-op"); -} - -// ============================================================================= -// MetaType::Default / Clear / Indirect tests -// ============================================================================= - /// Build a `Default` rule with the given message and (optional) children. -fn default_rule(message: &str, children: Vec) -> MagicRule { +pub(super) fn default_rule(message: &str, children: Vec) -> MagicRule { MagicRule { offset: OffsetSpec::Absolute(0), typ: TypeKind::Meta(MetaType::Default), @@ -2957,7 +2714,7 @@ fn default_rule(message: &str, children: Vec) -> MagicRule { /// Build a `Clear` rule. Carries no message in the magic file syntax, but the /// AST requires a message field. -fn clear_rule() -> MagicRule { +pub(super) fn clear_rule() -> MagicRule { MagicRule { offset: OffsetSpec::Absolute(0), typ: TypeKind::Meta(MetaType::Clear), @@ -2971,7 +2728,7 @@ fn clear_rule() -> MagicRule { } /// Build a single byte-equality rule at `offset` for `value`. 
-fn byte_eq_rule(offset: i64, value: u64, message: &str) -> MagicRule { +pub(super) fn byte_eq_rule(offset: i64, value: u64, message: &str) -> MagicRule { MagicRule { offset: OffsetSpec::Absolute(offset), typ: TypeKind::Byte { signed: false }, @@ -2984,173 +2741,8 @@ fn byte_eq_rule(offset: i64, value: u64, message: &str) -> MagicRule { } } -#[test] -fn test_default_fires_when_no_sibling_matched() { - let rules = vec![default_rule("DEFAULT-FIRES", vec![])]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert_eq!( - matches.len(), - 1, - "default with no prior sibling match should fire" - ); - assert_eq!(matches[0].message, "DEFAULT-FIRES"); -} - -#[test] -fn test_default_skipped_when_sibling_matched() { - // Disable stop-at-first-match so we can see whether the default would - // have fired or not. - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let rules = vec![ - byte_eq_rule(0, 0xAA, "real-match"), - default_rule("DEFAULT-SKIPPED", vec![]), - ]; - let mut context = EvaluationContext::new(config); - let buffer = [0xAAu8, 0xBB]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert_eq!( - matches.len(), - 1, - "default after a successful sibling should not fire" - ); - assert_eq!(matches[0].message, "real-match"); -} - -#[test] -fn test_default_fires_only_once() { - // Two consecutive default rules: the first sets sibling_matched, so - // the second must not fire. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let rules = vec![ - default_rule("FIRST-DEFAULT", vec![]), - default_rule("SECOND-DEFAULT", vec![]), - ]; - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert_eq!( - matches.len(), - 1, - "only the first default should fire when no real sibling matched" - ); - assert_eq!(matches[0].message, "FIRST-DEFAULT"); -} - -#[test] -fn test_default_children_evaluated() { - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let child = byte_eq_rule(0, 0xAA, "default-child"); - let rules = vec![default_rule("PARENT-DEFAULT", vec![child])]; - let mut context = EvaluationContext::new(config); - let buffer = [0xAAu8, 0xBB]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert_eq!( - matches.len(), - 2, - "default rule's children must be evaluated when the default fires" - ); - assert_eq!(matches[0].message, "PARENT-DEFAULT"); - assert_eq!(matches[1].message, "default-child"); -} - -#[test] -fn test_clear_resets_sibling_matched() { - // Sequence: byte-match, default-skipped, clear, default-fires. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let rules = vec![ - byte_eq_rule(0, 0xAA, "byte-match"), - default_rule("DEFAULT-SKIPPED", vec![]), - clear_rule(), - default_rule("DEFAULT-FIRES-AFTER-CLEAR", vec![]), - ]; - let mut context = EvaluationContext::new(config); - let buffer = [0xAAu8, 0xBB]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert_eq!( - matches.len(), - 2, - "clear must reset sibling_matched so a later default fires" - ); - assert_eq!(matches[0].message, "byte-match"); - assert_eq!(matches[1].message, "DEFAULT-FIRES-AFTER-CLEAR"); -} - -#[test] -fn test_clear_at_top_is_noop() { - let rules = vec![clear_rule(), default_rule("AFTER-CLEAR", vec![])]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert_eq!( - matches.len(), - 1, - "clear at top of list is a no-op; default after still fires" - ); - assert_eq!(matches[0].message, "AFTER-CLEAR"); -} - -#[test] -fn test_clear_does_not_produce_match() { - let rules = vec![clear_rule()]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert!(matches.is_empty(), "clear alone must produce no match"); -} - -#[test] -fn test_default_clear_per_level_isolation() { - // Parent has its own sibling_matched flag. The child list runs with a - // fresh flag, so a child-level `default` must fire even though the - // parent's flag is true. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xAA), - message: "parent-match".to_string(), - children: vec![ - byte_eq_rule(1, 0xBB, "child-byte-match"), - default_rule("CHILD-DEFAULT-SKIPPED", vec![]), - clear_rule(), - default_rule("CHILD-DEFAULT-AFTER-CLEAR", vec![]), - ], - level: 0, - strength_modifier: None, - }; - let mut context = EvaluationContext::new(config); - let buffer = [0xAAu8, 0xBB]; - let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); - - // Expected order: parent-match, child-byte-match, CHILD-DEFAULT-AFTER-CLEAR - let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); - assert_eq!( - messages, - vec![ - "parent-match", - "child-byte-match", - "CHILD-DEFAULT-AFTER-CLEAR" - ], - "child-level sibling_matched must be isolated from parent-level state" - ); -} - /// Build an `Indirect` rule at `offset` with optional children. -fn indirect_rule(offset: i64, message: &str, children: Vec) -> MagicRule { +pub(super) fn indirect_rule(offset: i64, message: &str, children: Vec) -> MagicRule { MagicRule { offset: OffsetSpec::Absolute(offset), typ: TypeKind::Meta(MetaType::Indirect), @@ -3163,111 +2755,9 @@ fn indirect_rule(offset: i64, message: &str, children: Vec) -> MagicR } } -#[test] -fn test_indirect_evaluates_root_rules_at_offset() { - // Root rules: detect a "ZIP-like" header (0x50 0x4b) at offset 0 of the - // sub-buffer. The indirect rule fires at offset 4 of the outer buffer, - // which means the sub-buffer starts at byte 4. Place 0x50 0x4b there. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let root_rule = byte_eq_rule(0, 0x50, "ZIP-like-header"); - let root_rules: Vec = vec![root_rule]; - - // Build an environment where root_rules is the same as the rules we - // dispatch into. - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(NameTable::empty()), - root_rules: std::sync::Arc::from(root_rules.as_slice()), - }); - let mut context = EvaluationContext::new(config).with_rule_env(env); - - // Buffer: ELF magic at offset 0, ZIP-like at offset 4. The indirect - // rule is the trigger; the root re-entry detects 0x50 at sub-buffer 0. - let buffer = [0x7fu8, 0x45, 0x4c, 0x46, 0x50, 0x4b, 0x03, 0x04]; - let rules = vec![indirect_rule(4, "indirect-trigger", vec![])]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - - assert!( - matches.iter().any(|m| m.message == "ZIP-like-header"), - "indirect must dispatch root rules against the sub-buffer at offset 4; got {matches:?}" - ); -} - -#[test] -fn test_indirect_out_of_bounds_is_noop() { - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(NameTable::empty()), - root_rules: std::sync::Arc::from(&[byte_eq_rule(0, 0x00, "root")] as &[MagicRule]), - }); - let mut context = EvaluationContext::new(config).with_rule_env(env); - - let buffer = [0u8; 4]; - // Indirect at offset 100, which is well past the 4-byte buffer. 
- let rules = vec![indirect_rule(100, "indirect-oob", vec![])]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert!( - matches.is_empty(), - "indirect past buffer end must be a graceful no-op" - ); -} - -#[test] -fn test_indirect_without_env_is_noop() { - // Property tests synthesize Indirect rules without an attached - // RuleEnvironment, so this path must be a graceful no-op (matching the - // `Use`-without-env contract). The engine logs at `debug!` rather than - // panicking via `debug_assert!` to preserve the never-panics invariant - // exercised by `prop_arbitrary_rule_evaluation_never_panics`. - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let buffer = [0u8; 4]; - let rules = vec![indirect_rule(0, "indirect-no-env", vec![])]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert!( - matches.is_empty(), - "indirect without env must produce no matches" - ); -} - -#[test] -fn test_indirect_recursion_limit() { - // Root rules contain an indirect rule that points back to offset 0, - // creating an infinite re-entry chain. Must surface as - // `RecursionLimitExceeded`, not stack overflow. - let inner_indirect = indirect_rule(0, "recursive-indirect", vec![]); - let root_rules: Vec = vec![inner_indirect]; - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(NameTable::empty()), - root_rules: std::sync::Arc::from(root_rules.as_slice()), - }); - let mut context = EvaluationContext::new(EvaluationConfig::default()).with_rule_env(env); - - let buffer = [0u8; 8]; - let rules = vec![indirect_rule(0, "outer-indirect", vec![])]; - let result = evaluate_rules(&rules, &buffer, &mut context); - assert!( - matches!( - result, - Err(LibmagicError::EvaluationError( - crate::error::EvaluationError::RecursionLimitExceeded { .. 
} - )) - ), - "infinite indirect recursion must surface RecursionLimitExceeded, got {result:?}" - ); -} - -// ======================================================================= -// MetaType::Offset dispatch (issue #42) -// ======================================================================= - /// Build an `Offset` rule at `offset` with an `x` (`AnyValue`) operator and /// the given message. Mirrors `default_rule`/`indirect_rule` helpers. -fn offset_rule(offset: i64, message: &str, children: Vec) -> MagicRule { +pub(super) fn offset_rule(offset: i64, message: &str, children: Vec) -> MagicRule { MagicRule { offset: OffsetSpec::Absolute(offset), typ: TypeKind::Meta(MetaType::Offset), @@ -3280,387 +2770,10 @@ fn offset_rule(offset: i64, message: &str, children: Vec) -> MagicRul } } -#[test] -fn test_offset_emits_match_with_resolved_position() { - let rules = vec![offset_rule(5, "pos=%lld", vec![])]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 10], &mut context).unwrap(); - assert_eq!(matches.len(), 1, "offset rule must emit exactly one match"); - assert_eq!(matches[0].offset, 5, "match.offset is the resolved offset"); - assert_eq!( - matches[0].value, - Value::Uint(5), - "match.value carries the resolved offset for format substitution" - ); - assert_eq!(matches[0].message, "pos=%lld"); -} - -#[test] -fn test_offset_at_zero() { - // Regression guard: offset 0 must still produce a match (not be - // indistinguishable from "no match"). - let rules = vec![offset_rule(0, "top", vec![])]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert_eq!(matches.len(), 1); - assert_eq!(matches[0].value, Value::Uint(0)); -} - -#[test] -fn test_offset_out_of_bounds_graceful_skip() { - // Offset past the end of the buffer is a data-dependent skip, not an - // error. 
Matches the Indirect dispatch's graceful-skip discipline. - let rules = vec![offset_rule(1_000_000, "unreachable", vec![])]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert!( - matches.is_empty(), - "offset past buffer end must produce no match" - ); -} - -#[test] -fn test_offset_non_x_operator_is_skipped() { - // magic(5) only allows `x` on an `offset` rule. Anything else is - // semantically undefined -> debug-log + skip. - let mut rule = offset_rule(0, "bogus", vec![]); - rule.op = Operator::Equal; - rule.value = Value::Uint(5); - let rules = vec![rule]; - let mut context = EvaluationContext::new(EvaluationConfig::default()); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - assert!( - matches.is_empty(), - "offset rule with non-AnyValue operator must be skipped" - ); -} - -#[test] -fn test_offset_evaluates_children() { - // A child byte rule at offset 0 runs AFTER the parent offset rule - // fires. The child's own offset is resolved independently. - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let mut parent = offset_rule( - 0, - "parent-offset", - vec![byte_eq_rule(0, 0x42, "child-byte")], - ); - // Child level must be deeper than parent per MagicRule::validate. - parent.children[0].level = 1; - let buffer = [0x42u8, 0x00, 0x00]; - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); - let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); - assert_eq!(messages, vec!["parent-offset", "child-byte"]); -} - -#[test] -fn test_offset_advances_anchor_for_children() { - // An offset rule at position 5 advances `last_match_end` to 5 *for its - // children* -- but NOT for sibling rules at the same level. 
This - // matches libmagic's continuation-level semantics: each sibling at - // level L resolves `&N` against the parent-level anchor, not against - // the previous sibling's advance. See the `entry_anchor` discipline - // in `evaluate_rules`. - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - - // A child of the offset rule uses &0 to resolve at the offset rule's - // resolved position (5). buffer[5] = 0x42. - let mut child = byte_eq_rule(0, 0x42, "child-at-offset-anchor"); - child.offset = OffsetSpec::Relative(0); - child.level = 1; - - let buffer = [0x00u8, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00]; - let rules = vec![offset_rule(5, "mark", vec![child])]; - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - - assert!( - matches - .iter() - .any(|m| m.message == "child-at-offset-anchor"), - "child of offset rule must resolve against offset's anchor (5); got {matches:?}" - ); -} - -#[test] -fn test_offset_does_not_advance_anchor_for_continuation_siblings() { - // Regression guard for the libmagic continuation-sibling anchor - // semantic: two CHILD siblings at the same level resolve `&N` - // against the parent-level anchor, not against the previous - // sibling's advance. This is gated on `recursion_depth > 0`; - // top-level siblings still chain (see - // `relative_anchor_can_decrease_...` in the relative-offset - // integration tests). - // - // Parent `byte` at offset 0 matches 0x01 -> anchor = 1. Two - // child siblings at &0 must both read buffer[1] = 0x42. If the - // first child incorrectly advanced the anchor to 2, the second - // would read buffer[2] = 0x00 and miss. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x01), - message: "parent".to_string(), - children: vec![ - MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "sibling-1".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }, - MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "sibling-2".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }, - ], - level: 0, - strength_modifier: None, - }; - - let buffer = [0x01u8, 0x42, 0x00, 0x00]; - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); - let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); - assert_eq!( - messages, - vec!["parent", "sibling-1", "sibling-2"], - "both continuation siblings must resolve against parent anchor (1); \ - if sibling-1 advanced the anchor to 2, sibling-2 would read \ - buffer[2]=0x00 and fail" - ); -} - -#[test] -fn test_offset_sets_sibling_matched() { - // An offset rule match suppresses a following `default` sibling -- - // same discipline as any other matching rule. 
- let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - let rules = vec![ - offset_rule(0, "offset-match", vec![]), - default_rule("DEFAULT-SUPPRESSED", vec![]), - ]; - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&rules, &[0u8; 4], &mut context).unwrap(); - let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); - assert_eq!( - messages, - vec!["offset-match"], - "default must be suppressed when offset sibling matched; got {matches:?}" - ); -} - -// ======================================================================= -// Subroutine base_offset biasing (issue #42 -- use-site offset -// propagation). Critical coverage per post-PR code review. -// ======================================================================= - -#[test] -fn test_use_subroutine_absolute_offset_biased_by_use_site() { - // Regression guard: if `SubroutineScope::enter` fails to seed - // `base_offset` with the use-site offset, a subroutine rule at - // `Absolute(0)` will read from buffer[0] instead of - // buffer[use_site]. This test proves the bias is active by - // placing distinct magic bytes at two different positions and - // verifying that the subroutine reads the use-site one. - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - - // Subroutine body: a single rule reading at Absolute(0). Without - // base_offset biasing this resolves to file position 0. With - // biasing it resolves to the use-site (position 8 in this test). - let subroutine_body = vec![byte_eq_rule(0, 0x42, "sub-match-at-base")]; - let name_table = build_name_table(vec![("sub", subroutine_body)]); - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(name_table), - root_rules: std::sync::Arc::from(&[] as &[MagicRule]), - }); - - // Use-site at offset 8. 
buffer[0] = 0x00 (would fail with bias - // missing); buffer[8] = 0x42 (required for bias-active success). - let mut buffer = vec![0u8; 16]; - buffer[8] = 0x42; - - let mut context = EvaluationContext::new(config).with_rule_env(env); - let rules = vec![use_rule_at("sub", 8)]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert!( - matches.iter().any(|m| m.message == "sub-match-at-base"), - "subroutine rule at Absolute(0) must be biased by use-site offset 8 \ - -- reading buffer[8] = 0x42. If bias missing, reads buffer[0] = 0x00 \ - and the test fails. got {matches:?}" - ); -} - -#[test] -fn test_use_subroutine_relative_offset_unaffected_by_use_site() { - // Companion to the bias test above: `Relative(N)` is resolved - // against `last_match_end`, which `SubroutineScope` also seeds - // to the use-site. We verify the Relative rule reads at the - // use-site + N, NOT at use-site + base + N (which would be a - // double-bias bug). - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - - // Subroutine body: a Relative(0) rule that reads at the - // use-site (seeded via last_match_end). - let mut rel_rule = byte_eq_rule(0, 0x42, "rel-sub-match"); - rel_rule.offset = OffsetSpec::Relative(0); - let subroutine_body = vec![rel_rule]; - let name_table = build_name_table(vec![("rsub", subroutine_body)]); - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(name_table), - root_rules: std::sync::Arc::from(&[] as &[MagicRule]), - }); - - let mut buffer = vec![0u8; 16]; - buffer[5] = 0x42; - - let mut context = EvaluationContext::new(config).with_rule_env(env); - let rules = vec![use_rule_at("rsub", 5)]; - let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); - assert!( - matches.iter().any(|m| m.message == "rel-sub-match"), - "subroutine Relative(0) rule must read at use-site (5) via last_match_end, \ - not at use-site+base (10). 
got {matches:?}" - ); -} - -#[test] -fn test_continuation_sibling_reset_after_bytes_consumed() { - // Stronger regression guard than - // `test_offset_does_not_advance_anchor_for_continuation_siblings`, - // which used Relative(0) on both siblings and was trivially - // non-advancing. Here the first sibling consumes actual bytes, - // so if the `is_child_sibling_list` reset is removed the second - // sibling would read from a shifted anchor. - // - // Parent byte at 0 matches 0x01 -> anchor = 1. - // Sibling-1: Long at &0 (resolves to 1, reads 4 bytes, - // advances anchor to 5 WITHOUT the reset). - // Sibling-2: Byte at &0 (must resolve to 1, not 5). - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - - let long_sibling = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Long { - endian: crate::parser::ast::Endianness::Little, - signed: false, - }, - op: Operator::Equal, - value: Value::Uint(0x0403_0201), - message: "long-sibling".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }; - let byte_sibling = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - // buffer[1] = 0x01 -- if reset is removed, sibling-2 reads - // buffer[5] instead and matches 0x42 (wrong!). - value: Value::Uint(0x01), - message: "byte-sibling-sees-parent-anchor".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x01), - message: "parent".to_string(), - children: vec![long_sibling, byte_sibling], - level: 0, - strength_modifier: None, - }; - - // buffer[0]=0x01 parent; buffer[1..5]=0x01,0x02,0x03,0x04 long - // match; buffer[5]=0x42 bait for missing-reset failure. 
- let buffer = [0x01u8, 0x01, 0x02, 0x03, 0x04, 0x42, 0x00]; - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&[parent], &buffer, &mut context).unwrap(); - let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); - assert_eq!( - messages, - vec!["parent", "long-sibling", "byte-sibling-sees-parent-anchor"], - "byte-sibling must read buffer[1]=0x01 via parent-level anchor reset; \ - if reset is missing it reads buffer[5]=0x42 and test fails. got {matches:?}" - ); -} - -// ======================================================================= -// evaluate_children_or_warn graceful-error helper (issue #42 close-out) -// ======================================================================= - -#[test] -fn test_evaluate_children_or_warn_swallows_buffer_overrun_keeps_parent_match() { - // Regression guard for the extracted `evaluate_children_or_warn` - // helper: a child with an absolute offset past the buffer end must - // produce a `BufferOverrun` that is swallowed (warn-logged) rather - // than propagated. The parent match must still appear in the - // results. Covers the graceful-skip arm for all four dispatch - // sites (Default/Indirect/Offset/Use) via the Offset arm -- they - // all delegate to the same helper. - let config = EvaluationConfig { - stop_at_first_match: false, - ..EvaluationConfig::default() - }; - - // Child rule at absolute offset 1000 reads a byte -- far past the - // tiny buffer we supply. The helper should catch the BufferOverrun - // and warn-log, not fail the evaluation. 
- let child = MagicRule { - offset: OffsetSpec::Absolute(1000), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x00), - message: "unreachable-child".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - }; - let parent = offset_rule(0, "parent-offset-match", vec![child]); - - let mut context = EvaluationContext::new(config); - let matches = evaluate_rules(&[parent], &[0u8; 4], &mut context).unwrap(); - let messages: Vec<&str> = matches.iter().map(|m| m.message.as_str()).collect(); - assert_eq!( - messages, - vec!["parent-offset-match"], - "parent match must survive a child's BufferOverrun; child must be silently skipped, got {matches:?}" - ); -} +// Submodule declarations +#[cfg(test)] +mod meta_default_clear_indirect_tests; +#[cfg(test)] +mod meta_offset_tests; +#[cfg(test)] +mod meta_use_tests; From 4c576c71bd0154801433e91e427d02c24fd38a46 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 23 Apr 2026 21:20:56 -0400 Subject: [PATCH 16/16] fix: address PR #230 review feedback (round 4) Sixteen unresolved review threads across correctness, performance, documentation, and test organization: - Evaluator: AnchorScope now saves/restores both last_match_end AND base_offset so `indirect` inside a `use` subroutine re-enters root rules with base_offset=0 (RFn1). - Offset resolution: resolve_offset_with_base switches to checked arithmetic and maps overflow to InvalidOffset rather than saturating into a spurious BufferOverrun (RU0). - MagicDatabase: drop the redundant rules Vec field; root_rules is the sole storage built via Arc::from(rules.into_boxed_slice()) to avoid Arc::from-slice clones (Vdy, VeD). - Format: Conv::Char uses pad_non_numeric (space-only) so %03c follows POSIX instead of zero-padding; Conv::Char doc now says full 0x00-0xff range via Latin-1 (VeM, RFn6). 
- Docs: MetaType::Offset added to architecture.md inline-dispatch list; evaluator.md dispatch wording separated from output formatting; evaluate_rules usage examples updated to pass &mut EvaluationContext; checklist bullet includes offset (RFnh, RFnl, RFnx, RFnz). - Grammar: meta-type directives reject attached operators (`default&0xf` no longer silently drops the mask) with a regression test (RUs). - Release workflow: dist-workspace.toml pin for actions/upload-artifact bumped from v7.0.0 to v7.0.1; release.yml regenerated via `dist generate` (RU7). - Property test: timeout doc comment rewritten to reflect that the test constructs an env-less context, so MetaType::Indirect/Use take the silent no-op path rather than recursing (VeW). - Name table: scrub_nested_names is now called with the actual parent level rather than parent.level + 1 (RFn-). - Test layout: engine test helpers moved to src/evaluator/engine/tests/helpers/meta.rs (RFn3); grammar meta-type parser tests moved to src/parser/grammar/tests/meta_types.rs (RFn8). 
Signed-off-by: UncleSp1d3r --- .github/workflows/release.yml | 10 +- dist-workspace.toml | 2 +- docs/src/architecture.md | 2 +- docs/src/evaluator.md | 27 ++- src/evaluator/engine/mod.rs | 24 +- src/evaluator/engine/tests/helpers/meta.rs | 138 +++++++++++ src/evaluator/engine/tests/helpers/mod.rs | 12 + .../meta_default_clear_indirect_tests.rs | 62 +++++ src/evaluator/engine/tests/mod.rs | 150 +----------- src/evaluator/offset/mod.rs | 58 ++++- src/lib.rs | 21 +- src/output/format.rs | 52 ++++- src/parser/grammar/mod.rs | 11 + src/parser/grammar/tests/meta_types.rs | 216 ++++++++++++++++++ src/parser/grammar/tests/mod.rs | 177 +------------- src/parser/name_table.rs | 4 +- src/tests.rs | 5 +- tests/property_tests.rs | 10 +- 18 files changed, 621 insertions(+), 360 deletions(-) create mode 100644 src/evaluator/engine/tests/helpers/meta.rs create mode 100644 src/evaluator/engine/tests/helpers/mod.rs create mode 100644 src/parser/grammar/tests/meta_types.rs diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cd56279..1561ce7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -66,7 +66,7 @@ jobs: shell: bash run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.31.0/cargo-dist-installer.sh | sh" - name: Cache dist - uses: actions/upload-artifact@v7.0.0 + uses: actions/upload-artifact@v7.0.1 with: name: cargo-dist-cache path: ~/.cargo/bin/dist @@ -82,7 +82,7 @@ jobs: cat plan-dist-manifest.json echo "manifest=$(jq -c "." 
plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v7.0.0 + uses: actions/upload-artifact@v7.0.1 with: name: artifacts-plan-dist-manifest path: plan-dist-manifest.json @@ -168,7 +168,7 @@ jobs: cp dist-manifest.json "$BUILD_MANIFEST_NAME" - name: "Upload artifacts" - uses: actions/upload-artifact@v7.0.0 + uses: actions/upload-artifact@v7.0.1 with: name: artifacts-build-local-${{ join(matrix.targets, '_') }} path: | @@ -233,7 +233,7 @@ jobs: find . -name '*.cdx.xml' | tee -a "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" - name: "Upload artifacts" - uses: actions/upload-artifact@v7.0.0 + uses: actions/upload-artifact@v7.0.1 with: name: artifacts-build-global path: | @@ -279,7 +279,7 @@ jobs: cat dist-manifest.json echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" - uses: actions/upload-artifact@v7.0.0 + uses: actions/upload-artifact@v7.0.1 with: # Overwrite the previous copy name: artifacts-dist-manifest diff --git a/dist-workspace.toml b/dist-workspace.toml index 7f91241..3fee717 100644 --- a/dist-workspace.toml +++ b/dist-workspace.toml @@ -53,4 +53,4 @@ publish-jobs = [ "homebrew" ] "actions/checkout" = "v6.0.2" "actions/download-artifact" = "3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c" "actions/attest-build-provenance" = "a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32" -"actions/upload-artifact" = "v7.0.0" +"actions/upload-artifact" = "v7.0.1" diff --git a/docs/src/architecture.md b/docs/src/architecture.md index e5d2892..9118656 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -172,7 +172,7 @@ The evaluator executes magic rules against file buffers to identify file types. - `mod.rs`: Public API surface (~720 lines) with `EvaluationContext`, `RuleMatch` types, and re-exports. 
Also defines `pub(crate) struct RuleEnvironment { root_rules, name_table }` — the optional environment threaded through `EvaluationContext::rule_env` so the engine can dispatch `MetaType::Use` and `MetaType::Indirect` without taking an extra parameter on every function. - `engine/`: Core evaluation engine submodule - - `mod.rs`: `evaluate_single_rule`, `evaluate_rules`, and `evaluate_rules_with_config` functions. Inline dispatch for `MetaType::Default`, `MetaType::Clear`, `MetaType::Use`, and `MetaType::Indirect` lives in the `evaluate_rules` loop body. + - `mod.rs`: `evaluate_single_rule`, `evaluate_rules`, and `evaluate_rules_with_config` functions. Inline dispatch for `MetaType::Default`, `MetaType::Clear`, `MetaType::Use`, `MetaType::Indirect`, and `MetaType::Offset` lives in the `evaluate_rules` loop body. - `tests.rs`: Engine unit tests - `types/`: Type interpretation submodule - `mod.rs`: Public API surface with `read_typed_value`, `coerce_value_to_type`, and type re-exports diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index ba048a2..38a3cec 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -369,7 +369,7 @@ Before calling `evaluate_single_rule_with_anchor` for a value-read rule, `evalua - **`MetaType::Use(name)`**: Looks up `name` in `RuleEnvironment::name_table`. On hit, evaluates the subroutine's child rules at the resolved offset, propagates their matches into the caller's match vector, then also evaluates the `use` rule's own `rule.children`. On miss, logs a `warn!` and returns `Ok(None)` (treated as non-match). - **`MetaType::Indirect`**: Resolves the rule's offset against the buffer, slices the buffer at that point, resets the `EvaluationContext` anchor to 0, calls `evaluate_rules` recursively with `RuleEnvironment::root_rules` (the complete top-level rule list), and then restores the caller's anchor on return. Recursion is bounded by `EvaluationConfig::max_recursion_depth`. 
- **`MetaType::Name`**: Unreachable after load-time extraction — `name` blocks are hoisted out of the rule list by `parser::name_table::extract_name_table` before the evaluator ever sees them. Defensive arm returns `Ok(None)` and emits a `debug!` rather than `debug_assert!` so that property tests synthesizing arbitrary `TypeKind` values do not break the never-panics invariant. -- **`MetaType::Offset`**: Resolves the rule's offset against the buffer and records a `RuleMatch` whose `value` is `Value::Uint(resolved_offset)`. The rule's `message` is then rendered through `format_magic_message`, which substitutes printf-style specifiers (`%lld`, `%d`) with that value. Used by magic fixtures that need to report "matched at offset N" in the output (e.g., GNU `file`'s `searchbug.magic` fixture). +- **`MetaType::Offset`**: Resolves the rule's offset against the buffer and records a `RuleMatch` whose `value` is `Value::Uint(resolved_offset)`. The evaluator stores the raw resolved offset as `value` without substituting any printf specifiers — printf substitution (`%lld`, `%d`, etc.) is performed later during output/message assembly by `format_magic_message` (called from `MagicDatabase::build_result`), not inside `evaluate_rules`. Used by magic fixtures that need to report "matched at offset N" in the output (e.g., GNU `file`'s `searchbug.magic` fixture). ```mermaid sequenceDiagram @@ -517,7 +517,7 @@ pub fn evaluate_single_rule( ### Usage Example ```rust -use libmagic_rs::{evaluate_rules, EvaluationConfig}; +use libmagic_rs::{evaluate_rules, EvaluationConfig, EvaluationContext}; use libmagic_rs::parser::parse_text_magic_file; // Parse magic rules @@ -534,7 +534,8 @@ let buffer = std::fs::read("sample.bin")?; // Evaluate with default config. The low-level `evaluate_rules` takes only // the top-level rules; `parsed.name_table` is handled by `MagicDatabase` // (see library-api.md) and is ignored here. 
-let matches = evaluate_rules(&parsed.rules, &buffer)?; +let mut ctx = EvaluationContext::new(EvaluationConfig::default()); +let matches = evaluate_rules(&parsed.rules, &buffer, &mut ctx)?; for m in matches { println!("Match at offset {}: {}", m.offset, m.message); @@ -544,7 +545,7 @@ for m in matches { **Example with comparison operators (v0.2.0+):** ```rust -use libmagic_rs::{evaluate_rules, EvaluationConfig}; +use libmagic_rs::{evaluate_rules, EvaluationConfig, EvaluationContext}; use libmagic_rs::parser::parse_text_magic_file; // Parse magic rule with comparison operator @@ -555,7 +556,8 @@ let magic_content = r#" let parsed = parse_text_magic_file(magic_content)?; let buffer = vec![0x0A, 0x00]; // Little-endian 10 -let matches = evaluate_rules(&parsed.rules, &buffer)?; +let mut ctx = EvaluationContext::new(EvaluationConfig::default()); +let matches = evaluate_rules(&parsed.rules, &buffer, &mut ctx)?; // Matches first rule (<100) assert_eq!(matches[0].message, "Small value detected"); @@ -564,7 +566,7 @@ assert_eq!(matches[0].message, "Small value detected"); **Example with floating-point types:** ```rust -use libmagic_rs::{evaluate_rules, EvaluationConfig}; +use libmagic_rs::{evaluate_rules, EvaluationConfig, EvaluationContext}; use libmagic_rs::parser::parse_text_magic_file; // Parse magic rule with float type @@ -576,7 +578,8 @@ let parsed = parse_text_magic_file(magic_content)?; // IEEE 754 little-endian representation of 3.14159f32 let buffer = vec![0xd0, 0x0f, 0x49, 0x40]; -let matches = evaluate_rules(&parsed.rules, &buffer)?; +let mut ctx = EvaluationContext::new(EvaluationConfig::default()); +let matches = evaluate_rules(&parsed.rules, &buffer, &mut ctx)?; assert_eq!(matches[0].message, "Pi constant detected"); ``` @@ -584,7 +587,7 @@ assert_eq!(matches[0].message, "Pi constant detected"); **Example with pstring types:** ```rust -use libmagic_rs::{evaluate_rules, EvaluationConfig}; +use libmagic_rs::{evaluate_rules, EvaluationConfig, 
EvaluationContext}; use libmagic_rs::parser::parse_text_magic_file; // Parse magic rules with pstring variants @@ -599,7 +602,8 @@ let parsed = parse_text_magic_file(magic_content)?; // 1-byte prefix: length=5, then "MAGIC" let buffer = b"\x05MAGIC"; -let matches = evaluate_rules(&parsed.rules, &buffer)?; +let mut ctx = EvaluationContext::new(EvaluationConfig::default()); +let matches = evaluate_rules(&parsed.rules, &buffer, &mut ctx)?; assert_eq!(matches[0].message, "Pascal string (1-byte prefix)"); // 2-byte big-endian prefix with /J flag: stored length 7 (includes 2-byte prefix), effective content 5 bytes @@ -608,7 +612,8 @@ let magic_content_j = r#" "#; let parsed_j = parse_text_magic_file(magic_content_j)?; let buffer_j = b"\x00\x07MAGIC"; // 2-byte BE prefix: value 7, minus 2 = 5 bytes of content -let matches_j = evaluate_rules(&parsed_j.rules, &buffer_j)?; +let mut ctx_j = EvaluationContext::new(EvaluationConfig::default()); +let matches_j = evaluate_rules(&parsed_j.rules, &buffer_j, &mut ctx_j)?; assert_eq!(matches_j[0].message, "JPEG-style pstring with self-inclusive length"); ``` @@ -628,7 +633,7 @@ assert_eq!(matches_j[0].message, "JPEG-style pstring with self-inclusive length" - [x] Relative offset support (GNU `file` anchor semantics, issue #38) - [x] Regex type support (binary-safe `regex::bytes::Regex` with `/c`, `/s`, `/l` flags and 8192-byte cap; unconditional `REG_NEWLINE`) - [x] Search type support (bounded literal pattern scan via `memchr::memmem::find` with mandatory `NonZeroUsize` range) -- [x] Meta-type directives: `default`, `clear`, `name`/`use` subroutines, `indirect` re-evaluation (issue #42) +- [x] Meta-type directives: `default`, `clear`, `name`/`use` subroutines, `indirect` re-evaluation, `offset` resolved-address reporting (issue #42) - [ ] Performance optimizations (rule ordering, caching) ## Performance Considerations diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index 19d1289..a88af0a 100644 --- 
a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -19,27 +19,38 @@ use super::{EvaluationContext, RecursionGuard, RuleMatch, offset, operators, typ use log::{debug, warn}; use std::sync::atomic::{AtomicBool, Ordering}; -/// RAII guard that saves the GNU `file` previous-match anchor on entry and -/// restores it on drop. +/// RAII guard that saves the GNU `file` previous-match anchor **and** +/// `base_offset` on entry and restores both on drop. /// /// `MetaType::Indirect` re-evaluates the root rule list at the resolved -/// offset, which means it must seed the anchor with that offset for the -/// nested call and then put the caller's anchor back when it returns. +/// offset. The re-entered rules are top-level-semantic (`base_offset=0`) +/// and must start with a fresh anchor (the resolved indirect offset). +/// When `indirect` fires inside a `MetaType::Use` subroutine, the outer +/// subroutine's non-zero `base_offset` would otherwise leak into the +/// root re-entry, causing every positive absolute offset in the re-entered +/// database to be biased by the outer use-site -- producing reads at the +/// wrong positions. Saving and restoring `base_offset` here prevents that. +/// /// Without an RAII wrapper, every early-return path inside the indirect -/// branch would have to remember to restore the anchor manually. +/// branch would have to remember to restore both fields manually. struct AnchorScope<'a> { context: &'a mut EvaluationContext, saved_anchor: usize, + saved_base: usize, } impl<'a> AnchorScope<'a> { - /// Save the current anchor and seed the context with `new_anchor`. + /// Save the current anchor and `base_offset`, then seed the context + /// with `new_anchor` and reset `base_offset` to 0. 
fn enter(context: &'a mut EvaluationContext, new_anchor: usize) -> Self { let saved_anchor = context.last_match_end(); + let saved_base = context.base_offset(); context.set_last_match_end(new_anchor); + context.set_base_offset(0); Self { context, saved_anchor, + saved_base, } } @@ -52,6 +63,7 @@ impl<'a> AnchorScope<'a> { impl Drop for AnchorScope<'_> { fn drop(&mut self) { self.context.set_last_match_end(self.saved_anchor); + self.context.set_base_offset(self.saved_base); } } diff --git a/src/evaluator/engine/tests/helpers/meta.rs b/src/evaluator/engine/tests/helpers/meta.rs new file mode 100644 index 0000000..b1a4ed6 --- /dev/null +++ b/src/evaluator/engine/tests/helpers/meta.rs @@ -0,0 +1,138 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Builders for meta-type-related `MagicRule`s and matching +//! `EvaluationContext`s. Used by `meta_use_tests`, +//! `meta_default_clear_indirect_tests`, and `meta_offset_tests`. + +use crate::evaluator::{EvaluationConfig, EvaluationContext, RuleEnvironment}; +use crate::parser::ast::{MagicRule, MetaType, OffsetSpec, Operator, TypeKind, Value}; +use crate::parser::name_table::NameTable; + +/// Build an `EvaluationContext` with the supplied name table and (optional) +/// root-rules list. The root-rules list is retained for parity with the +/// `RuleEnvironment` shape even though `MetaType::Use` itself does not +/// consult it. +pub fn make_context_with_env(name_table: NameTable, root_rules: &[MagicRule]) -> EvaluationContext { + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(name_table), + root_rules: std::sync::Arc::from(root_rules), + }); + EvaluationContext::new(EvaluationConfig::default()).with_rule_env(env) +} + +/// Minimal helper: wrap a `TypeKind::Meta(MetaType::Use(name))` rule at +/// offset 0 with a `use {name}` message and an empty child list.
+pub fn use_rule(name: &str) -> MagicRule { + use_rule_at(name, 0) +} + +/// Build a `Use` rule at a specific use-site offset. Used by tests +/// that need to prove subroutine `base_offset` biasing actually +/// depends on the use-site value. +pub fn use_rule_at(name: &str, offset: i64) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Meta(MetaType::Use(name.to_string())), + op: Operator::Equal, + value: Value::Uint(0), + message: format!("use {name}"), + children: vec![], + level: 0, + strength_modifier: None, + } +} + +/// Construct a name table from `(name, subroutine_rules)` pairs. +pub fn build_name_table(entries: Vec<(&str, Vec)>) -> NameTable { + // Build via the extraction helper so the table construction matches the + // real parser path. Wrap each entry in a Name rule whose `children` are + // the subroutine body. + let mut top = Vec::new(); + for (name, body) in entries { + top.push(MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Name(name.to_string())), + op: Operator::Equal, + value: Value::Uint(0), + message: String::new(), + children: body, + level: 0, + strength_modifier: None, + }); + } + let (_rules, table) = crate::parser::name_table::extract_name_table(top); + table +} + +/// Build a `Default` rule with the given message and (optional) children. +pub fn default_rule(message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Default), + op: Operator::Equal, + value: Value::Uint(0), + message: message.to_string(), + children, + level: 0, + strength_modifier: None, + } +} + +/// Build a `Clear` rule. Carries no message in the magic file syntax, but the +/// AST requires a message field. 
+pub fn clear_rule() -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(0), + typ: TypeKind::Meta(MetaType::Clear), + op: Operator::Equal, + value: Value::Uint(0), + message: String::new(), + children: vec![], + level: 0, + strength_modifier: None, + } +} + +/// Build a single byte-equality rule at `offset` for `value`. +pub fn byte_eq_rule(offset: i64, value: u64, message: &str) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Byte { signed: false }, + op: Operator::Equal, + value: Value::Uint(value), + message: message.to_string(), + children: vec![], + level: 0, + strength_modifier: None, + } +} + +/// Build an `Indirect` rule at `offset` with optional children. +pub fn indirect_rule(offset: i64, message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Meta(MetaType::Indirect), + op: Operator::Equal, + value: Value::Uint(0), + message: message.to_string(), + children, + level: 0, + strength_modifier: None, + } +} + +/// Build an `Offset` rule at `offset` with an `x` (`AnyValue`) operator and +/// the given message. Mirrors `default_rule`/`indirect_rule` helpers. +pub fn offset_rule(offset: i64, message: &str, children: Vec) -> MagicRule { + MagicRule { + offset: OffsetSpec::Absolute(offset), + typ: TypeKind::Meta(MetaType::Offset), + op: Operator::AnyValue, + value: Value::Uint(0), + message: message.to_string(), + children, + level: 0, + strength_modifier: None, + } +} diff --git a/src/evaluator/engine/tests/helpers/mod.rs b/src/evaluator/engine/tests/helpers/mod.rs new file mode 100644 index 0000000..67cc117 --- /dev/null +++ b/src/evaluator/engine/tests/helpers/mod.rs @@ -0,0 +1,12 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Shared test helpers for the engine test suite. +//! +//! The sibling `meta_*_tests` submodules pull everything they need from +//! 
`super::*` in `tests/mod.rs`; this directory splits those helpers out +//! of the oversized parent module so each concern lives in a focused +//! sub-file (per AGENTS.md `**/*.rs`: "Keep source files under 500-600 +//! lines; split larger files into focused modules."). + +pub mod meta; diff --git a/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs b/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs index fb23135..0a9744f 100644 --- a/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs +++ b/src/evaluator/engine/tests/meta_default_clear_indirect_tests.rs @@ -272,3 +272,65 @@ fn test_indirect_recursion_limit() { "infinite indirect recursion must surface RecursionLimitExceeded, got {result:?}" ); } + +/// Regression test for `RFn1`: `indirect` inside a `use` subroutine must reset +/// `base_offset` to 0 when re-entering root rules. +/// +/// Before the fix, `AnchorScope::enter` only saved/restored `last_match_end`. +/// When `indirect` fired inside a `use` subroutine that had set `base_offset` +/// to a non-zero use-site, the re-entered root rules inherited that +/// `base_offset`. Every positive `Absolute(N)` offset in the root rules was +/// then biased by the outer use-site offset, causing reads at the wrong +/// positions (e.g., a rule at `Absolute(0)` would read from `use-site + 0` +/// rather than from byte 0 of the sub-buffer). +/// +/// Layout of this test: +/// - Outer buffer: 8 bytes +/// - Bytes 0-3: dummy header (0x11 0x22 0x33 0x44) +/// - Bytes 4-7: embedded payload (0xAA 0xBB 0xCC 0xDD) +/// - A `use sub` rule at offset 4 dispatches a subroutine (`base_offset=4`). +/// - The subroutine contains an `indirect` rule also at offset 0 (use-site +/// relative, resolves to absolute 4 under `base_offset` biasing = byte 4). +/// - The root rules check byte 0 of the sub-buffer (== outer byte 4) for 0xAA. +/// - After the fix: root rules see `base_offset=0`, so `Absolute(0)` reads +/// `sub-buffer[0]` == 0xAA => match. 
+/// - Before the fix: root rules inherited `base_offset=4`, so `Absolute(0)` was +/// biased to effective offset 4 of the 4-byte sub-buffer => `BufferOverrun` => +/// no match. +#[test] +fn test_indirect_inside_use_subroutine_resets_base_offset() { + let config = EvaluationConfig { + stop_at_first_match: false, + ..EvaluationConfig::default() + }; + + // Root rules: check that byte 0 of the re-entered sub-buffer equals 0xAA. + // `Absolute(0)` must resolve to sub-buffer[0], NOT sub-buffer[4] (which + // would be the biased result if base_offset leaked from the use subroutine). + let root_rules: Vec = vec![byte_eq_rule(0, 0xAA, "root-payload-match")]; + + // Subroutine body: an `indirect` rule at offset 0 (relative to the use-site + // base). Inside a use subroutine with base_offset=4, `Absolute(0)` resolves + // to absolute 4 -- the start of the payload in the outer buffer. That slice + // becomes the sub-buffer passed to root rule re-entry. + let subroutine_body: Vec = vec![indirect_rule(0, "inner-indirect", vec![])]; + + let table = build_name_table(vec![("sub", subroutine_body)]); + let env = std::sync::Arc::new(RuleEnvironment { + name_table: std::sync::Arc::new(table), + root_rules: std::sync::Arc::from(root_rules.as_slice()), + }); + let mut context = EvaluationContext::new(config).with_rule_env(env); + + // Buffer: 4 dummy bytes, then 4 payload bytes starting with 0xAA. + let buffer = [0x11u8, 0x22, 0x33, 0x44, 0xAA, 0xBB, 0xCC, 0xDD]; + // `use sub` at use-site offset 4 -- sets base_offset=4 for the subroutine. 
+ let rules = vec![use_rule_at("sub", 4)]; + let matches = evaluate_rules(&rules, &buffer, &mut context).unwrap(); + + assert!( + matches.iter().any(|m| m.message == "root-payload-match"), + "indirect inside use must reset base_offset to 0 so root rules read from \ + sub-buffer[0], not sub-buffer[base+0]; got {matches:?}" + ); +} diff --git a/src/evaluator/engine/tests/mod.rs b/src/evaluator/engine/tests/mod.rs index 802259a..3d066e8 100644 --- a/src/evaluator/engine/tests/mod.rs +++ b/src/evaluator/engine/tests/mod.rs @@ -2631,144 +2631,18 @@ fn test_search_parent_relative_child_at_positive_offset() { assert_eq!(matches[1].message, "a after"); } -// ============================================================================= -// Meta-type test helpers (shared across meta_use_tests, meta_default_clear_indirect_tests, meta_offset_tests) -// ============================================================================= - -use crate::evaluator::RuleEnvironment; -use crate::parser::ast::MetaType; -use crate::parser::name_table::NameTable; - -/// Build an `EvaluationContext` with the supplied name table and (optional) -/// root-rules list. The root-rules list is retained for parity with the -/// `RuleEnvironment` shape even though `MetaType::Use` itself does not -/// consult it. -pub(super) fn make_context_with_env( - name_table: NameTable, - root_rules: &[MagicRule], -) -> EvaluationContext { - let env = std::sync::Arc::new(RuleEnvironment { - name_table: std::sync::Arc::new(name_table), - root_rules: std::sync::Arc::from(root_rules), - }); - EvaluationContext::new(EvaluationConfig::default()).with_rule_env(env) -} - -/// Minimal helper: wrap a `TypeKind::Meta(MetaType::Use(name))` rule at -/// offset 0 with the given `message` and empty child list. -pub(super) fn use_rule(name: &str) -> MagicRule { - use_rule_at(name, 0) -} - -/// Build a `Use` rule at a specific use-site offset. 
Used by tests -/// that need to prove subroutine `base_offset` biasing actually -/// depends on the use-site value. -pub(super) fn use_rule_at(name: &str, offset: i64) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(offset), - typ: TypeKind::Meta(MetaType::Use(name.to_string())), - op: Operator::Equal, - value: Value::Uint(0), - message: format!("use {name}"), - children: vec![], - level: 0, - strength_modifier: None, - } -} - -/// Construct a name table from `(name, subroutine_rules)` pairs. -pub(super) fn build_name_table(entries: Vec<(&str, Vec)>) -> NameTable { - // Build via the extraction helper so the table construction matches the - // real parser path. Wrap each entry in a Name rule whose `children` are - // the subroutine body. - let mut top = Vec::new(); - for (name, body) in entries { - top.push(MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Meta(MetaType::Name(name.to_string())), - op: Operator::Equal, - value: Value::Uint(0), - message: String::new(), - children: body, - level: 0, - strength_modifier: None, - }); - } - let (_rules, table) = crate::parser::name_table::extract_name_table(top); - table -} - -/// Build a `Default` rule with the given message and (optional) children. -pub(super) fn default_rule(message: &str, children: Vec) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Meta(MetaType::Default), - op: Operator::Equal, - value: Value::Uint(0), - message: message.to_string(), - children, - level: 0, - strength_modifier: None, - } -} - -/// Build a `Clear` rule. Carries no message in the magic file syntax, but the -/// AST requires a message field. -pub(super) fn clear_rule() -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Meta(MetaType::Clear), - op: Operator::Equal, - value: Value::Uint(0), - message: String::new(), - children: vec![], - level: 0, - strength_modifier: None, - } -} - -/// Build a single byte-equality rule at `offset` for `value`. 
-pub(super) fn byte_eq_rule(offset: i64, value: u64, message: &str) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(offset), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(value), - message: message.to_string(), - children: vec![], - level: 0, - strength_modifier: None, - } -} - -/// Build an `Indirect` rule at `offset` with optional children. -pub(super) fn indirect_rule(offset: i64, message: &str, children: Vec) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(offset), - typ: TypeKind::Meta(MetaType::Indirect), - op: Operator::Equal, - value: Value::Uint(0), - message: message.to_string(), - children, - level: 0, - strength_modifier: None, - } -} - -/// Build an `Offset` rule at `offset` with an `x` (`AnyValue`) operator and -/// the given message. Mirrors `default_rule`/`indirect_rule` helpers. -pub(super) fn offset_rule(offset: i64, message: &str, children: Vec) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(offset), - typ: TypeKind::Meta(MetaType::Offset), - op: Operator::AnyValue, - value: Value::Uint(0), - message: message.to_string(), - children, - level: 0, - strength_modifier: None, - } -} +// Shared test helpers have been extracted into the `helpers` sub-tree so +// this module stays focused on its own test wiring; the meta_* submodules +// continue to access helpers via `super::*` thanks to the glob re-export +// below. The three bare `use` items are for types that the submodules +// still reference directly (e.g. `MetaType::Default`, `RuleEnvironment { +// ... }` literal construction) and therefore must stay in this module's +// namespace for `super::*` to reach. 
+mod helpers; +pub(super) use crate::evaluator::RuleEnvironment; +pub(super) use crate::parser::ast::MetaType; +pub(super) use crate::parser::name_table::NameTable; +pub(super) use helpers::meta::*; // Submodule declarations #[cfg(test)] diff --git a/src/evaluator/offset/mod.rs b/src/evaluator/offset/mod.rs index 88c7d62..d4ef1a2 100644 --- a/src/evaluator/offset/mod.rs +++ b/src/evaluator/offset/mod.rs @@ -143,9 +143,24 @@ pub(crate) fn resolve_offset_with_base( // Negative values mean "from end" and should not be shifted // by the subroutine base. let effective = if *offset >= 0 { - let abs = usize::try_from(*offset).unwrap_or(usize::MAX); - let biased = base_offset.saturating_add(abs); - i64::try_from(biased).unwrap_or(i64::MAX) + // Use checked conversions so overflow is reported as + // InvalidOffset rather than silently producing a huge + // biased value that later surfaces as BufferOverrun. + let abs = usize::try_from(*offset).map_err(|_| { + LibmagicError::EvaluationError(crate::error::EvaluationError::InvalidOffset { + offset: *offset, + }) + })?; + let biased = base_offset + .checked_add(abs) + .ok_or(LibmagicError::EvaluationError( + crate::error::EvaluationError::InvalidOffset { offset: *offset }, + ))?; + i64::try_from(biased).map_err(|_| { + LibmagicError::EvaluationError(crate::error::EvaluationError::InvalidOffset { + offset: *offset, + }) + })? } else { *offset }; @@ -376,4 +391,41 @@ mod tests { assert_eq!(result, expected, "Failed for spec: {spec:?}"); } } + + /// Regression test for RU0: `base_offset + large_positive_absolute` that + /// overflows `usize` must produce `InvalidOffset`, not `BufferOverrun`. + /// + /// Before the fix, saturating arithmetic turned overflow into `usize::MAX` + /// (or `i64::MAX`), which then flowed into `resolve_absolute_offset` and + /// surfaced as a `BufferOverrun` at that giant offset -- losing the more + /// precise overflow signal. 
+ #[test] + fn test_resolve_offset_with_base_overflow_yields_invalid_offset() { + let buffer = b"0123456789ABCDEF"; // 16 bytes + // base_offset near usize::MAX combined with any positive Absolute + // must overflow. Use usize::MAX - 1 so that adding even 2 overflows. + let base = usize::MAX - 1; + let spec = OffsetSpec::Absolute(2); // base + 2 overflows usize + + let result = resolve_offset_with_base(&spec, buffer, 0, base); + assert!( + result.is_err(), + "overflow of base_offset + absolute must fail" + ); + match result.unwrap_err() { + LibmagicError::EvaluationError(crate::error::EvaluationError::InvalidOffset { + .. + }) => { + // Correct: overflow reported as InvalidOffset, not BufferOverrun. + } + LibmagicError::EvaluationError(crate::error::EvaluationError::BufferOverrun { + .. + }) => { + panic!( + "overflow of base_offset + absolute must be InvalidOffset, not BufferOverrun" + ); + } + other => panic!("unexpected error variant: {other:?}"), + } + } } diff --git a/src/lib.rs b/src/lib.rs index 746b192..54e571a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -150,15 +150,14 @@ impl From for LibmagicError { /// Main interface for magic rule database #[derive(Debug)] pub struct MagicDatabase { - rules: Vec, /// Named subroutine definitions extracted from magic file `name` rules, /// keyed by identifier. The evaluator consults this table when a rule of /// type `TypeKind::Meta(MetaType::Use(name))` is reached. name_table: std::sync::Arc, - /// Top-level rules retained as a shared immutable slice. Passed through - /// the evaluation context as part of the rule environment so future - /// whole-database operations (e.g. `indirect`) can re-enter at the root - /// without re-sorting or cloning the rule tree. + /// Top-level rules as a shared immutable slice. This is the primary rule + /// storage for the database. Passed through the evaluation context as part + /// of the rule environment so whole-database operations (e.g. 
`indirect`) + /// can re-enter at the root without re-sorting or cloning the rule tree. root_rules: std::sync::Arc<[MagicRule]>, config: EvaluationConfig, /// Optional path to the source magic file or directory from which rules were loaded. @@ -250,9 +249,9 @@ impl MagicDatabase { config.validate()?; let mut rules = crate::builtin_rules::get_builtin_rules(); crate::evaluator::strength::sort_rules_by_strength_recursive(&mut rules); - let root_rules: std::sync::Arc<[MagicRule]> = std::sync::Arc::from(rules.as_slice()); + let root_rules: std::sync::Arc<[MagicRule]> = + std::sync::Arc::from(rules.into_boxed_slice()); Ok(Self { - rules, name_table: std::sync::Arc::new(crate::parser::name_table::NameTable::empty()), root_rules, config, @@ -324,9 +323,9 @@ impl MagicDatabase { crate::evaluator::strength::sort_rules_by_strength_recursive(rules); }); - let root_rules: std::sync::Arc<[MagicRule]> = std::sync::Arc::from(rules.as_slice()); + let root_rules: std::sync::Arc<[MagicRule]> = + std::sync::Arc::from(rules.into_boxed_slice()); Ok(Self { - rules, name_table: std::sync::Arc::new(name_table), root_rules, config, @@ -475,7 +474,7 @@ impl MagicDatabase { // `evaluate_rules` returns `Ok(vec![])` for an empty rule list, // so no `is_empty()` guard is needed here. - let matches = evaluate_rules(&self.rules, buffer, &mut context)?; + let matches = evaluate_rules(&self.root_rules, buffer, &mut context)?; Ok(self.build_result(matches, file_size, start_time)) } @@ -515,7 +514,7 @@ impl MagicDatabase { metadata: EvaluationMetadata { file_size, evaluation_time_ms: start_time.elapsed().as_secs_f64() * 1000.0, - rules_evaluated: self.rules.len(), + rules_evaluated: self.root_rules.len(), magic_file: self.source_path.clone(), timed_out: false, }, diff --git a/src/output/format.rs b/src/output/format.rs index e0ee3aa..8e682da 100644 --- a/src/output/format.rs +++ b/src/output/format.rs @@ -146,7 +146,7 @@ enum Conv { Octal, /// `%s` -- string. 
Str, - /// `%c` -- single character (from an integer codepoint, ASCII range only). + /// `%c` -- single character (full 0x00-0xff byte range via Latin-1 code points). Char, /// `%%` -- literal percent. Percent, @@ -309,7 +309,11 @@ fn render(spec: &Spec, value: &Value, type_kind: &TypeKind) -> Option { // 2-byte UTF-8 encoding of that code point; consumers // iterating the returned bytes directly can recover the // original byte by re-encoding the code point as Latin-1. - Some(pad_numeric(&char::from(byte).to_string(), spec)) + // + // POSIX: the `0` flag is ignored for `%c` -- zero-padding only + // applies to numeric/float conversions. Always use space-padding + // for `%c`, matching C printf behavior. + Some(pad_non_numeric(&char::from(byte).to_string(), spec)) } } } @@ -411,6 +415,23 @@ fn render_prefixed_int(digits: &str, prefix: &str, spec: &Spec) -> String { } } +/// Apply width and alignment to a non-numeric rendered body using space-only padding. +/// +/// Used for `%c` (and any other non-numeric conversion where the POSIX `0` flag must be +/// ignored): `spec.zero_pad` has no effect. Width counts characters, not UTF-8 bytes. +fn pad_non_numeric(body: &str, spec: &Spec) -> String { + let pad = spec.width.saturating_sub(body.chars().count()); + if pad == 0 { + return body.to_string(); + } + let padding: String = std::iter::repeat_n(' ', pad).collect(); + if spec.left_align { + format!("{body}{padding}") + } else { + format!("{padding}{body}") + } +} + /// Apply width and padding to an already-rendered numeric body. /// /// For zero-padded right-aligned formatting, a leading `-` sign is kept at @@ -586,6 +607,33 @@ mod tests { fn test_char_substitution() { let out = format_magic_message("[%c]", &Value::Uint(u64::from(b'A')), &byte_t()); assert_eq!(out, "[A]"); + + // Full 0x00-0xff range: bytes >= 0x80 are embedded as Latin-1 code points.
+ let out = format_magic_message("%c", &Value::Uint(0xa9), &byte_t()); + assert_eq!(out, "\u{00a9}"); // U+00A9 COPYRIGHT SIGN + + // Width with space-padding (right-aligned). + let out = format_magic_message("%3c", &Value::Uint(u64::from(b'A')), &byte_t()); + assert_eq!(out, " A"); + + // Left-aligned width. + let out = format_magic_message("%-3c|", &Value::Uint(u64::from(b'A')), &byte_t()); + assert_eq!(out, "A |"); + } + + #[test] + fn test_char_zero_flag_ignored() { + // POSIX: the `0` flag is ignored for `%c` -- zero-padding applies only to + // numeric conversions. `%03c` must produce space-padded " A", not "00A". + // Regression guard: an earlier revision called `pad_numeric` for `Conv::Char`, + // which applied zero-padding and diverged from C printf semantics. + let out = format_magic_message("%03c", &Value::Uint(u64::from(b'A')), &byte_t()); + assert_eq!(out, " A", "%03c must use space-padding, not zero-padding"); + + // Combined zero and left-align: `-` overrides `0` for numerics; for %c + // `0` was never active, but `-` still triggers left-alignment. + let out = format_magic_message("%-03c|", &Value::Uint(u64::from(b'A')), &byte_t()); + assert_eq!(out, "A |", "%-03c must left-align with spaces"); } #[test] diff --git a/src/parser/grammar/mod.rs b/src/parser/grammar/mod.rs index a998873..bf05958 100644 --- a/src/parser/grammar/mod.rs +++ b/src/parser/grammar/mod.rs @@ -806,6 +806,17 @@ pub fn parse_magic_rule(input: &str) -> IResult<&str, MagicRule> { // already consumed their identifier operand, so the `x` stripping // is a no-op for them. if matches!(typ, TypeKind::Meta(_)) { + // Meta-type directives have no operand, so an attached operator + // like `default&0xf` is malformed — reject it here rather than + // silently dropping it on the floor. `name`/`use` short-circuit in + // `parse_type_and_operator` and never carry an attached op, so only + // `default`/`clear`/`indirect`/`offset` can trip this. 
+ if attached_op.is_some() { + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::Verify, + ))); + } let input = strip_optional_x_operator(input); let (input, message) = if input.trim().is_empty() { (input, String::new()) diff --git a/src/parser/grammar/tests/meta_types.rs b/src/parser/grammar/tests/meta_types.rs new file mode 100644 index 0000000..bfd557e --- /dev/null +++ b/src/parser/grammar/tests/meta_types.rs @@ -0,0 +1,216 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +use super::*; + +// Meta-type directive parsing tests +// +// Covers `default`, `clear`, `name`, `use`, `indirect`, and `offset`. Exercises +// the optional `x` (AnyValue) placeholder strip, `name`/`use` identifier +// validation, end-to-end text-magic-file hoisting into the name table, and +// the `searchbug.magic` fixture as a single-file acceptance check. + +#[test] +fn test_parse_magic_rule_meta_types() { + // Table: (input, expected_level, expected_typ, expected_message) + let cases: &[(&str, u32, TypeKind, &str)] = &[ + // `x` is the AnyValue operator; for meta types the parser strips + // it (with surrounding whitespace) before taking the rest of the + // line as the message. See `strip_optional_x_operator` in + // `parser/grammar/mod.rs`. Without that strip, rules like + // `>>&0 offset x at_offset %lld` would render as + // `x\tat_offset 11` and diverge from GNU `file` output. + ( + "0 default x msg", + 0, + TypeKind::Meta(MetaType::Default), + "msg", + ), + // And a message without a leading `x` passes through unchanged. 
+ ("0 default msg", 0, TypeKind::Meta(MetaType::Default), "msg"), + ("0 clear", 0, TypeKind::Meta(MetaType::Clear), ""), + ( + "0 offset x pos=%lld", + 0, + TypeKind::Meta(MetaType::Offset), + "pos=%lld", + ), + ("0 indirect x", 0, TypeKind::Meta(MetaType::Indirect), ""), + ( + "0 name part2", + 0, + TypeKind::Meta(MetaType::Name("part2".to_string())), + "", + ), + ( + "0 use part2", + 0, + TypeKind::Meta(MetaType::Use("part2".to_string())), + "", + ), + ("0 indirect", 0, TypeKind::Meta(MetaType::Indirect), ""), + ( + ">0 use part2", + 1, + TypeKind::Meta(MetaType::Use("part2".to_string())), + "", + ), + ]; + + for (input, expected_level, expected_typ, expected_message) in cases { + let (remaining, rule) = + parse_magic_rule(input).unwrap_or_else(|e| panic!("parse failed for {input:?}: {e:?}")); + assert_eq!(remaining, "", "remaining mismatch for {input:?}"); + assert_eq!(rule.level, *expected_level, "level mismatch for {input:?}"); + assert_eq!(rule.typ, *expected_typ, "typ mismatch for {input:?}"); + assert_eq!( + rule.message, *expected_message, + "message mismatch for {input:?}" + ); + } + + // Bare `name` / `use` with no identifier must be a parse error. + assert!( + parse_magic_rule("0 name").is_err(), + "bare `name` with no identifier must fail" + ); + assert!( + parse_magic_rule("0 use").is_err(), + "bare `use` with no identifier must fail" + ); +} + +#[test] +fn test_parse_magic_rule_meta_name_use_reject_malformed_identifiers() { + // Operator-adjacent continuation must reject the truncated identifier + // (`part2=foo`, `part2!bar`, etc.) rather than silently dropping the + // operator text into the message slot. 
+ let operator_cases = [ + "0 use part2=foo", + "0 use part2!=bar", + "0 use partfoo", + "0 name part&foo", + "0 name part^foo", + "0 name part~foo", + "0 name part|foo", + ]; + for input in operator_cases { + assert!( + parse_magic_rule(input).is_err(), + "operator-adjacent identifier must fail: {input:?}" + ); + } + + // Split identifiers with embedded whitespace (`part 2`) must also fail: + // the phase requires that `name`/`use` identifiers are terminated by + // whitespace followed only by EOL/EOF, with no trailing content. + let split_cases = [ + "0 name part 2", + "0 use part2 extra", + "0 name my id", + "0 use foo bar", + ]; + for input in split_cases { + assert!( + parse_magic_rule(input).is_err(), + "split identifier must fail: {input:?}" + ); + } + + // Sanity check: an identifier followed only by trailing whitespace still parses. + let (_, rule) = parse_magic_rule("0 name part2 ").expect("trailing ws is ok"); + assert_eq!( + rule.typ, + TypeKind::Meta(MetaType::Name("part2".to_string())) + ); + let (_, rule) = parse_magic_rule("0 use part2\t").expect("trailing tab is ok"); + assert_eq!(rule.typ, TypeKind::Meta(MetaType::Use("part2".to_string()))); +} + +#[test] +fn test_parse_magic_rule_meta_rejects_attached_operator() { + // Meta-type directives (`default`, `clear`, `indirect`, `offset`) have + // no operand, so an attached operator like `default&0xf` is malformed. + // Before the fix for RUs, `parse_attached_operator` consumed the `&` + // (and optional mask) and `parse_magic_rule` then silently dropped the + // captured operator on the floor, producing a rule whose `op` field + // would be `AnyValue` even though the source text contained a mask. + // `name`/`use` short-circuit in `parse_type_and_operator` and cannot + // reach the attached-op path, so they are not exercised here. Only + // `&`-attached forms round-trip through `parse_attached_operator`; + // other operator-adjacent glyphs (`^`, `~`, `>`, etc.) 
fall through + // to `parse_message` and are covered by message-parsing tests, not + // here. + let malformed = [ + "0 default&0xf msg", + "0 default& msg", + "0 clear&0xff", + "0 indirect&0x1", + "0 offset&0xf0 pos", + ]; + for input in malformed { + assert!( + parse_magic_rule(input).is_err(), + "meta-type with attached operator must fail: {input:?}" + ); + } +} + +#[test] +fn test_parse_text_magic_file_meta_roundtrip() { + // Build a small magic file that uses the six meta-types. The `name` + // block is a level-1 subroutine invoked by the top-level `use`, and + // `indirect` / `default` / `clear` / `offset` appear as sibling + // directives to exercise the parse path for each variant. + // + // NOTE: all rules use the SAME top-level indentation so + // build_rule_hierarchy treats them as siblings. Child rules would + // require a preceding parent match, which meta-types do not produce. + let magic = "\ +0 name subroutine +0 use subroutine +0 default default-msg +0 clear +0 indirect +"; + let parsed = + crate::parser::parse_text_magic_file(magic).expect("meta-type magic file should parse"); + // Only the `name` declaration is hoisted into the name table; the + // other four meta-types remain as top-level rules in document order. 
+ let rules = parsed.rules; + assert_eq!( + rules.len(), + 4, + "expected 4 top-level rules after name hoist, got {rules:?}" + ); + assert!( + parsed.name_table.get("subroutine").is_some(), + "name subroutine should be extracted into the name table" + ); + + assert_eq!( + rules[0].typ, + TypeKind::Meta(MetaType::Use("subroutine".to_string())) + ); + assert_eq!(rules[1].typ, TypeKind::Meta(MetaType::Default)); + assert_eq!(rules[2].typ, TypeKind::Meta(MetaType::Clear)); + assert_eq!(rules[3].typ, TypeKind::Meta(MetaType::Indirect)); +} + +#[test] +fn test_parse_text_magic_file_searchbug_fixture() { + // Regression: the canonical GNU `file` testfile `searchbug.magic` + // exercises the `offset` keyword, `&N` relative-offset syntax, the + // `name`/`use` subroutine machinery, and `search/N` -- every piece of + // this phase's acceptance surface in a single fixture. Previously the + // parser rejected the file on the unknown `offset` type keyword. + let magic = std::fs::read_to_string("third_party/tests/searchbug.magic") + .expect("searchbug.magic fixture must exist"); + let parsed = crate::parser::parse_text_magic_file(&magic) + .expect("searchbug.magic must parse end-to-end"); + assert!( + !parsed.rules.is_empty(), + "searchbug.magic must produce at least one top-level rule" + ); +} diff --git a/src/parser/grammar/tests/mod.rs b/src/parser/grammar/tests/mod.rs index 3e61e68..d4fe0eb 100644 --- a/src/parser/grammar/tests/mod.rs +++ b/src/parser/grammar/tests/mod.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 mod indirect_offset; +mod meta_types; use super::*; use crate::parser::ast::Endianness; @@ -2501,179 +2502,3 @@ fn test_parse_magic_rule_regex_and_search() { ); assert_eq!(rule.message, "version line"); } - -#[test] -fn test_parse_magic_rule_meta_types() { - // Table: (input, expected_level, expected_typ, expected_message) - let cases: &[(&str, u32, TypeKind, &str)] = &[ - // `x` is the AnyValue operator; for meta types the parser strips - // it (with 
surrounding whitespace) before taking the rest of the - // line as the message. See `strip_optional_x_operator` in - // `parser/grammar/mod.rs`. Without that strip, rules like - // `>>&0 offset x at_offset %lld` would render as - // `x\tat_offset 11` and diverge from GNU `file` output. - ( - "0 default x msg", - 0, - TypeKind::Meta(MetaType::Default), - "msg", - ), - // And a message without a leading `x` passes through unchanged. - ("0 default msg", 0, TypeKind::Meta(MetaType::Default), "msg"), - ("0 clear", 0, TypeKind::Meta(MetaType::Clear), ""), - ( - "0 offset x pos=%lld", - 0, - TypeKind::Meta(MetaType::Offset), - "pos=%lld", - ), - ("0 indirect x", 0, TypeKind::Meta(MetaType::Indirect), ""), - ( - "0 name part2", - 0, - TypeKind::Meta(MetaType::Name("part2".to_string())), - "", - ), - ( - "0 use part2", - 0, - TypeKind::Meta(MetaType::Use("part2".to_string())), - "", - ), - ("0 indirect", 0, TypeKind::Meta(MetaType::Indirect), ""), - ( - ">0 use part2", - 1, - TypeKind::Meta(MetaType::Use("part2".to_string())), - "", - ), - ]; - - for (input, expected_level, expected_typ, expected_message) in cases { - let (remaining, rule) = - parse_magic_rule(input).unwrap_or_else(|e| panic!("parse failed for {input:?}: {e:?}")); - assert_eq!(remaining, "", "remaining mismatch for {input:?}"); - assert_eq!(rule.level, *expected_level, "level mismatch for {input:?}"); - assert_eq!(rule.typ, *expected_typ, "typ mismatch for {input:?}"); - assert_eq!( - rule.message, *expected_message, - "message mismatch for {input:?}" - ); - } - - // Bare `name` / `use` with no identifier must be a parse error. 
- assert!( - parse_magic_rule("0 name").is_err(), - "bare `name` with no identifier must fail" - ); - assert!( - parse_magic_rule("0 use").is_err(), - "bare `use` with no identifier must fail" - ); -} - -#[test] -fn test_parse_magic_rule_meta_name_use_reject_malformed_identifiers() { - // Operator-adjacent continuation must reject the truncated identifier - // (`part2=foo`, `part2!bar`, etc.) rather than silently dropping the - // operator text into the message slot. - let operator_cases = [ - "0 use part2=foo", - "0 use part2!=bar", - "0 use part<foo", - "0 use part>foo", - "0 name part&foo", - "0 name part^foo", - "0 name part~foo", - "0 name part|foo", - ]; - for input in operator_cases { - assert!( - parse_magic_rule(input).is_err(), - "operator-adjacent identifier must fail: {input:?}" - ); - } - - // Split identifiers with embedded whitespace (`part 2`) must also fail: - // the phase requires that `name`/`use` identifiers are terminated by - // whitespace followed only by EOL/EOF, with no trailing content. - let split_cases = [ - "0 name part 2", - "0 use part2 extra", - "0 name my id", - "0 use foo bar", - ]; - for input in split_cases { - assert!( - parse_magic_rule(input).is_err(), - "split identifier must fail: {input:?}" - ); - } - - // Sanity check: an identifier followed only by trailing whitespace still parses. - let (_, rule) = parse_magic_rule("0 name part2 ").expect("trailing ws is ok"); - assert_eq!( - rule.typ, - TypeKind::Meta(MetaType::Name("part2".to_string())) - ); - let (_, rule) = parse_magic_rule("0 use part2\t").expect("trailing tab is ok"); - assert_eq!(rule.typ, TypeKind::Meta(MetaType::Use("part2".to_string()))); -} - -#[test] -fn test_parse_text_magic_file_meta_roundtrip() { - // Build a small magic file that uses the six meta-types. The `name` - // block is a level-1 subroutine invoked by the top-level `use`, and - // `indirect` / `default` / `clear` / `offset` appear as sibling - // directives to exercise the parse path for each variant. 
- // - // NOTE: all rules use the SAME top-level indentation so - // build_rule_hierarchy treats them as siblings. Child rules would - // require a preceding parent match, which meta-types do not produce. - let magic = "\ -0 name subroutine -0 use subroutine -0 default default-msg -0 clear -0 indirect -"; - let parsed = - crate::parser::parse_text_magic_file(magic).expect("meta-type magic file should parse"); - // Only the `name` declaration is hoisted into the name table; the - // other four meta-types remain as top-level rules in document order. - let rules = parsed.rules; - assert_eq!( - rules.len(), - 4, - "expected 4 top-level rules after name hoist, got {rules:?}" - ); - assert!( - parsed.name_table.get("subroutine").is_some(), - "name subroutine should be extracted into the name table" - ); - - assert_eq!( - rules[0].typ, - TypeKind::Meta(MetaType::Use("subroutine".to_string())) - ); - assert_eq!(rules[1].typ, TypeKind::Meta(MetaType::Default)); - assert_eq!(rules[2].typ, TypeKind::Meta(MetaType::Clear)); - assert_eq!(rules[3].typ, TypeKind::Meta(MetaType::Indirect)); -} - -#[test] -fn test_parse_text_magic_file_searchbug_fixture() { - // Regression: the canonical GNU `file` testfile `searchbug.magic` - // exercises the `offset` keyword, `&N` relative-offset syntax, the - // `name`/`use` subroutine machinery, and `search/N` -- every piece of - // this phase's acceptance surface in a single fixture. Previously the - // parser rejected the file on the unknown `offset` type keyword. 
- let magic = std::fs::read_to_string("third_party/tests/searchbug.magic") - .expect("searchbug.magic fixture must exist"); - let parsed = crate::parser::parse_text_magic_file(&magic) - .expect("searchbug.magic must parse end-to-end"); - assert!( - !parsed.rules.is_empty(), - "searchbug.magic must produce at least one top-level rule" - ); -} diff --git a/src/parser/name_table.rs b/src/parser/name_table.rs index 4cd27ec..816dc3c 100644 --- a/src/parser/name_table.rs +++ b/src/parser/name_table.rs @@ -109,7 +109,7 @@ pub(crate) fn extract_name_table(rules: Vec<MagicRule>) -> (Vec<MagicRule>, Name let children = scrub_nested_names(rule.children, rule.level); table.inner.insert(name.clone(), Arc::from(children)); } else { - let scrubbed_children = scrub_nested_names(rule.children, rule.level + 1); + let scrubbed_children = scrub_nested_names(rule.children, rule.level); kept.push(MagicRule { children: scrubbed_children, ..rule @@ -131,7 +131,7 @@ fn scrub_nested_names(children: Vec, parent_level: u32) -> Vec