From 97476a71c7134fbac1e5be1259c56f5dcc588c2b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 1 Mar 2026 22:07:51 -0500 Subject: [PATCH 01/12] docs(ci): update Mergify merge protections and bot PR handling Signed-off-by: UncleSp1d3r --- AGENTS.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index b6c411f1..2cf70e9d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -404,7 +404,7 @@ This pattern ensures build-time failures (e.g., invalid magic files) are properl ### Automated Checks -The project uses GitHub Actions CI with Mergify merge queue: +The project uses GitHub Actions CI with Mergify merge protections: 1. **Formatting**: `cargo fmt` for consistent code style 2. **Linting**: `cargo clippy -- -D warnings` for best practices @@ -435,9 +435,9 @@ All pull requests require review before merging. Reviews are performed by mainta - **Style**: Follows project conventions, passes `cargo fmt` and `cargo clippy -- -D warnings` - **Documentation**: Public APIs have rustdoc with examples, AGENTS.md updated if architecture changes -CI must pass before merge. Mergify merge queue and merge protections enforce these checks. -PRs enter the merge queue when approved (or automatically for release-plz/dependabot). -Mergify rebases against main, runs CI, and squash-merges on success. +CI must pass before merge. Mergify merge protections enforce these checks. +Bot PRs (dependabot, dosubot, release-plz) are auto-merged by Mergify when CI passes. +Human PRs are merged manually by maintainers. 
## Project Context @@ -516,8 +516,9 @@ This guide ensures consistent, high-quality development practices for the libmag ## Quick Reference -- Merging is managed by Mergify merge queue -- PRs are squash-merged after CI passes -- `.mergify.yml` configures merge queue rules, auto-queue, and merge protections +- Mergify auto-merges bot PRs (dependabot, dosubot, release-plz) via direct `merge` action (no merge queue) +- Human PRs are merged manually -- Mergify only provides merge protections for those +- `.mergify.yml` configures auto-merge rules and merge protections - `cargo deny check` uses `deny.toml` (default) -- do not specify a custom config path - `.github/workflows/release.yml` is auto-generated by cargo-dist -- do not modify manually - All `.rs` files must have copyright and SPDX headers (see any source file for format) From 52ebc5ab35d5c5808d75fdf96bdca2f8d9e0f202 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 1 Mar 2026 22:53:17 -0500 Subject: [PATCH 02/12] feat(types): implement 64-bit integer type with endian variants Signed-off-by: UncleSp1d3r --- AGENTS.md | 11 +- ROADMAP.md | 2 +- build.rs | 5 + src/build_helpers.rs | 26 +++ src/evaluator/strength.rs | 18 +++ src/evaluator/types.rs | 207 ++++++++++++++++++++++++ src/parser/ast.rs | 24 +++ src/parser/grammar.rs | 327 +++++++++++++++++++++++++++++++++----- tests/property_tests.rs | 37 ++--- 9 files changed, 587 insertions(+), 70 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 2cf70e9d..adfd2fe7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -68,6 +68,7 @@ pub enum TypeKind { Byte { signed: bool }, Short { endian: Endianness, signed: bool }, Long { endian: Endianness, signed: bool }, + Quad { endian: Endianness, signed: bool }, String { max_length: Option }, } pub enum Operator { @@ -144,7 +145,7 @@ pub fn evaluate_magic_rules( - Clippy pedantic lints are active (e.g., prefer `trailing_zeros()` over bitwise masks) - All public enum variants need `# Examples` rustdoc sections - Comparison operators 
share a `compare_values() -> Option` helper in `operators.rs` -- new comparison logic goes there, not in individual `apply_*` functions -- libmagic types are signed by default (`byte`, `short`, `long`); unsigned variants use `u` prefix (`ubyte`, `ushort`, `ulong`, etc.) +- libmagic types are signed by default (`byte`, `short`, `long`, `quad`); unsigned variants use `u` prefix (`ubyte`, `ushort`, `ulong`, `uquad`, etc.) ### Naming Conventions @@ -190,7 +191,7 @@ cargo test --doc # Test documentation examples ### Currently Implemented (v0.1.0) - **Offsets**: Absolute and from-end specifications (indirect and relative are parsed but not yet evaluated) -- **Types**: `byte`, `short`, `long`, `string` with endianness support; unsigned variants `ubyte`, `ushort`/`ubeshort`/`uleshort`, `ulong`/`ubelong`/`ulelong`; types are signed by default (libmagic-compatible) +- **Types**: `byte`, `short`, `long`, `quad`, `string` with endianness support; unsigned variants `ubyte`, `ushort`/`ubeshort`/`uleshort`, `ulong`/`ubelong`/`ulelong`, `uquad`/`ubequad`/`ulequad`; types are signed by default (libmagic-compatible) - **Operators**: `=` (equal), `!=` (not equal), `<` (less than), `>` (greater than), `<=` (less equal), `>=` (greater equal), `&` (bitwise AND with optional mask) - **Nested Rules**: Hierarchical rule evaluation with proper indentation - **String Matching**: Exact string matching with null-termination @@ -199,7 +200,7 @@ cargo test --doc # Test documentation examples - Bitwise XOR operator: `^` - Regex type: Pattern matching with binary-safe regex support -- Additional types: 64-bit integers, floats, doubles, dates +- Additional types: floats, doubles, dates - Search type: Multi-pattern string searching ### Future Enhancement: Binary-Safe Regex Handling @@ -222,7 +223,7 @@ impl BinaryRegex for regex::bytes::Regex { ### Type System - No regex/search pattern matching -- No 64-bit integer types (quad, qquad) +- 64-bit integer types: `quad`/`uquad`, `bequad`/`ubequad`, 
`lequad`/`ulequad` are implemented; `qquad` (128-bit) is not yet supported - No floating-point types (float, double, befloat, lefloat) - No date/time types (date, qdate, ldate, qldate) - String evaluation reads until first NUL or end-of-buffer by default; `max_length: Some(_)` is supported internally but no dedicated fixed-length string parser syntax exists yet @@ -308,7 +309,7 @@ sample.bin: ELF 64-bit LSB executable, x86-64, version 1 (SYSV) ### Adding New Type Support -> **Note:** Currently implemented types are `Byte`, `Short`, `Long`, and `String`. Regex and other advanced types are planned for future releases. +> **Note:** Currently implemented types are `Byte`, `Short`, `Long`, `Quad`, and `String`. Regex and other advanced types are planned for future releases. 1. Extend `TypeKind` enum in `src/parser/ast.rs` 2. Add parsing logic in `src/parser/grammar.rs` diff --git a/ROADMAP.md b/ROADMAP.md index a6e0ceee..4baa5d1f 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -25,7 +25,7 @@ See [GitHub Milestones](https://github.com/EvilBit-Labs/libmagic-rs/milestones) - [ ] Bitwise XOR, NOT, and any-value operators ([#35](https://github.com/EvilBit-Labs/libmagic-rs/issues/35)) - [ ] Indirect offset resolution ([#37](https://github.com/EvilBit-Labs/libmagic-rs/issues/37)) - [ ] Relative offset resolution ([#38](https://github.com/EvilBit-Labs/libmagic-rs/issues/38)) -- [ ] Quad (64-bit integer) type ([#36](https://github.com/EvilBit-Labs/libmagic-rs/issues/36)) +- [x] Quad (64-bit integer) type ([#36](https://github.com/EvilBit-Labs/libmagic-rs/issues/36)) ## v0.3.0 - Advanced Features diff --git a/build.rs b/build.rs index cd7b935a..ff3dcfd6 100644 --- a/build.rs +++ b/build.rs @@ -280,6 +280,11 @@ fn serialize_type_kind(typ: &TypeKind) -> String { serialize_endianness(*endian), signed ), + TypeKind::Quad { endian, signed } => format!( + "TypeKind::Quad {{ endian: {}, signed: {} }}", + serialize_endianness(*endian), + signed + ), TypeKind::String { max_length } => match 
max_length { Some(value) => { format!("TypeKind::String {{ max_length: Some({value}) }}") diff --git a/src/build_helpers.rs b/src/build_helpers.rs index 882755f7..4e8f6a40 100644 --- a/src/build_helpers.rs +++ b/src/build_helpers.rs @@ -222,6 +222,11 @@ fn serialize_type_kind(typ: &TypeKind) -> String { serialize_endianness(*endian), signed ), + TypeKind::Quad { endian, signed } => format!( + "TypeKind::Quad {{ endian: {}, signed: {} }}", + serialize_endianness(*endian), + signed + ), TypeKind::String { max_length } => match max_length { Some(value) => { format!("TypeKind::String {{ max_length: Some({value}) }}") @@ -477,6 +482,27 @@ mod tests { assert!(serialized.contains("signed: true")); } + #[test] + fn test_serialize_type_kind_quad() { + let typ = TypeKind::Quad { + endian: Endianness::Little, + signed: true, + }; + let serialized = serialize_type_kind(&typ); + assert!(serialized.contains("TypeKind::Quad")); + assert!(serialized.contains("Endianness::Little")); + assert!(serialized.contains("signed: true")); + + let typ2 = TypeKind::Quad { + endian: Endianness::Big, + signed: false, + }; + let serialized2 = serialize_type_kind(&typ2); + assert!(serialized2.contains("TypeKind::Quad")); + assert!(serialized2.contains("Endianness::Big")); + assert!(serialized2.contains("signed: false")); + } + #[test] fn test_serialize_type_kind_string() { let typ1 = TypeKind::String { max_length: None }; diff --git a/src/evaluator/strength.rs b/src/evaluator/strength.rs index 109540b2..285c5c2b 100644 --- a/src/evaluator/strength.rs +++ b/src/evaluator/strength.rs @@ -77,6 +77,8 @@ pub fn calculate_default_strength(rule: &MagicRule) -> i32 { // Add bonus for limited-length strings (more constrained match) if max_length.is_some() { base + 5 } else { base } } + // 64-bit integers are most specific among numerics + TypeKind::Quad { .. } => 16, // 32-bit integers are fairly specific TypeKind::Long { .. 
} => 15, // 16-bit integers are moderately specific @@ -409,6 +411,22 @@ mod tests { assert_eq!(strength, 35); } + #[test] + fn test_strength_type_quad() { + let rule = make_rule( + TypeKind::Quad { + endian: Endianness::Little, + signed: false, + }, + Operator::Equal, + OffsetSpec::Absolute(0), + Value::Uint(0), + ); + let strength = calculate_default_strength(&rule); + // Quad: 16, Equal: 10, Absolute: 10, Numeric: 0 = 36 + assert_eq!(strength, 36); + } + #[test] fn test_strength_type_string() { let rule = make_rule( diff --git a/src/evaluator/types.rs b/src/evaluator/types.rs index b0b3a790..125bf29d 100644 --- a/src/evaluator/types.rs +++ b/src/evaluator/types.rs @@ -218,6 +218,73 @@ pub fn read_long( } } +/// Safely reads a 64-bit integer from the buffer at the specified offset +/// +/// # Arguments +/// +/// * `buffer` - The byte buffer to read from +/// * `offset` - The offset position to read the 64-bit value from +/// * `endian` - The byte order to use for interpretation +/// * `signed` - Whether to interpret the value as signed or unsigned +/// +/// # Returns +/// +/// Returns `Ok(Value::Uint(value))` for unsigned values or `Ok(Value::Int(value))` for signed values +/// if the read is successful, or `Err(TypeReadError::BufferOverrun)` if there are insufficient bytes. 
+/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::evaluator::types::read_quad; +/// use libmagic_rs::parser::ast::{Endianness, Value}; +/// +/// let buffer = &[0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12]; +/// +/// // Read unsigned little-endian quad (0x1234567890abcdef) +/// let result = read_quad(buffer, 0, Endianness::Little, false).unwrap(); +/// assert_eq!(result, Value::Uint(0x1234_5678_90ab_cdef)); +/// +/// // Read signed little-endian quad (positive value fits in i64) +/// let result = read_quad(buffer, 0, Endianness::Little, true).unwrap(); +/// assert_eq!(result, Value::Int(0x1234_5678_90ab_cdef)); +/// +/// // Read signed little-endian quad with high bit set (sign extension) +/// let neg_buffer = &[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80]; +/// let result = read_quad(neg_buffer, 0, Endianness::Little, true).unwrap(); +/// assert_eq!(result, Value::Int(-9_223_372_036_854_775_808)); // i64::MIN +/// ``` +/// +/// # Errors +/// +/// Returns `TypeReadError::BufferOverrun` if there are fewer than 8 bytes available +/// starting at the specified offset. 
+pub fn read_quad( + buffer: &[u8], + offset: usize, + endian: Endianness, + signed: bool, +) -> Result { + let bytes = buffer + .get(offset..offset + 8) + .ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; + + let value = match endian { + Endianness::Little => LittleEndian::read_u64(bytes), + Endianness::Big => BigEndian::read_u64(bytes), + Endianness::Native => NativeEndian::read_u64(bytes), + }; + + if signed { + #[allow(clippy::cast_possible_wrap)] + Ok(Value::Int(value as i64)) + } else { + Ok(Value::Uint(value)) + } +} + /// Safely reads a null-terminated string from the buffer at the specified offset /// /// This function reads bytes from the buffer starting at the given offset until it encounters @@ -365,6 +432,7 @@ pub fn read_typed_value( TypeKind::Byte { signed } => read_byte(buffer, offset, *signed), TypeKind::Short { endian, signed } => read_short(buffer, offset, *endian, *signed), TypeKind::Long { endian, signed } => read_long(buffer, offset, *endian, *signed), + TypeKind::Quad { endian, signed } => read_quad(buffer, offset, *endian, *signed), TypeKind::String { max_length } => read_string(buffer, offset, *max_length), } } @@ -415,6 +483,11 @@ pub fn coerce_value_to_type(value: &Value, type_kind: &TypeKind) -> Value { #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] Value::Int(i64::from(*v as u32 as i32)) } + (Value::Uint(v), TypeKind::Quad { signed: true, .. 
}) if *v > i64::MAX as u64 => + { + #[allow(clippy::cast_possible_wrap)] + Value::Int(*v as i64) + } _ => value.clone(), } } @@ -785,6 +858,105 @@ mod tests { assert_eq!(zero_result, Value::Uint(0)); } + // Tests for read_quad function + #[test] + fn test_read_quad_endianness_and_signedness() { + let cases: Vec<(&[u8], Endianness, bool, Value)> = vec![ + // Little-endian unsigned + ( + &[0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12], + Endianness::Little, + false, + Value::Uint(0x1234_5678_90ab_cdef), + ), + // Big-endian unsigned + ( + &[0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef], + Endianness::Big, + false, + Value::Uint(0x1234_5678_90ab_cdef), + ), + // Little-endian signed positive + ( + &[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f], + Endianness::Little, + true, + Value::Int(i64::MAX), + ), + // Little-endian signed negative + ( + &[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80], + Endianness::Little, + true, + Value::Int(i64::MIN), + ), + // Big-endian signed negative (-1) + ( + &[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff], + Endianness::Big, + true, + Value::Int(-1), + ), + // Unsigned max value + ( + &[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff], + Endianness::Little, + false, + Value::Uint(u64::MAX), + ), + // Zero + ( + &[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], + Endianness::Little, + false, + Value::Uint(0), + ), + ]; + for (buffer, endian, signed, expected) in cases { + let result = read_quad(buffer, 0, endian, signed).unwrap(); + assert_eq!(result, expected, "endian={endian:?}, signed={signed}"); + } + } + + #[test] + fn test_read_quad_buffer_overrun() { + // Too few bytes (only 7) + let buffer = &[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]; + assert_eq!( + read_quad(buffer, 0, Endianness::Little, false).unwrap_err(), + TypeReadError::BufferOverrun { + offset: 0, + buffer_len: 7 + } + ); + + // Empty buffer + assert_eq!( + read_quad(&[], 0, Endianness::Big, false).unwrap_err(), + TypeReadError::BufferOverrun { + offset: 0, 
+ buffer_len: 0 + } + ); + + // Offset past end + let buffer = &[0x00; 16]; + assert_eq!( + read_quad(buffer, 10, Endianness::Little, false).unwrap_err(), + TypeReadError::BufferOverrun { + offset: 10, + buffer_len: 16 + } + ); + } + + #[test] + fn test_read_quad_at_offset() { + let buffer = &[0x00, 0x00, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12]; + let result = read_quad(buffer, 2, Endianness::Little, false).unwrap(); + assert_eq!(result, Value::Uint(0x1234_5678_90ab_cdef)); + } + #[test] fn test_read_short_extreme_values() { // Test maximum unsigned 16-bit value @@ -1585,6 +1757,41 @@ fn test_coerce_value_to_type() { }, Value::Uint(0xffff_ffff), ), + // Signed quad: values above i64::MAX get coerced + ( + Value::Uint(0xffff_ffff_ffff_ffff), + TypeKind::Quad { + endian: Endianness::Native, + signed: true, + }, + Value::Int(-1), + ), + ( + Value::Uint(0x8000_0000_0000_0000), + TypeKind::Quad { + endian: Endianness::Native, + signed: true, + }, + Value::Int(i64::MIN), + ), + // Signed quad: values in signed range pass through + ( + Value::Uint(0x7fff_ffff_ffff_ffff), + TypeKind::Quad { + endian: Endianness::Native, + signed: true, + }, + Value::Uint(0x7fff_ffff_ffff_ffff), + ), + // Unsigned quad: all values pass through + ( + Value::Uint(0xffff_ffff_ffff_ffff), + TypeKind::Quad { + endian: Endianness::Native, + signed: false, + }, + Value::Uint(0xffff_ffff_ffff_ffff), + ), // Non-Uint values pass through unchanged ( Value::Int(-1), diff --git a/src/parser/ast.rs b/src/parser/ast.rs index 02e742f6..9fa0dc0d 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -99,6 +99,22 @@ pub enum TypeKind { /// Whether value is signed signed: bool, }, + /// 64-bit integer + /// + /// # Examples + /// + /// ``` + /// use libmagic_rs::parser::ast::{TypeKind, Endianness}; + /// + /// let quad = TypeKind::Quad { endian: Endianness::Big, signed: true }; + /// assert_eq!(quad, TypeKind::Quad { endian: Endianness::Big, signed: true }); + /// ``` + Quad { + /// Byte order + 
endian: Endianness, + /// Whether value is signed + signed: bool, + }, /// String data String { /// Maximum length to read @@ -658,6 +674,14 @@ mod tests { endian: Endianness::Big, signed: true, }, + TypeKind::Quad { + endian: Endianness::Little, + signed: false, + }, + TypeKind::Quad { + endian: Endianness::Big, + signed: true, + }, TypeKind::String { max_length: None }, TypeKind::String { max_length: Some(128), diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index b36d98f0..9c2e501e 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -40,6 +40,24 @@ fn parse_decimal_number(input: &str) -> IResult<&str, i64> { Ok((input, number)) } +/// Parse a decimal number as unsigned `u64` with overflow protection +fn parse_unsigned_decimal_number(input: &str) -> IResult<&str, u64> { + let (input, digits) = digit1(input)?; + + // u64::MAX (18446744073709551615) has 20 digits + if digits.len() > 20 { + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::MapRes, + ))); + } + + let number = digits.parse::().map_err(|_| { + nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::MapRes)) + })?; + Ok((input, number)) +} + /// Parse a hexadecimal number (with 0x prefix) with overflow protection fn parse_hex_number(input: &str) -> IResult<&str, i64> { let (input, _) = tag("0x")(input)?; @@ -60,6 +78,38 @@ fn parse_hex_number(input: &str) -> IResult<&str, i64> { Ok((input, number)) } +/// Parse a hexadecimal number (with 0x prefix) as unsigned `u64` +fn parse_unsigned_hex_number(input: &str) -> IResult<&str, u64> { + let (input, _) = tag("0x")(input)?; + let (input, hex_str) = hex_digit1(input)?; + + // u64 can hold up to 16 hex digits (0xFFFFFFFFFFFFFFFF) + if hex_str.len() > 16 { + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::MapRes, + ))); + } + + let number = u64::from_str_radix(hex_str, 16).map_err(|_| { + nom::Err::Error(nom::error::Error::new(input, 
nom::error::ErrorKind::MapRes)) + })?; + + Ok((input, number)) +} + +/// Parse a non-negative number as unsigned `u64` +/// +/// Supports both decimal and hexadecimal (0x prefix) formats. +/// Does not handle a leading minus sign -- callers handle sign detection. +fn parse_unsigned_number(input: &str) -> IResult<&str, u64> { + if input.starts_with("0x") { + parse_unsigned_hex_number(input) + } else { + parse_unsigned_decimal_number(input) + } +} + /// Parse a decimal or hexadecimal number /// /// Supports both decimal (123, -456) and hexadecimal (0x1a2b, -0xFF) formats. @@ -452,18 +502,24 @@ fn parse_quoted_string(input: &str) -> IResult<&str, String> { } /// Parse a numeric value (integer) +/// +/// Non-negative literals are parsed directly as `u64` so the full unsigned +/// 64-bit range is representable (required for `uquad` values above `i64::MAX`). +/// Negative literals go through the signed `i64` path. fn parse_numeric_value(input: &str) -> IResult<&str, Value> { let (input, _) = multispace0(input)?; - let (input, number) = parse_number(input)?; - let (input, _) = multispace0(input)?; - // Convert to appropriate Value variant based on sign - let value = if number >= 0 { - Value::Uint(number.unsigned_abs()) + let (input, value) = if input.starts_with('-') { + // Negative: parse as i64 + let (input, number) = parse_number(input)?; + (input, Value::Int(number)) } else { - Value::Int(number) + // Non-negative: parse as u64 to support full unsigned 64-bit range + let (input, number) = parse_unsigned_number(input)?; + (input, Value::Uint(number)) }; + let (input, _) = multispace0(input)?; Ok((input, value)) } @@ -1274,6 +1330,38 @@ mod tests { assert_eq!(parse_numeric_value("0xFF)"), Ok((")", Value::Uint(255)))); } + #[test] + fn test_parse_numeric_value_large_unsigned_quad() { + // Full u64 range -- values above i64::MAX required for uquad + let test_cases = [ + // u64::MAX in hex + ("0xffffffffffffffff", Value::Uint(u64::MAX)), + // u64::MAX in decimal + 
("18446744073709551615", Value::Uint(u64::MAX)), + // Exactly i64::MAX + 1 (first value that overflows i64) + ("0x8000000000000000", Value::Uint(0x8000_0000_0000_0000)), + // i64::MAX + 1 in decimal + ( + "9223372036854775808", + Value::Uint(9_223_372_036_854_775_808), + ), + // i64::MAX still works as Uint + ("0x7fffffffffffffff", Value::Uint(i64::MAX as u64)), + ("9223372036854775807", Value::Uint(i64::MAX as u64)), + // Common magic constant patterns + ("0xDEADBEEFDEADBEEF", Value::Uint(0xDEAD_BEEF_DEAD_BEEF)), + ("0xCAFEBABECAFEBABE", Value::Uint(0xCAFE_BABE_CAFE_BABE)), + ]; + + for (input, expected) in test_cases { + assert_eq!( + parse_numeric_value(input), + Ok(("", expected)), + "Failed to parse large unsigned quad literal: '{input}'" + ); + } + } + #[test] fn test_parse_value_string_literals() { // String value parsing @@ -1384,9 +1472,9 @@ mod tests { fn test_parse_value_edge_cases() { // Zero values in different formats assert_eq!(parse_value("0"), Ok(("", Value::Uint(0)))); - assert_eq!(parse_value("-0"), Ok(("", Value::Uint(0)))); + assert_eq!(parse_value("-0"), Ok(("", Value::Int(0)))); assert_eq!(parse_value("0x0"), Ok(("", Value::Uint(0)))); - assert_eq!(parse_value("-0x0"), Ok(("", Value::Uint(0)))); + assert_eq!(parse_value("-0x0"), Ok(("", Value::Int(0)))); // Large values assert_eq!( @@ -1519,50 +1607,37 @@ mod tests { } } } -/// Parse a type specification (byte, short, long, string, etc.) 
-/// -/// Supports various type formats found in magic files: -/// - `byte` - single byte -/// - `short` - 16-bit integer (native endian) -/// - `leshort` - 16-bit little-endian integer -/// - `beshort` - 16-bit big-endian integer -/// - `long` - 32-bit integer (native endian) -/// - `lelong` - 32-bit little-endian integer -/// - `belong` - 32-bit big-endian integer -/// - `string` - null-terminated string -/// -/// # Examples -/// -/// ``` -/// use libmagic_rs::parser::grammar::parse_type; -/// use libmagic_rs::parser::ast::{TypeKind, Endianness}; +/// Parse a type specification with an optional attached bitwise-AND mask operator +/// (e.g., `lelong&0xf0000000`). /// -/// assert_eq!(parse_type("byte"), Ok(("", TypeKind::Byte { signed: true }))); -/// assert_eq!(parse_type("leshort"), Ok(("", TypeKind::Short { endian: Endianness::Little, signed: true }))); -/// assert_eq!(parse_type("string"), Ok(("", TypeKind::String { max_length: None }))); -/// ``` -/// Parse a type specification with optional attached operator -/// Parse a type specification followed by an optional operator +/// Returns the `TypeKind` and an optional `Operator`. 
/// /// # Errors /// Returns a nom parsing error if the input doesn't match the expected format +#[allow(clippy::too_many_lines)] pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option)> { let (input, _) = multispace0(input)?; let (input, type_name) = alt(( // Unsigned variants (longer names first to avoid partial matches) + tag("ubequad"), + tag("ulequad"), tag("ubelong"), tag("ulelong"), tag("ubeshort"), tag("uleshort"), + tag("uquad"), tag("ulong"), tag("ushort"), tag("ubyte"), // Signed variants (default in libmagic) + tag("lequad"), + tag("bequad"), tag("lelong"), tag("belong"), tag("leshort"), tag("beshort"), + tag("quad"), tag("long"), tag("short"), tag("byte"), @@ -1571,16 +1646,23 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option IResult<&str, (TypeKind, Option TypeKind::Quad { + endian: Endianness::Native, + signed: true, + }, + "uquad" => TypeKind::Quad { + endian: Endianness::Native, + signed: false, + }, + "lequad" => TypeKind::Quad { + endian: Endianness::Little, + signed: true, + }, + "ulequad" => TypeKind::Quad { + endian: Endianness::Little, + signed: false, + }, + "bequad" => TypeKind::Quad { + endian: Endianness::Big, + signed: true, + }, + "ubequad" => TypeKind::Quad { + endian: Endianness::Big, + signed: false, + }, "string" => TypeKind::String { max_length: None }, _ => unreachable!("Parser should only match known types"), }; @@ -1642,8 +1748,32 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option impl Strategy { ] } +/// Generate a valid endianness for testing (includes Native) +fn arb_endianness() -> impl Strategy { + prop_oneof![ + Just(Endianness::Little), + Just(Endianness::Big), + Just(Endianness::Native), + ] +} + /// Generate a valid TypeKind for testing fn arb_type_kind() -> impl Strategy { prop_oneof![ any::().prop_map(|signed| TypeKind::Byte { signed }), - (any::(), any::()).prop_map(|(is_big, signed)| { - TypeKind::Short { - endian: if is_big { - 
libmagic_rs::Endianness::Big - } else { - libmagic_rs::Endianness::Little - }, - signed, - } - }), - (any::(), any::()).prop_map(|(is_big, signed)| { - TypeKind::Long { - endian: if is_big { - libmagic_rs::Endianness::Big - } else { - libmagic_rs::Endianness::Little - }, - signed, - } - }), + (arb_endianness(), any::()) + .prop_map(|(endian, signed)| { TypeKind::Short { endian, signed } }), + (arb_endianness(), any::()) + .prop_map(|(endian, signed)| { TypeKind::Long { endian, signed } }), + (arb_endianness(), any::()) + .prop_map(|(endian, signed)| { TypeKind::Quad { endian, signed } }), (0usize..256usize).prop_map(|len| TypeKind::String { max_length: Some(len), }), From d1002f5b5db38fc96c97dabd84cca349f702df2e Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 1 Mar 2026 22:59:57 -0500 Subject: [PATCH 03/12] feat(parser): implement 64-bit integer type parsing with endian variants Signed-off-by: UncleSp1d3r --- src/parser/grammar.rs | 52 +++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index 9c2e501e..dbb99e3c 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1619,28 +1619,36 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option Date: Sun, 1 Mar 2026 23:00:04 -0500 Subject: [PATCH 04/12] feat(test): add end-to-end tests for 64-bit integer type parsing Signed-off-by: UncleSp1d3r --- tests/integration_tests.rs | 100 +++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index bb0ac9c2..51c35da4 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -465,3 +465,103 @@ fn test_evaluate_multiple_files() { assert!(elf_result.description.contains("ELF")); assert!(pdf_result.description.contains("PDF")); } + +// ============================================================ +// 64-bit Integer (Quad) Types End-to-End +// 
============================================================ + +#[test] +fn test_quad_lequad_matches_little_endian_value() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("quad.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 lequad 0x0123456789abcdef LE quad match").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + // 0x0123456789abcdef in little-endian byte order + let result = db + .evaluate_buffer(b"\xef\xcd\xab\x89\x67\x45\x23\x01") + .unwrap(); + assert!( + result.description.contains("LE quad match"), + "Expected LE quad match, got: {}", + result.description + ); +} + +#[test] +fn test_quad_bequad_matches_big_endian_value() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("quad.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + writeln!(f, "0 bequad 0x0123456789abcdef BE quad match").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + // 0x0123456789abcdef in big-endian byte order + let result = db + .evaluate_buffer(b"\x01\x23\x45\x67\x89\xab\xcd\xef") + .unwrap(); + assert!( + result.description.contains("BE quad match"), + "Expected BE quad match, got: {}", + result.description + ); +} + +#[test] +fn test_quad_signed_negative_one() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = temp_dir.path().join("quad.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + // -1 as signed 64-bit = 0xffffffffffffffff + writeln!(f, "0 lequad -1 All-ones quad").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + let result = db + .evaluate_buffer(b"\xff\xff\xff\xff\xff\xff\xff\xff") + .unwrap(); + assert!( + result.description.contains("All-ones quad"), + "Expected signed -1 match, got: {}", + result.description + ); +} + +#[test] +fn test_quad_nested_child_rule_with_offset() { + let temp_dir = TempDir::new().unwrap(); + let magic_path = 
temp_dir.path().join("quad.magic"); + + let mut f = fs::File::create(&magic_path).unwrap(); + // Parent matches 4-byte magic at offset 0 + writeln!(f, "0 belong 0xdeadbeef Dead beef header").unwrap(); + // Child matches an 8-byte LE quad at offset 8 + writeln!(f, ">8 ulequad 0xcafebabe00000001 With cafe payload").unwrap(); + + let db = MagicDatabase::load_from_file(&magic_path).unwrap(); + + // Build buffer: 4 bytes magic + 4 bytes padding + 8 bytes LE quad + // 0xdeadbeef in BE at offset 0 + // 0xcafebabe00000001 in LE at offset 8 + let mut buf = Vec::new(); + buf.extend_from_slice(b"\xde\xad\xbe\xef"); // belong magic + buf.extend_from_slice(b"\x00\x00\x00\x00"); // padding + buf.extend_from_slice(b"\x01\x00\x00\x00\xbe\xba\xfe\xca"); // ulequad LE + let result = db.evaluate_buffer(&buf).unwrap(); + assert!( + result.description.contains("Dead beef header"), + "Expected parent match, got: {}", + result.description + ); + assert!( + result.description.contains("With cafe payload"), + "Expected nested child quad match, got: {}", + result.description + ); +} From bac8ebe8999af5851cc969b8c1d616061a845eca Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 1 Mar 2026 23:25:38 -0500 Subject: [PATCH 05/12] feat(types): add quad 64-bit integer type to MVP development phase Signed-off-by: UncleSp1d3r --- AGENTS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index adfd2fe7..ccecf88f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -461,9 +461,9 @@ Human PRs are merged manually by maintainers. ### Development Phases -1. **MVP (v0.1.0)** - CURRENT: Basic parsing and evaluation with byte/short/long/string types, equality and bitwise AND operators, built-in rules for 10 common formats +1. **MVP (v0.1.0)** - CURRENT: Basic parsing and evaluation with byte/short/long/quad/string types, equality and bitwise AND operators, built-in rules for 10 common formats 2. 
**Enhanced Features (v0.2)**: Comparison operators (`>`, `<`), indirect offset improvements, strength-based rule ordering -3. **Advanced Types (v0.3)**: Regex type, 64-bit integers, floating-point types, search patterns +3. **Advanced Types (v0.3)**: Regex type, floating-point types, search patterns 4. **Full Compatibility (v0.4)**: Complete libmagic syntax support, all special directives, named tests 5. **Production Ready (v1.0)**: Stable API, complete documentation, 95%+ compatibility with GNU file From 4718212589ba5363161fbe3f8c0ce436f493d23b Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Sun, 1 Mar 2026 23:26:28 -0500 Subject: [PATCH 06/12] fix(evaluator): improve buffer overrun handling in read functions Signed-off-by: UncleSp1d3r --- src/evaluator/types.rs | 18 +++++++++++++++--- src/parser/grammar.rs | 6 ++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/evaluator/types.rs b/src/evaluator/types.rs index 125bf29d..f2729a96 100644 --- a/src/evaluator/types.rs +++ b/src/evaluator/types.rs @@ -135,8 +135,12 @@ pub fn read_short( endian: Endianness, signed: bool, ) -> Result { + let end = offset.checked_add(2).ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; let bytes = buffer - .get(offset..offset + 2) + .get(offset..end) .ok_or(TypeReadError::BufferOverrun { offset, buffer_len: buffer.len(), @@ -197,8 +201,12 @@ pub fn read_long( endian: Endianness, signed: bool, ) -> Result { + let end = offset.checked_add(4).ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; let bytes = buffer - .get(offset..offset + 4) + .get(offset..end) .ok_or(TypeReadError::BufferOverrun { offset, buffer_len: buffer.len(), @@ -264,8 +272,12 @@ pub fn read_quad( endian: Endianness, signed: bool, ) -> Result { + let end = offset.checked_add(8).ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })?; let bytes = buffer - .get(offset..offset + 8) + .get(offset..end) 
.ok_or(TypeReadError::BufferOverrun { offset, buffer_len: buffer.len(), diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index dbb99e3c..68f6e0a7 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1664,6 +1664,12 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option Date: Sun, 1 Mar 2026 23:51:55 -0500 Subject: [PATCH 07/12] chore(ci): update actions/upload-artifact to v7.0.0 Signed-off-by: UncleSp1d3r --- dist-workspace.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist-workspace.toml b/dist-workspace.toml index 513c9c6a..e861fe6b 100644 --- a/dist-workspace.toml +++ b/dist-workspace.toml @@ -53,4 +53,4 @@ publish-jobs = [ "homebrew" ] "actions/checkout" = "v6.0.2" "actions/download-artifact" = "v8" "actions/attest-build-provenance" = "v4" -"actions/upload-artifact" = "v6.0.0" +"actions/upload-artifact" = "v7.0.0" From 0fcc06944a9df64739d3d9a51d6114e653d08be9 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 2 Mar 2026 00:11:14 -0500 Subject: [PATCH 08/12] refactor(evaluator): rename MatchResult to RuleMatch for clarity - Updated all references to MatchResult in the evaluator module - Adjusted related documentation and tests to reflect the new name - Ensured consistency across the codebase with the new naming convention Signed-off-by: UncleSp1d3r --- build.rs | 289 +------------------------------- src/build_helpers.rs | 307 ++------------------------------- src/evaluator/mod.rs | 56 +++---- src/lib.rs | 16 +- src/output/mod.rs | 5 +- src/parser/codegen.rs | 326 +++++++++++++++++++++++++++++++++++ src/parser/grammar.rs | 122 +------------- src/parser/mod.rs | 3 + src/parser/types.rs | 382 ++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 769 insertions(+), 737 deletions(-) create mode 100644 src/parser/codegen.rs create mode 100644 src/parser/types.rs diff --git a/build.rs b/build.rs index ff3dcfd6..e611329d 100644 --- a/build.rs +++ b/build.rs @@ -24,15 +24,13 @@ mod 
error; mod parser; use error::ParseError; -use parser::ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value}; +use parser::codegen; use parser::parse_text_magic_file; use std::env; use std::fs; use std::path::Path; use std::process; -const INDENT_WIDTH: usize = 4; - fn main() { println!("cargo:rerun-if-changed=src/builtin_rules.magic"); println!("cargo:rerun-if-changed=build.rs"); @@ -71,7 +69,7 @@ fn main() { }; let output_path = Path::new(&out_dir).join("builtin_rules.rs"); - let generated = generate_builtin_rules(&rules); + let generated = codegen::generate_builtin_rules(&rules); if let Err(err) = fs::write(&output_path, generated) { eprintln!("Failed to write {}: {err}", output_path.display()); @@ -109,286 +107,3 @@ fn format_parse_error(error: &ParseError) -> String { } } } - -fn generate_builtin_rules(rules: &[MagicRule]) -> String { - let mut output = String::new(); - - // Allow unused_imports since StrengthModifier may not be used if no rules have strength modifiers - push_line(&mut output, "#[allow(unused_imports)]"); - push_line( - &mut output, - "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier};", - ); - push_line(&mut output, "use std::sync::LazyLock;"); - push_line(&mut output, ""); - push_line( - &mut output, - "/// Built-in magic rules compiled at build time.", - ); - push_line(&mut output, "///"); - push_line( - &mut output, - "/// This static contains magic rules parsed from `src/builtin_rules.magic` during", - ); - push_line( - &mut output, - "/// the build process. 
The rules are lazily initialized on first access.", - ); - push_line(&mut output, "///"); - push_line( - &mut output, - "/// Use [`get_builtin_rules()`] to access these rules instead of using this static directly.", - ); - push_line( - &mut output, - "pub static BUILTIN_RULES: LazyLock> = LazyLock::new(|| {", - ); - push_line(&mut output, " vec!["); - - for rule in rules { - let serialized = serialize_magic_rule(rule, INDENT_WIDTH * 2); - output.push_str(&serialized); - output.push(','); - output.push('\n'); - } - - push_line(&mut output, " ]"); - push_line(&mut output, "});\n"); - output -} - -fn serialize_magic_rule(rule: &MagicRule, indent: usize) -> String { - let mut output = String::new(); - - push_indent(&mut output, indent); - output.push_str("MagicRule {\n"); - - push_field( - &mut output, - indent + INDENT_WIDTH, - "offset", - &serialize_offset_spec(&rule.offset), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "typ", - &serialize_type_kind(&rule.typ), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "op", - &serialize_operator(&rule.op), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "value", - &serialize_value(&rule.value), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "message", - &format!("String::from({})", format_string_literal(&rule.message)), - ); - - push_indent(&mut output, indent + INDENT_WIDTH); - output.push_str("children: "); - output.push_str(&serialize_children(&rule.children, indent + INDENT_WIDTH)); - output.push_str(",\n"); - - push_field( - &mut output, - indent + INDENT_WIDTH, - "level", - &rule.level.to_string(), - ); - - push_field( - &mut output, - indent + INDENT_WIDTH, - "strength_modifier", - &serialize_strength_modifier(&rule.strength_modifier), - ); - - push_indent(&mut output, indent); - output.push('}'); - - output -} - -fn serialize_strength_modifier(modifier: &Option) -> String { - match modifier { - None => "None".to_string(), - Some(StrengthModifier::Add(val)) => 
format!("Some(StrengthModifier::Add({val}))"), - Some(StrengthModifier::Subtract(val)) => format!("Some(StrengthModifier::Subtract({val}))"), - Some(StrengthModifier::Multiply(val)) => format!("Some(StrengthModifier::Multiply({val}))"), - Some(StrengthModifier::Divide(val)) => format!("Some(StrengthModifier::Divide({val}))"), - Some(StrengthModifier::Set(val)) => format!("Some(StrengthModifier::Set({val}))"), - } -} - -fn serialize_children(children: &[MagicRule], indent: usize) -> String { - if children.is_empty() { - return "Vec::new()".to_string(); - } - - let mut output = String::new(); - output.push_str("vec![\n"); - - for child in children { - let serialized = serialize_magic_rule(child, indent + INDENT_WIDTH); - output.push_str(&serialized); - output.push_str(",\n"); - } - - push_indent(&mut output, indent); - output.push(']'); - output -} - -fn serialize_offset_spec(offset: &OffsetSpec) -> String { - match offset { - OffsetSpec::Absolute(value) => format!("OffsetSpec::Absolute({value})"), - OffsetSpec::Indirect { - base_offset, - pointer_type, - adjustment, - endian, - } => format!( - "OffsetSpec::Indirect {{ base_offset: {base_offset}, pointer_type: {}, adjustment: {adjustment}, endian: {} }}", - serialize_type_kind(pointer_type), - serialize_endianness(*endian) - ), - OffsetSpec::Relative(value) => format!("OffsetSpec::Relative({value})"), - OffsetSpec::FromEnd(value) => format!("OffsetSpec::FromEnd({value})"), - } -} - -fn serialize_type_kind(typ: &TypeKind) -> String { - match typ { - TypeKind::Byte { signed } => format!("TypeKind::Byte {{ signed: {signed} }}"), - TypeKind::Short { endian, signed } => format!( - "TypeKind::Short {{ endian: {}, signed: {} }}", - serialize_endianness(*endian), - signed - ), - TypeKind::Long { endian, signed } => format!( - "TypeKind::Long {{ endian: {}, signed: {} }}", - serialize_endianness(*endian), - signed - ), - TypeKind::Quad { endian, signed } => format!( - "TypeKind::Quad {{ endian: {}, signed: {} }}", - 
serialize_endianness(*endian), - signed - ), - TypeKind::String { max_length } => match max_length { - Some(value) => { - format!("TypeKind::String {{ max_length: Some({value}) }}") - } - None => "TypeKind::String { max_length: None }".to_string(), - }, - } -} - -fn serialize_operator(op: &Operator) -> String { - match op { - Operator::Equal => "Operator::Equal".to_string(), - Operator::NotEqual => "Operator::NotEqual".to_string(), - Operator::LessThan => "Operator::LessThan".to_string(), - Operator::GreaterThan => "Operator::GreaterThan".to_string(), - Operator::LessEqual => "Operator::LessEqual".to_string(), - Operator::GreaterEqual => "Operator::GreaterEqual".to_string(), - Operator::BitwiseAnd => "Operator::BitwiseAnd".to_string(), - Operator::BitwiseAndMask(mask) => format!("Operator::BitwiseAndMask({mask})"), - } -} - -fn serialize_value(value: &Value) -> String { - match value { - Value::Uint(number) => format!("Value::Uint({})", format_number(*number)), - Value::Int(number) => { - if *number < 0 { - let abs = number.unsigned_abs(); - format!("Value::Int(-{})", format_number(abs)) - } else { - format!("Value::Int({})", format_number(*number as u64)) - } - } - Value::Bytes(bytes) => format!("Value::Bytes({})", format_byte_vec(bytes)), - Value::String(text) => format!( - "Value::String(String::from({}))", - format_string_literal(text) - ), - } -} - -/// Format a number with underscores for readability (clippy::unreadable_literal) -fn format_number(num: u64) -> String { - if num < 10000 { - num.to_string() - } else { - let num_str = num.to_string(); - let mut result = String::new(); - let len = num_str.len(); - - for (i, ch) in num_str.chars().enumerate() { - if i > 0 && (len - i) % 3 == 0 { - result.push('_'); - } - result.push(ch); - } - result - } -} - -fn serialize_endianness(endian: Endianness) -> String { - match endian { - Endianness::Little => "Endianness::Little".to_string(), - Endianness::Big => "Endianness::Big".to_string(), - Endianness::Native => 
"Endianness::Native".to_string(), - } -} - -fn format_byte_vec(bytes: &[u8]) -> String { - if bytes.is_empty() { - return "vec![]".to_string(); - } - - let mut output = String::from("vec!["); - for (index, byte) in bytes.iter().enumerate() { - if index > 0 { - output.push_str(", "); - } - output.push_str(&format!("0x{byte:02x}")); - } - output.push(']'); - output -} - -fn format_string_literal(value: &str) -> String { - let escaped = value.escape_default().to_string(); - format!("\"{escaped}\"") -} - -fn push_line(output: &mut String, line: &str) { - output.push_str(line); - output.push('\n'); -} - -fn push_indent(output: &mut String, indent: usize) { - for _ in 0..indent { - output.push(' '); - } -} - -fn push_field(output: &mut String, indent: usize, name: &str, value: &str) { - push_indent(output, indent); - output.push_str(name); - output.push_str(": "); - output.push_str(value); - output.push_str(",\n"); -} diff --git a/src/build_helpers.rs b/src/build_helpers.rs index 4e8f6a40..75a028b1 100644 --- a/src/build_helpers.rs +++ b/src/build_helpers.rs @@ -6,13 +6,19 @@ /// This module contains functionality used by the build script to parse magic files /// and generate Rust code for built-in rules. It is extracted into a library module /// to enable comprehensive testing of the build process, including error cases. +/// +/// Serialization logic is provided by [`crate::parser::codegen`], which is shared +/// with `build.rs` to avoid duplication. 
use crate::error::ParseError; -use crate::parser::ast::{ - Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value, -}; use crate::parser::parse_text_magic_file; -const INDENT_WIDTH: usize = 4; +// Re-export codegen functions used by tests +#[cfg(test)] +use crate::parser::codegen::{ + format_byte_vec, format_number, generate_builtin_rules, serialize_children, + serialize_endianness, serialize_offset_spec, serialize_operator, serialize_type_kind, + serialize_value, +}; /// Parses a magic file and generates Rust code for the built-in rules. /// @@ -24,7 +30,7 @@ const INDENT_WIDTH: usize = 4; /// Returns a `ParseError` if the magic file content is invalid or malformed. pub fn parse_and_generate_builtin_rules(magic_content: &str) -> Result { let rules = parse_text_magic_file(magic_content)?; - Ok(generate_builtin_rules(&rules)) + Ok(crate::parser::codegen::generate_builtin_rules(&rules)) } /// Formats a parse error for display in build script output. @@ -63,298 +69,11 @@ pub fn format_parse_error(error: &ParseError) -> String { } } -fn generate_builtin_rules(rules: &[MagicRule]) -> String { - let mut output = String::new(); - - // Allow unused_imports since StrengthModifier may not be used if no rules have strength modifiers - push_line(&mut output, "#[allow(unused_imports)]"); - push_line( - &mut output, - "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier};", - ); - push_line(&mut output, "use std::sync::LazyLock;"); - push_line(&mut output, ""); - push_line( - &mut output, - "/// Built-in magic rules compiled at build time.", - ); - push_line(&mut output, "///"); - push_line( - &mut output, - "/// This static contains magic rules parsed from `src/builtin_rules.magic` during", - ); - push_line( - &mut output, - "/// the build process. 
The rules are lazily initialized on first access.", - ); - push_line(&mut output, "///"); - push_line( - &mut output, - "/// Use [`get_builtin_rules()`] to access these rules instead of using this static directly.", - ); - push_line( - &mut output, - "pub static BUILTIN_RULES: LazyLock> = LazyLock::new(|| {", - ); - push_line(&mut output, " vec!["); - - for rule in rules { - let serialized = serialize_magic_rule(rule, INDENT_WIDTH * 2); - output.push_str(&serialized); - output.push(','); - output.push('\n'); - } - - push_line(&mut output, " ]"); - push_line(&mut output, "});\n"); - output -} - -fn serialize_magic_rule(rule: &MagicRule, indent: usize) -> String { - let mut output = String::new(); - - push_indent(&mut output, indent); - output.push_str("MagicRule {\n"); - - push_field( - &mut output, - indent + INDENT_WIDTH, - "offset", - &serialize_offset_spec(&rule.offset), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "typ", - &serialize_type_kind(&rule.typ), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "op", - &serialize_operator(&rule.op), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "value", - &serialize_value(&rule.value), - ); - push_field( - &mut output, - indent + INDENT_WIDTH, - "message", - &format!("String::from({})", format_string_literal(&rule.message)), - ); - - push_indent(&mut output, indent + INDENT_WIDTH); - output.push_str("children: "); - output.push_str(&serialize_children(&rule.children, indent + INDENT_WIDTH)); - output.push_str(",\n"); - - push_field( - &mut output, - indent + INDENT_WIDTH, - "level", - &rule.level.to_string(), - ); - - push_field( - &mut output, - indent + INDENT_WIDTH, - "strength_modifier", - &serialize_strength_modifier(rule.strength_modifier), - ); - - push_indent(&mut output, indent); - output.push('}'); - - output -} - -fn serialize_children(children: &[MagicRule], indent: usize) -> String { - if children.is_empty() { - return "Vec::new()".to_string(); - } - - let mut 
output = String::new(); - output.push_str("vec![\n"); - - for child in children { - let serialized = serialize_magic_rule(child, indent + INDENT_WIDTH); - output.push_str(&serialized); - output.push_str(",\n"); - } - - push_indent(&mut output, indent); - output.push(']'); - output -} - -fn serialize_offset_spec(offset: &OffsetSpec) -> String { - match offset { - OffsetSpec::Absolute(value) => format!("OffsetSpec::Absolute({value})"), - OffsetSpec::Indirect { - base_offset, - pointer_type, - adjustment, - endian, - } => format!( - "OffsetSpec::Indirect {{ base_offset: {base_offset}, pointer_type: {}, adjustment: {adjustment}, endian: {} }}", - serialize_type_kind(pointer_type), - serialize_endianness(*endian) - ), - OffsetSpec::Relative(value) => format!("OffsetSpec::Relative({value})"), - OffsetSpec::FromEnd(value) => format!("OffsetSpec::FromEnd({value})"), - } -} - -fn serialize_type_kind(typ: &TypeKind) -> String { - match typ { - TypeKind::Byte { signed } => format!("TypeKind::Byte {{ signed: {signed} }}"), - TypeKind::Short { endian, signed } => format!( - "TypeKind::Short {{ endian: {}, signed: {} }}", - serialize_endianness(*endian), - signed - ), - TypeKind::Long { endian, signed } => format!( - "TypeKind::Long {{ endian: {}, signed: {} }}", - serialize_endianness(*endian), - signed - ), - TypeKind::Quad { endian, signed } => format!( - "TypeKind::Quad {{ endian: {}, signed: {} }}", - serialize_endianness(*endian), - signed - ), - TypeKind::String { max_length } => match max_length { - Some(value) => { - format!("TypeKind::String {{ max_length: Some({value}) }}") - } - None => "TypeKind::String { max_length: None }".to_string(), - }, - } -} - -fn serialize_operator(op: &Operator) -> String { - match op { - Operator::Equal => "Operator::Equal".to_string(), - Operator::NotEqual => "Operator::NotEqual".to_string(), - Operator::LessThan => "Operator::LessThan".to_string(), - Operator::GreaterThan => "Operator::GreaterThan".to_string(), - Operator::LessEqual => 
"Operator::LessEqual".to_string(), - Operator::GreaterEqual => "Operator::GreaterEqual".to_string(), - Operator::BitwiseAnd => "Operator::BitwiseAnd".to_string(), - Operator::BitwiseAndMask(mask) => format!("Operator::BitwiseAndMask({mask})"), - } -} - -fn serialize_value(value: &Value) -> String { - match value { - Value::Uint(number) => format!("Value::Uint({})", format_number(*number)), - Value::Int(number) => format!("Value::Int({})", format_signed_number(*number)), - Value::Bytes(bytes) => format!("Value::Bytes({})", format_byte_vec(bytes)), - Value::String(text) => format!( - "Value::String(String::from({}))", - format_string_literal(text) - ), - } -} - -/// Format an unsigned number with underscores for readability (`clippy::unreadable_literal`) -fn format_number(num: u64) -> String { - if num < 10000 { - num.to_string() - } else { - let num_str = num.to_string(); - let mut result = String::new(); - let len = num_str.len(); - - for (i, ch) in num_str.chars().enumerate() { - if i > 0 && (len - i) % 3 == 0 { - result.push('_'); - } - result.push(ch); - } - result - } -} - -/// Format a signed number with underscores for readability (`clippy::unreadable_literal`) -fn format_signed_number(num: i64) -> String { - if num < 0 { - let abs = num.unsigned_abs(); - format!("-{}", format_number(abs)) - } else { - // Safe: num >= 0, so the cast cannot lose the sign - format_number(num.unsigned_abs()) - } -} - -fn serialize_endianness(endian: Endianness) -> String { - match endian { - Endianness::Little => "Endianness::Little".to_string(), - Endianness::Big => "Endianness::Big".to_string(), - Endianness::Native => "Endianness::Native".to_string(), - } -} - -fn serialize_strength_modifier(modifier: Option) -> String { - match modifier { - None => "None".to_string(), - Some(StrengthModifier::Add(val)) => format!("Some(StrengthModifier::Add({val}))"), - Some(StrengthModifier::Subtract(val)) => format!("Some(StrengthModifier::Subtract({val}))"), - 
Some(StrengthModifier::Multiply(val)) => format!("Some(StrengthModifier::Multiply({val}))"), - Some(StrengthModifier::Divide(val)) => format!("Some(StrengthModifier::Divide({val}))"), - Some(StrengthModifier::Set(val)) => format!("Some(StrengthModifier::Set({val}))"), - } -} - -fn format_byte_vec(bytes: &[u8]) -> String { - use std::fmt::Write; - - if bytes.is_empty() { - return "vec![]".to_string(); - } - - let mut output = String::from("vec!["); - for (index, byte) in bytes.iter().enumerate() { - if index > 0 { - output.push_str(", "); - } - write!(output, "0x{byte:02x}").unwrap(); - } - output.push(']'); - output -} - -fn format_string_literal(value: &str) -> String { - let escaped = value.escape_default().to_string(); - format!("\"{escaped}\"") -} - -fn push_line(output: &mut String, line: &str) { - output.push_str(line); - output.push('\n'); -} - -fn push_indent(output: &mut String, indent: usize) { - for _ in 0..indent { - output.push(' '); - } -} - -fn push_field(output: &mut String, indent: usize, name: &str, value: &str) { - push_indent(output, indent); - output.push_str(name); - output.push_str(": "); - output.push_str(value); - output.push_str(",\n"); -} - #[cfg(test)] mod tests { use super::*; + use crate::parser::ast::{Endianness, MagicRule, OffsetSpec, Operator, TypeKind, Value}; + use crate::parser::codegen::format_string_literal; #[test] fn test_format_parse_error_invalid_syntax() { diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index 516b481a..f9635b99 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -204,7 +204,7 @@ impl EvaluationContext { /// Contains information about a successful rule match, including the rule /// that matched and its associated message. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct MatchResult { +pub struct RuleMatch { /// The message associated with the matching rule pub message: String, /// The offset where the match occurred @@ -221,7 +221,7 @@ pub struct MatchResult { pub confidence: f64, } -impl MatchResult { +impl RuleMatch { /// Calculate confidence score based on rule depth /// /// Formula: min(1.0, 0.3 + (level * 0.2)) @@ -234,11 +234,11 @@ impl MatchResult { /// # Examples /// /// ``` - /// use libmagic_rs::evaluator::MatchResult; + /// use libmagic_rs::evaluator::RuleMatch; /// - /// assert!((MatchResult::calculate_confidence(0) - 0.3).abs() < 0.001); - /// assert!((MatchResult::calculate_confidence(3) - 0.9).abs() < 0.001); - /// assert!((MatchResult::calculate_confidence(10) - 1.0).abs() < 0.001); + /// assert!((RuleMatch::calculate_confidence(0) - 0.3).abs() < 0.001); + /// assert!((RuleMatch::calculate_confidence(3) - 0.9).abs() < 0.001); + /// assert!((RuleMatch::calculate_confidence(10) - 1.0).abs() < 0.001); /// ``` #[must_use] pub fn calculate_confidence(level: u32) -> f64 { @@ -341,14 +341,14 @@ pub fn evaluate_single_rule( /// /// # Returns /// -/// Returns `Ok(Vec)` containing all matches found. Errors in individual rules +/// Returns `Ok(Vec)` containing all matches found. Errors in individual rules /// are logged and skipped to allow evaluation to continue. Only returns `Err(LibmagicError)` /// for critical failures like timeout or recursion limit exceeded. 
/// /// # Examples /// /// ```rust -/// use libmagic_rs::evaluator::{evaluate_rules, EvaluationContext, MatchResult}; +/// use libmagic_rs::evaluator::{evaluate_rules, EvaluationContext, RuleMatch}; /// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value}; /// use libmagic_rs::EvaluationConfig; /// @@ -394,7 +394,7 @@ pub fn evaluate_rules( rules: &[MagicRule], buffer: &[u8], context: &mut EvaluationContext, -) -> Result, LibmagicError> { +) -> Result, LibmagicError> { let mut matches = Vec::with_capacity(8); let start_time = std::time::Instant::now(); let mut rule_count = 0u32; @@ -431,12 +431,12 @@ pub fn evaluate_rules( }; if let Some((absolute_offset, read_value)) = match_data { - let match_result = MatchResult { + let match_result = RuleMatch { message: rule.message.clone(), offset: absolute_offset, level: rule.level, value: read_value, - confidence: MatchResult::calculate_confidence(rule.level), + confidence: RuleMatch::calculate_confidence(rule.level), }; matches.push(match_result); @@ -512,13 +512,13 @@ pub fn evaluate_rules( /// /// # Returns /// -/// Returns `Ok(Vec)` containing all matches found, or `Err(LibmagicError)` +/// Returns `Ok(Vec)` containing all matches found, or `Err(LibmagicError)` /// if evaluation fails. 
/// /// # Examples /// /// ```rust -/// use libmagic_rs::evaluator::{evaluate_rules_with_config, MatchResult}; +/// use libmagic_rs::evaluator::{evaluate_rules_with_config, RuleMatch}; /// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value}; /// use libmagic_rs::EvaluationConfig; /// @@ -550,7 +550,7 @@ pub fn evaluate_rules_with_config( rules: &[MagicRule], buffer: &[u8], config: &EvaluationConfig, -) -> Result, LibmagicError> { +) -> Result, LibmagicError> { let mut context = EvaluationContext::new(config.clone()); evaluate_rules(rules, buffer, &mut context) } @@ -1717,12 +1717,12 @@ fn test_evaluation_context_performance_config() { #[test] fn test_match_result_creation() { - let match_result = MatchResult { + let match_result = RuleMatch { message: "ELF executable".to_string(), offset: 0, level: 0, value: Value::Uint(0x7f), - confidence: MatchResult::calculate_confidence(0), + confidence: RuleMatch::calculate_confidence(0), }; assert_eq!(match_result.message, "ELF executable"); @@ -1734,12 +1734,12 @@ fn test_match_result_creation() { #[test] fn test_match_result_clone() { - let original = MatchResult { + let original = RuleMatch { message: "Test message".to_string(), offset: 42, level: 1, value: Value::String("test".to_string()), - confidence: MatchResult::calculate_confidence(1), + confidence: RuleMatch::calculate_confidence(1), }; let cloned = original.clone(); @@ -1748,16 +1748,16 @@ fn test_match_result_clone() { #[test] fn test_match_result_debug() { - let match_result = MatchResult { + let match_result = RuleMatch { message: "Debug test".to_string(), offset: 10, level: 2, value: Value::Bytes(vec![0x01, 0x02]), - confidence: MatchResult::calculate_confidence(2), + confidence: RuleMatch::calculate_confidence(2), }; let debug_str = format!("{match_result:?}"); - assert!(debug_str.contains("MatchResult")); + assert!(debug_str.contains("RuleMatch")); assert!(debug_str.contains("Debug test")); assert!(debug_str.contains("10")); 
assert!(debug_str.contains('2')); @@ -1765,38 +1765,38 @@ fn test_match_result_debug() { #[test] fn test_confidence_calculation_depth_0() { - let confidence = MatchResult::calculate_confidence(0); + let confidence = RuleMatch::calculate_confidence(0); assert!((confidence - 0.3).abs() < 0.001); } #[test] fn test_confidence_calculation_depth_1() { - let confidence = MatchResult::calculate_confidence(1); + let confidence = RuleMatch::calculate_confidence(1); assert!((confidence - 0.5).abs() < 0.001); } #[test] fn test_confidence_calculation_depth_2() { - let confidence = MatchResult::calculate_confidence(2); + let confidence = RuleMatch::calculate_confidence(2); assert!((confidence - 0.7).abs() < 0.001); } #[test] fn test_confidence_calculation_depth_3() { - let confidence = MatchResult::calculate_confidence(3); + let confidence = RuleMatch::calculate_confidence(3); assert!((confidence - 0.9).abs() < 0.001); } #[test] fn test_confidence_calculation_capped_at_1() { // Level 4+ should cap at 1.0 - let confidence_4 = MatchResult::calculate_confidence(4); + let confidence_4 = RuleMatch::calculate_confidence(4); assert!((confidence_4 - 1.0).abs() < 0.001); - let confidence_10 = MatchResult::calculate_confidence(10); + let confidence_10 = RuleMatch::calculate_confidence(10); assert!((confidence_10 - 1.0).abs() < 0.001); - let confidence_100 = MatchResult::calculate_confidence(100); + let confidence_100 = RuleMatch::calculate_confidence(100); assert!((confidence_100 - 1.0).abs() < 0.001); } diff --git a/src/lib.rs b/src/lib.rs index 9dc79112..a2b9869b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -128,7 +128,7 @@ pub use parser::ast::{ }; // Re-export evaluator types for convenience -pub use evaluator::{EvaluationContext, MatchResult}; +pub use evaluator::{EvaluationContext, RuleMatch}; // Re-export error types for convenience pub use error::{EvaluationError, LibmagicError, ParseError}; @@ -651,7 +651,7 @@ impl MagicDatabase { /// avoid duplicating the result-construction 
logic. fn build_result( &self, - matches: Vec, + matches: Vec, file_size: u64, start_time: std::time::Instant, ) -> EvaluationResult { @@ -689,7 +689,7 @@ impl MagicDatabase { /// /// Messages are joined with spaces, except when a message starts with /// backspace character (\\b) which suppresses the space. - fn concatenate_messages(matches: &[evaluator::MatchResult]) -> String { + fn concatenate_messages(matches: &[evaluator::RuleMatch]) -> String { let capacity: usize = matches.iter().map(|m| m.message.len() + 1).sum(); let mut result = String::with_capacity(capacity); for m in matches { @@ -835,7 +835,7 @@ pub struct EvaluationResult { /// /// Contains details about each rule that matched, including /// offset, matched value, and per-match confidence. - pub matches: Vec, + pub matches: Vec, /// Metadata about the evaluation process pub metadata: EvaluationMetadata, } @@ -1215,14 +1215,14 @@ mod tests { #[test] fn test_concatenate_messages_simple() { let matches = vec![ - evaluator::MatchResult { + evaluator::RuleMatch { message: "ELF".to_string(), offset: 0, level: 0, value: Value::Bytes(vec![0x7f]), confidence: 0.3, }, - evaluator::MatchResult { + evaluator::RuleMatch { message: "64-bit".to_string(), offset: 4, level: 1, @@ -1238,14 +1238,14 @@ mod tests { #[test] fn test_concatenate_messages_with_backspace() { let matches = vec![ - evaluator::MatchResult { + evaluator::RuleMatch { message: "ELF".to_string(), offset: 0, level: 0, value: Value::Bytes(vec![0x7f]), confidence: 0.3, }, - evaluator::MatchResult { + evaluator::RuleMatch { message: "\u{0008}, 64-bit".to_string(), // backspace prefix offset: 4, level: 1, diff --git a/src/output/mod.rs b/src/output/mod.rs index abb5392e..7c66c40f 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -267,10 +267,7 @@ impl MatchResult { /// * `m` - The evaluator match result to convert /// * `mime_type` - Optional MIME type to associate with this match #[must_use] - pub fn from_evaluator_match( - m: 
&crate::evaluator::MatchResult, - mime_type: Option<&str>, - ) -> Self { + pub fn from_evaluator_match(m: &crate::evaluator::RuleMatch, mime_type: Option<&str>) -> Self { let rule_path = DEFAULT_TAG_EXTRACTOR.extract_rule_path(std::iter::once(m.message.as_str())); diff --git a/src/parser/codegen.rs b/src/parser/codegen.rs new file mode 100644 index 00000000..8cd86c11 --- /dev/null +++ b/src/parser/codegen.rs @@ -0,0 +1,326 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Code generation for magic rule serialization +//! +//! This module provides functions to serialize parsed magic rules into Rust source +//! code. It is shared between the build script (`build.rs`) and the testable build +//! helpers (`src/build_helpers.rs`), eliminating the previous duplication of 16 +//! serialization functions across both files. +//! +//! The generated code creates `MagicRule` struct literals that are compiled into the +//! binary as built-in rules. + +use super::ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value}; + +const INDENT_WIDTH: usize = 4; + +/// Generate the complete Rust source for built-in rules +/// +/// Produces a Rust source file containing a `BUILTIN_RULES` static that lazily +/// initializes a `Vec` from the given parsed rules. 
+pub fn generate_builtin_rules(rules: &[MagicRule]) -> String { + let mut output = String::new(); + + // Allow unused_imports since StrengthModifier may not be used if no rules have strength modifiers + push_line(&mut output, "#[allow(unused_imports)]"); + push_line( + &mut output, + "use crate::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, Endianness, StrengthModifier};", + ); + push_line(&mut output, "use std::sync::LazyLock;"); + push_line(&mut output, ""); + push_line( + &mut output, + "/// Built-in magic rules compiled at build time.", + ); + push_line(&mut output, "///"); + push_line( + &mut output, + "/// This static contains magic rules parsed from `src/builtin_rules.magic` during", + ); + push_line( + &mut output, + "/// the build process. The rules are lazily initialized on first access.", + ); + push_line(&mut output, "///"); + push_line( + &mut output, + "/// Use [`get_builtin_rules()`] to access these rules instead of using this static directly.", + ); + push_line( + &mut output, + "pub static BUILTIN_RULES: LazyLock> = LazyLock::new(|| {", + ); + push_line(&mut output, " vec!["); + + for rule in rules { + let serialized = serialize_magic_rule(rule, INDENT_WIDTH * 2); + output.push_str(&serialized); + output.push(','); + output.push('\n'); + } + + push_line(&mut output, " ]"); + push_line(&mut output, "});\n"); + output +} + +/// Serialize a single magic rule as a Rust struct literal +pub fn serialize_magic_rule(rule: &MagicRule, indent: usize) -> String { + let mut output = String::new(); + + push_indent(&mut output, indent); + output.push_str("MagicRule {\n"); + + push_field( + &mut output, + indent + INDENT_WIDTH, + "offset", + &serialize_offset_spec(&rule.offset), + ); + push_field( + &mut output, + indent + INDENT_WIDTH, + "typ", + &serialize_type_kind(&rule.typ), + ); + push_field( + &mut output, + indent + INDENT_WIDTH, + "op", + &serialize_operator(&rule.op), + ); + push_field( + &mut output, + indent + INDENT_WIDTH, + 
"value", + &serialize_value(&rule.value), + ); + push_field( + &mut output, + indent + INDENT_WIDTH, + "message", + &format!("String::from({})", format_string_literal(&rule.message)), + ); + + push_indent(&mut output, indent + INDENT_WIDTH); + output.push_str("children: "); + output.push_str(&serialize_children(&rule.children, indent + INDENT_WIDTH)); + output.push_str(",\n"); + + push_field( + &mut output, + indent + INDENT_WIDTH, + "level", + &rule.level.to_string(), + ); + + push_field( + &mut output, + indent + INDENT_WIDTH, + "strength_modifier", + &serialize_strength_modifier(rule.strength_modifier), + ); + + push_indent(&mut output, indent); + output.push('}'); + + output +} + +/// Serialize child rules as a Rust `vec![]` literal +pub fn serialize_children(children: &[MagicRule], indent: usize) -> String { + if children.is_empty() { + return "Vec::new()".to_string(); + } + + let mut output = String::new(); + output.push_str("vec![\n"); + + for child in children { + let serialized = serialize_magic_rule(child, indent + INDENT_WIDTH); + output.push_str(&serialized); + output.push_str(",\n"); + } + + push_indent(&mut output, indent); + output.push(']'); + output +} + +/// Serialize an offset specification as a Rust expression +pub fn serialize_offset_spec(offset: &OffsetSpec) -> String { + match offset { + OffsetSpec::Absolute(value) => format!("OffsetSpec::Absolute({value})"), + OffsetSpec::Indirect { + base_offset, + pointer_type, + adjustment, + endian, + } => format!( + "OffsetSpec::Indirect {{ base_offset: {base_offset}, pointer_type: {}, adjustment: {adjustment}, endian: {} }}", + serialize_type_kind(pointer_type), + serialize_endianness(*endian) + ), + OffsetSpec::Relative(value) => format!("OffsetSpec::Relative({value})"), + OffsetSpec::FromEnd(value) => format!("OffsetSpec::FromEnd({value})"), + } +} + +/// Serialize a type kind as a Rust expression +pub fn serialize_type_kind(typ: &TypeKind) -> String { + match typ { + TypeKind::Byte { signed } => 
format!("TypeKind::Byte {{ signed: {signed} }}"), + TypeKind::Short { endian, signed } => format!( + "TypeKind::Short {{ endian: {}, signed: {} }}", + serialize_endianness(*endian), + signed + ), + TypeKind::Long { endian, signed } => format!( + "TypeKind::Long {{ endian: {}, signed: {} }}", + serialize_endianness(*endian), + signed + ), + TypeKind::Quad { endian, signed } => format!( + "TypeKind::Quad {{ endian: {}, signed: {} }}", + serialize_endianness(*endian), + signed + ), + TypeKind::String { max_length } => match max_length { + Some(value) => { + format!("TypeKind::String {{ max_length: Some({value}) }}") + } + None => "TypeKind::String { max_length: None }".to_string(), + }, + } +} + +/// Serialize an operator as a Rust expression +pub fn serialize_operator(op: &Operator) -> String { + match op { + Operator::Equal => "Operator::Equal".to_string(), + Operator::NotEqual => "Operator::NotEqual".to_string(), + Operator::LessThan => "Operator::LessThan".to_string(), + Operator::GreaterThan => "Operator::GreaterThan".to_string(), + Operator::LessEqual => "Operator::LessEqual".to_string(), + Operator::GreaterEqual => "Operator::GreaterEqual".to_string(), + Operator::BitwiseAnd => "Operator::BitwiseAnd".to_string(), + Operator::BitwiseAndMask(mask) => format!("Operator::BitwiseAndMask({mask})"), + } +} + +/// Serialize a value as a Rust expression +pub fn serialize_value(value: &Value) -> String { + match value { + Value::Uint(number) => format!("Value::Uint({})", format_number(*number)), + Value::Int(number) => format!("Value::Int({})", format_signed_number(*number)), + Value::Bytes(bytes) => format!("Value::Bytes({})", format_byte_vec(bytes)), + Value::String(text) => format!( + "Value::String(String::from({}))", + format_string_literal(text) + ), + } +} + +/// Serialize an endianness value as a Rust expression +pub fn serialize_endianness(endian: Endianness) -> String { + match endian { + Endianness::Little => "Endianness::Little".to_string(), + Endianness::Big 
=> "Endianness::Big".to_string(), + Endianness::Native => "Endianness::Native".to_string(), + } +} + +/// Serialize a strength modifier as a Rust expression +pub fn serialize_strength_modifier(modifier: Option) -> String { + match modifier { + None => "None".to_string(), + Some(StrengthModifier::Add(val)) => format!("Some(StrengthModifier::Add({val}))"), + Some(StrengthModifier::Subtract(val)) => { + format!("Some(StrengthModifier::Subtract({val}))") + } + Some(StrengthModifier::Multiply(val)) => { + format!("Some(StrengthModifier::Multiply({val}))") + } + Some(StrengthModifier::Divide(val)) => format!("Some(StrengthModifier::Divide({val}))"), + Some(StrengthModifier::Set(val)) => format!("Some(StrengthModifier::Set({val}))"), + } +} + +/// Format an unsigned number with underscores for readability (`clippy::unreadable_literal`) +pub fn format_number(num: u64) -> String { + if num < 10000 { + num.to_string() + } else { + let num_str = num.to_string(); + let mut result = String::new(); + let len = num_str.len(); + + for (i, ch) in num_str.chars().enumerate() { + if i > 0 && (len - i) % 3 == 0 { + result.push('_'); + } + result.push(ch); + } + result + } +} + +/// Format a signed number with underscores for readability (`clippy::unreadable_literal`) +pub fn format_signed_number(num: i64) -> String { + if num < 0 { + let abs = num.unsigned_abs(); + format!("-{}", format_number(abs)) + } else { + // Safe: num >= 0, so the cast cannot lose the sign + format_number(num.unsigned_abs()) + } +} + +/// Format a byte slice as a Rust `vec![]` literal +pub fn format_byte_vec(bytes: &[u8]) -> String { + use std::fmt::Write; + + if bytes.is_empty() { + return "vec![]".to_string(); + } + + let mut output = String::from("vec!["); + for (index, byte) in bytes.iter().enumerate() { + if index > 0 { + output.push_str(", "); + } + write!(output, "0x{byte:02x}").unwrap(); + } + output.push(']'); + output +} + +/// Format a string as a Rust string literal with escaping +pub fn 
format_string_literal(value: &str) -> String { + let escaped = value.escape_default().to_string(); + format!("\"{escaped}\"") +} + +/// Append a line to the output string +fn push_line(output: &mut String, line: &str) { + output.push_str(line); + output.push('\n'); +} + +/// Append indentation to the output string +fn push_indent(output: &mut String, indent: usize) { + for _ in 0..indent { + output.push(' '); + } +} + +/// Append a named field to the output string +fn push_field(output: &mut String, indent: usize, name: &str, value: &str) { + push_indent(output, indent); + output.push_str(name); + output.push_str(": "); + output.push_str(value); + output.push_str(",\n"); +} diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index 68f6e0a7..7592c3a0 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -17,9 +17,10 @@ use nom::{ sequence::pair, }; -use crate::parser::ast::{ - Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value, -}; +use crate::parser::ast::{MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value}; + +#[cfg(test)] +use crate::parser::ast::Endianness; /// Parse a decimal number with overflow protection fn parse_decimal_number(input: &str) -> IResult<&str, i64> { @@ -1614,44 +1615,10 @@ mod tests { /// /// # Errors /// Returns a nom parsing error if the input doesn't match the expected format -#[allow(clippy::too_many_lines)] pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option)> { let (input, _) = multispace0(input)?; - let (input, type_name) = alt(( - // 64-bit types (6 branches) - alt(( - tag("ubequad"), - tag("ulequad"), - tag("uquad"), - tag("bequad"), - tag("lequad"), - tag("quad"), - )), - // 32-bit types (6 branches) - alt(( - tag("ubelong"), - tag("ulelong"), - tag("ulong"), - tag("belong"), - tag("lelong"), - tag("long"), - )), - // 16-bit types (6 branches) - alt(( - tag("ubeshort"), - tag("uleshort"), - tag("ushort"), - tag("beshort"), - tag("leshort"), - 
tag("short"), - )), - // 8-bit types (2 branches) - alt((tag("ubyte"), tag("byte"))), - // String types (1 branch, will grow with pstring/search/regex) - tag("string"), - )) - .parse(input)?; + let (input, type_name) = crate::parser::types::parse_type_keyword(input)?; // Check for attached operator with mask (like &0xf0000000) // Uses unsigned parsing so full u64 masks (e.g. 0xffffffffffffffff) are supported. @@ -1680,84 +1647,7 @@ pub fn parse_type_and_operator(input: &str) -> IResult<&str, (TypeKind, Option TypeKind::Byte { signed: true }, - "ubyte" => TypeKind::Byte { signed: false }, - "short" => TypeKind::Short { - endian: Endianness::Native, - signed: true, - }, - "ushort" => TypeKind::Short { - endian: Endianness::Native, - signed: false, - }, - "leshort" => TypeKind::Short { - endian: Endianness::Little, - signed: true, - }, - "uleshort" => TypeKind::Short { - endian: Endianness::Little, - signed: false, - }, - "beshort" => TypeKind::Short { - endian: Endianness::Big, - signed: true, - }, - "ubeshort" => TypeKind::Short { - endian: Endianness::Big, - signed: false, - }, - "long" => TypeKind::Long { - endian: Endianness::Native, - signed: true, - }, - "ulong" => TypeKind::Long { - endian: Endianness::Native, - signed: false, - }, - "lelong" => TypeKind::Long { - endian: Endianness::Little, - signed: true, - }, - "ulelong" => TypeKind::Long { - endian: Endianness::Little, - signed: false, - }, - "belong" => TypeKind::Long { - endian: Endianness::Big, - signed: true, - }, - "ubelong" => TypeKind::Long { - endian: Endianness::Big, - signed: false, - }, - "quad" => TypeKind::Quad { - endian: Endianness::Native, - signed: true, - }, - "uquad" => TypeKind::Quad { - endian: Endianness::Native, - signed: false, - }, - "lequad" => TypeKind::Quad { - endian: Endianness::Little, - signed: true, - }, - "ulequad" => TypeKind::Quad { - endian: Endianness::Little, - signed: false, - }, - "bequad" => TypeKind::Quad { - endian: Endianness::Big, - signed: true, - }, - 
"ubequad" => TypeKind::Quad { - endian: Endianness::Big, - signed: false, - }, - "string" => TypeKind::String { max_length: None }, - _ => unreachable!("Parser should only match known types"), - }; + let type_kind = crate::parser::types::type_keyword_to_kind(type_name); Ok((input, (type_kind, attached_op))) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index fdc9bfc2..8b4eb48e 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -131,11 +131,14 @@ //! - Non-critical errors (parse failures in individual files): Logs warning to stderr and continues pub mod ast; +#[allow(dead_code)] +pub(crate) mod codegen; mod format; pub mod grammar; mod hierarchy; mod loader; pub(crate) mod preprocessing; +pub mod types; // Re-export AST types for convenience pub use ast::{Endianness, MagicRule, OffsetSpec, Operator, StrengthModifier, TypeKind, Value}; diff --git a/src/parser/types.rs b/src/parser/types.rs new file mode 100644 index 00000000..012b1530 --- /dev/null +++ b/src/parser/types.rs @@ -0,0 +1,382 @@ +// Copyright (c) 2025-2026 the libmagic-rs contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Type keyword parsing for magic file types +//! +//! This module handles parsing and classification of magic file type keywords +//! (byte, short, long, quad, string, etc.) into their corresponding [`TypeKind`] +//! representations. It extracts the type keyword recognition from the grammar +//! module to keep type-specific logic cohesive and manageable as new types are +//! added. + +use nom::{IResult, Parser, branch::alt, bytes::complete::tag}; + +use crate::parser::ast::{Endianness, TypeKind}; + +/// Parse a type keyword from magic file input +/// +/// Recognizes all supported type keywords and returns the matched keyword string. +/// Type keywords are organized by bit width (64, 32, 16, 8 bits) with longest +/// prefixes matched first within each group to avoid ambiguous partial matches. 
+/// +/// # Supported Keywords +/// +/// - 64-bit: `ubequad`, `ulequad`, `uquad`, `bequad`, `lequad`, `quad` +/// - 32-bit: `ubelong`, `ulelong`, `ulong`, `belong`, `lelong`, `long` +/// - 16-bit: `ubeshort`, `uleshort`, `ushort`, `beshort`, `leshort`, `short` +/// - 8-bit: `ubyte`, `byte` +/// - String: `string` +/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::parser::types::parse_type_keyword; +/// +/// let (rest, keyword) = parse_type_keyword("bequad rest").unwrap(); +/// assert_eq!(keyword, "bequad"); +/// assert_eq!(rest, " rest"); +/// ``` +/// +/// # Errors +/// +/// Returns a nom parsing error if the input doesn't start with a known type keyword. +pub fn parse_type_keyword(input: &str) -> IResult<&str, &str> { + alt(( + // 64-bit types (6 branches) + alt(( + tag("ubequad"), + tag("ulequad"), + tag("uquad"), + tag("bequad"), + tag("lequad"), + tag("quad"), + )), + // 32-bit types (6 branches) + alt(( + tag("ubelong"), + tag("ulelong"), + tag("ulong"), + tag("belong"), + tag("lelong"), + tag("long"), + )), + // 16-bit types (6 branches) + alt(( + tag("ubeshort"), + tag("uleshort"), + tag("ushort"), + tag("beshort"), + tag("leshort"), + tag("short"), + )), + // 8-bit types (2 branches) + alt((tag("ubyte"), tag("byte"))), + // String types (1 branch, will grow with pstring/search/regex) + tag("string"), + )) + .parse(input) +} + +/// Convert a type keyword string to its corresponding [`TypeKind`] +/// +/// Maps a previously parsed type keyword (from [`parse_type_keyword`]) to the +/// appropriate `TypeKind` variant with correct endianness and signedness settings. 
+/// +/// # Conventions +/// +/// - Unprefixed types are signed (libmagic default): `byte`, `short`, `long`, `quad` +/// - `u` prefix indicates unsigned: `ubyte`, `ushort`, `ulong`, `uquad` +/// - `be` prefix indicates big-endian: `beshort`, `belong`, `bequad` +/// - `le` prefix indicates little-endian: `leshort`, `lelong`, `lequad` +/// - No endian prefix means native endianness +/// +/// # Examples +/// +/// ``` +/// use libmagic_rs::parser::types::type_keyword_to_kind; +/// use libmagic_rs::parser::ast::{TypeKind, Endianness}; +/// +/// assert_eq!(type_keyword_to_kind("byte"), TypeKind::Byte { signed: true }); +/// assert_eq!(type_keyword_to_kind("ubyte"), TypeKind::Byte { signed: false }); +/// assert_eq!( +/// type_keyword_to_kind("beshort"), +/// TypeKind::Short { endian: Endianness::Big, signed: true } +/// ); +/// ``` +/// +/// # Panics +/// +/// Panics if `type_name` is not a recognized type keyword. This function should +/// only be called with values returned by [`parse_type_keyword`]. 
+#[must_use] +pub fn type_keyword_to_kind(type_name: &str) -> TypeKind { + match type_name { + // BYTE types (8-bit) + "byte" => TypeKind::Byte { signed: true }, + "ubyte" => TypeKind::Byte { signed: false }, + + // SHORT types (16-bit) + "short" => TypeKind::Short { + endian: Endianness::Native, + signed: true, + }, + "ushort" => TypeKind::Short { + endian: Endianness::Native, + signed: false, + }, + "leshort" => TypeKind::Short { + endian: Endianness::Little, + signed: true, + }, + "uleshort" => TypeKind::Short { + endian: Endianness::Little, + signed: false, + }, + "beshort" => TypeKind::Short { + endian: Endianness::Big, + signed: true, + }, + "ubeshort" => TypeKind::Short { + endian: Endianness::Big, + signed: false, + }, + + // LONG types (32-bit) + "long" => TypeKind::Long { + endian: Endianness::Native, + signed: true, + }, + "ulong" => TypeKind::Long { + endian: Endianness::Native, + signed: false, + }, + "lelong" => TypeKind::Long { + endian: Endianness::Little, + signed: true, + }, + "ulelong" => TypeKind::Long { + endian: Endianness::Little, + signed: false, + }, + "belong" => TypeKind::Long { + endian: Endianness::Big, + signed: true, + }, + "ubelong" => TypeKind::Long { + endian: Endianness::Big, + signed: false, + }, + + // QUAD types (64-bit) + "quad" => TypeKind::Quad { + endian: Endianness::Native, + signed: true, + }, + "uquad" => TypeKind::Quad { + endian: Endianness::Native, + signed: false, + }, + "lequad" => TypeKind::Quad { + endian: Endianness::Little, + signed: true, + }, + "ulequad" => TypeKind::Quad { + endian: Endianness::Little, + signed: false, + }, + "bequad" => TypeKind::Quad { + endian: Endianness::Big, + signed: true, + }, + "ubequad" => TypeKind::Quad { + endian: Endianness::Big, + signed: false, + }, + + // STRING type + "string" => TypeKind::String { max_length: None }, + + _ => unreachable!("type_keyword_to_kind called with unknown type: {type_name}"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use 
crate::parser::ast::Endianness; + + // ============================================================ + // parse_type_keyword tests + // ============================================================ + + #[test] + fn test_parse_type_keyword_byte_variants() { + assert_eq!(parse_type_keyword("byte rest"), Ok((" rest", "byte"))); + assert_eq!(parse_type_keyword("ubyte rest"), Ok((" rest", "ubyte"))); + } + + #[test] + fn test_parse_type_keyword_short_variants() { + let cases = [ + ("short", "short"), + ("ushort", "ushort"), + ("leshort", "leshort"), + ("uleshort", "uleshort"), + ("beshort", "beshort"), + ("ubeshort", "ubeshort"), + ]; + for (input, expected) in cases { + let input_with_rest = format!("{input} rest"); + let (rest, keyword) = parse_type_keyword(&input_with_rest).unwrap(); + assert_eq!(keyword, expected, "Failed for input: {input}"); + assert_eq!(rest, " rest", "Wrong remaining for input: {input}"); + } + } + + #[test] + fn test_parse_type_keyword_long_variants() { + let cases = ["long", "ulong", "lelong", "ulelong", "belong", "ubelong"]; + for input in cases { + let input_with_rest = format!("{input} rest"); + let (rest, keyword) = parse_type_keyword(&input_with_rest).unwrap(); + assert_eq!(keyword, input, "Failed for: {input}"); + assert_eq!(rest, " rest"); + } + } + + #[test] + fn test_parse_type_keyword_quad_variants() { + let cases = ["quad", "uquad", "lequad", "ulequad", "bequad", "ubequad"]; + for input in cases { + let input_with_rest = format!("{input} rest"); + let (rest, keyword) = parse_type_keyword(&input_with_rest).unwrap(); + assert_eq!(keyword, input, "Failed for: {input}"); + assert_eq!(rest, " rest"); + } + } + + #[test] + fn test_parse_type_keyword_string() { + assert_eq!(parse_type_keyword("string rest"), Ok((" rest", "string"))); + } + + #[test] + fn test_parse_type_keyword_unknown() { + assert!(parse_type_keyword("unknown rest").is_err()); + } + + #[test] + fn test_parse_type_keyword_empty() { + assert!(parse_type_keyword("").is_err()); 
+ } + + // ============================================================ + // type_keyword_to_kind tests + // ============================================================ + + #[test] + fn test_type_keyword_to_kind_byte() { + assert_eq!( + type_keyword_to_kind("byte"), + TypeKind::Byte { signed: true } + ); + assert_eq!( + type_keyword_to_kind("ubyte"), + TypeKind::Byte { signed: false } + ); + } + + #[test] + fn test_type_keyword_to_kind_short_endianness() { + assert_eq!( + type_keyword_to_kind("short"), + TypeKind::Short { + endian: Endianness::Native, + signed: true + } + ); + assert_eq!( + type_keyword_to_kind("leshort"), + TypeKind::Short { + endian: Endianness::Little, + signed: true + } + ); + assert_eq!( + type_keyword_to_kind("beshort"), + TypeKind::Short { + endian: Endianness::Big, + signed: true + } + ); + } + + #[test] + fn test_type_keyword_to_kind_unsigned_variants() { + assert_eq!( + type_keyword_to_kind("ushort"), + TypeKind::Short { + endian: Endianness::Native, + signed: false + } + ); + assert_eq!( + type_keyword_to_kind("ulong"), + TypeKind::Long { + endian: Endianness::Native, + signed: false + } + ); + assert_eq!( + type_keyword_to_kind("uquad"), + TypeKind::Quad { + endian: Endianness::Native, + signed: false + } + ); + } + + #[test] + fn test_type_keyword_to_kind_signed_defaults() { + // libmagic types are signed by default + assert_eq!( + type_keyword_to_kind("long"), + TypeKind::Long { + endian: Endianness::Native, + signed: true + } + ); + assert_eq!( + type_keyword_to_kind("quad"), + TypeKind::Quad { + endian: Endianness::Native, + signed: true + } + ); + } + + #[test] + fn test_type_keyword_to_kind_string() { + assert_eq!( + type_keyword_to_kind("string"), + TypeKind::String { max_length: None } + ); + } + + #[test] + fn test_roundtrip_all_keywords() { + // Verify that every keyword parsed by parse_type_keyword can be + // converted to a TypeKind by type_keyword_to_kind + let keywords = [ + "byte", "ubyte", "short", "ushort", "leshort", 
"uleshort", "beshort", "ubeshort", + "long", "ulong", "lelong", "ulelong", "belong", "ubelong", "quad", "uquad", "lequad", + "ulequad", "bequad", "ubequad", "string", + ]; + for keyword in keywords { + let (rest, parsed) = parse_type_keyword(keyword).unwrap(); + assert_eq!(rest, "", "Keyword {keyword} should consume all input"); + // Should not panic + let _ = type_keyword_to_kind(parsed); + } + } +} From 295eb362af02673e7c219afb232ce56b1125e46e Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 2 Mar 2026 00:11:20 -0500 Subject: [PATCH 09/12] docs(agents): update module organization and type support instructions Signed-off-by: UncleSp1d3r --- AGENTS.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index ccecf88f..c8c6cf9d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -80,7 +80,9 @@ pub enum Operator { parser/ ├── mod.rs // Public parser interface ├── ast.rs // AST node definitions -└── grammar.rs // Magic file DSL parsing (nom/pest) +├── grammar.rs // Magic file DSL parsing (nom) +├── types.rs // Type keyword parsing and TypeKind conversion +└── codegen.rs // Serialization for code generation (shared with build.rs) // Evaluator module structure evaluator/ @@ -139,7 +141,7 @@ pub fn evaluate_magic_rules( - `src/error.rs` is shared with `build.rs` -- cannot reference lib-only types like `crate::io::IoError` - `FileError(String)` wraps structured I/O errors as strings to work around the build.rs constraint -- `build.rs` and `src/build_helpers.rs` have duplicate `serialize_*` functions -- both must be updated when adding enum variants +- Serialization functions live in `src/parser/codegen.rs`, shared by both `build.rs` (via `#[path]` include) and `src/build_helpers.rs` (via `crate::parser::codegen`); `format_parse_error` remains duplicated in both because `ParseError` has different import paths - Use `ParseError::IoError` for I/O errors in parser code, not `ParseError::invalid_syntax` - Use 
`LibmagicError::ConfigError` for config validation, not `ParseError::invalid_syntax` - Clippy pedantic lints are active (e.g., prefer `trailing_zeros()` over bitwise masks) @@ -312,10 +314,12 @@ sample.bin: ELF 64-bit LSB executable, x86-64, version 1 (SYSV) > **Note:** Currently implemented types are `Byte`, `Short`, `Long`, `Quad`, and `String`. Regex and other advanced types are planned for future releases. 1. Extend `TypeKind` enum in `src/parser/ast.rs` -2. Add parsing logic in `src/parser/grammar.rs` -3. Implement reading logic in `src/evaluator/types.rs` -4. Add tests for the new type -5. Update documentation +2. Add keyword parsing in `src/parser/types.rs` (`parse_type_keyword` and `type_keyword_to_kind`) +3. Add value/operator parsing in `src/parser/grammar.rs` if needed +4. Implement reading logic in `src/evaluator/types.rs` +5. Update `serialize_type_kind()` in `src/parser/codegen.rs` +6. Add tests for the new type +7. Update documentation ### Adding New Operators @@ -324,7 +328,7 @@ sample.bin: ELF 64-bit LSB executable, x86-64, version 1 (SYSV) 1. Extend `Operator` enum in `src/parser/ast.rs` 2. Add parsing logic in `src/parser/grammar.rs` 3. Implement operator logic in `src/evaluator/operators.rs` -4. Update `serialize_operator()` in both `src/build_helpers.rs` AND `build.rs` (they have duplicate match statements) +4. Update `serialize_operator()` in `src/parser/codegen.rs` 5. Update strength calculation match in `src/evaluator/strength.rs` 6. Update `arb_operator()` in `tests/property_tests.rs` 7. 
Add tests for the new operator From 35e5cb221fbb354261b06b8e33de90f5368a1ac7 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 2 Mar 2026 00:13:15 -0500 Subject: [PATCH 10/12] docs(agents): clarify Mergify auto-merge rules for bot PRs Signed-off-by: UncleSp1d3r --- AGENTS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c8c6cf9d..1693f1da 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -441,7 +441,8 @@ All pull requests require review before merging. Reviews are performed by mainta - **Documentation**: Public APIs have rustdoc with examples, AGENTS.md updated if architecture changes CI must pass before merge. Mergify merge protections enforce these checks. -Bot PRs (dependabot, dosubot, release-plz) are auto-merged by Mergify when CI passes. +Bot PRs from dependabot and dosubot are auto-merged by Mergify when all required CI checks pass. +Bot PRs from release-plz are auto-merged by Mergify when their required DCO check passes (they are exempt from full CI in `.mergify.yml`). Human PRs are merged manually by maintainers. 
## Project Context @@ -521,7 +522,7 @@ This guide ensures consistent, high-quality development practices for the libmag ## Quick Reference -- Mergify auto-merges bot PRs (dependabot, dosubot, release-plz) via direct `merge` action (no merge queue) +- Mergify auto-merges dependabot/dosubot PRs when full CI passes; release-plz PRs when DCO passes (exempt from full CI) - Human PRs are merged manually -- Mergify only provides merge protections for those - `.mergify.yml` configures auto-merge rules and merge protections - `cargo deny check` uses `deny.toml` (default) -- do not specify a custom config path From c4f854ff8ad078bd232881cc0963f62f9e4d7c2c Mon Sep 17 00:00:00 2001 From: "dosubot[bot]" <131922026+dosubot[bot]@users.noreply.github.com> Date: Mon, 2 Mar 2026 00:14:30 -0500 Subject: [PATCH 11/12] docs: updates for PR #133 (#134) Update documentation for https://github.com/EvilBit-Labs/libmagic-rs/pull/133 _Generated by [Dosu](https://dosu.dev)_ --------- Co-authored-by: dosubot[bot] <131922026+dosubot[bot]@users.noreply.github.com> Co-authored-by: UncleSp1d3r --- docs/API_REFERENCE.md | 14 ++++++++++++++ docs/ARCHITECTURE.md | 21 +++++++++++++++++++-- docs/MAGIC_FORMAT.md | 15 ++++++++++++++- docs/src/architecture.md | 3 ++- docs/src/ast-structures.md | 17 ++++++++++++++++- 5 files changed, 65 insertions(+), 5 deletions(-) diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md index 3af1ca3f..ee8cfc89 100644 --- a/docs/API_REFERENCE.md +++ b/docs/API_REFERENCE.md @@ -301,8 +301,22 @@ use libmagic_rs::TypeKind; | `Byte { signed }` | Single byte with explicit signedness (changed in v0.2.0) | | `Short { endian, signed }` | 16-bit integer | | `Long { endian, signed }` | 32-bit integer | +| `Quad { endian, signed }` | 64-bit integer | | `String { max_length }` | String data | +##### 64-bit Integer Types + +The `Quad` variant supports six endian-signedness combinations: + +| Type Specifier | Endianness | Signedness | Description | 
+|----------------|------------|------------|-------------| +| `quad` | Native | Signed | Native-endian signed 64-bit integer | +| `uquad` | Native | Unsigned | Native-endian unsigned 64-bit integer | +| `lequad` | Little | Signed | Little-endian signed 64-bit integer | +| `ulequad` | Little | Unsigned | Little-endian unsigned 64-bit integer | +| `bequad` | Big | Signed | Big-endian signed 64-bit integer | +| `ubequad` | Big | Unsigned | Big-endian unsigned 64-bit integer | + **Version Note:** In v0.2.0, the `Byte` variant changed from a unit variant to a struct variant with a `signed` field. #### Operator diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index c02b5df2..0355499b 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -267,6 +267,13 @@ pub struct MagicRule { } ``` +**TypeKind Variants:** +- `Byte { signed: bool }` - 8-bit integer +- `Short { endian: Endianness, signed: bool }` - 16-bit integer +- `Long { endian: Endianness, signed: bool }` - 32-bit integer +- `Quad { endian: Endianness, signed: bool }` - 64-bit integer +- `String { max_length: Option<usize> }` - Null-terminated string + **Hierarchical Structure:** - Top-level rules (level 0) are entry points - Child rules are evaluated only if parent matches @@ -467,8 +474,18 @@ The evaluation hot path is optimized for: 1. Add variant to `TypeKind` enum (`ast.rs`) 2. Add parsing logic (`grammar.rs`) 3. Add reading logic (`types.rs`) -4. Add tests -5. Update documentation +4. Add serialization support (`build_helpers.rs`) +5. Add tests +6. Update documentation + +**Example: Quad Type Implementation** + +The `Quad` type (64-bit integer) demonstrates the type system extension pattern. 
The implementation includes: +- `TypeKind::Quad { endian: Endianness, signed: bool }` variant in the AST +- `read_quad()` function for safe buffer access with bounds checking +- Parsing support for `quad`, `uquad`, `lequad`, `ulequad`, `bequad`, `ubequad` type names +- Strength calculation (specificity score of 16, highest among numeric types) +- Serialization for build-time rule compilation ### Adding New Operators diff --git a/docs/MAGIC_FORMAT.md b/docs/MAGIC_FORMAT.md index f1332a8f..b0904c40 100644 --- a/docs/MAGIC_FORMAT.md +++ b/docs/MAGIC_FORMAT.md @@ -119,6 +119,7 @@ Types for indirect offsets: - `.b` - byte (1 byte) - `.s` - short (2 bytes) - `.l` - long (4 bytes) +- `.q` - quad (8 bytes) ### Relative Offset @@ -146,12 +147,22 @@ The `&` prefix indicates relative offset. | `long` | 4 bytes | native | | `lelong` | 4 bytes | little-endian | | `belong` | 4 bytes | big-endian | +| `quad` | 8 bytes | native | +| `lequad` | 8 bytes | little-endian | +| `bequad` | 8 bytes | big-endian | + +All integer types have unsigned variants prefixed with `u`: +- `ubyte`, `ushort`, `uleshort`, `ubeshort` +- `ulong`, `ulelong`, `ubelong` +- `uquad`, `ulequad`, `ubequad` Examples: ``` 0 byte 0x7f (byte match) 0 leshort 0x5a4d DOS MZ signature 0 belong 0xcafebabe Java class file +0 lequad 0x1234567890abcdef (64-bit little-endian) +8 uquad >0x8000000000000000 (unsigned 64-bit check) ``` ### String Type @@ -469,7 +480,7 @@ Consider: - Absolute offsets - Relative offsets - Indirect offsets (basic) -- Byte, short, long types +- Byte, short, long, quad types (8-bit, 16-bit, 32-bit, 64-bit integers) - String type - Comparison operators (`=`, `!`, `<`, `>`, `<=`, `>=`) - Bitwise AND operator @@ -481,6 +492,7 @@ Consider: - Regex patterns - Date/time types - Float types +- 128-bit integer types - Use/name directives - Default rules @@ -488,6 +500,7 @@ Consider: - **Comparison operators**: Full support for `<`, `>`, `<=`, `>=` operators - **Strength modifiers**: The `!:strength` 
directive for adjusting rule priority +- **64-bit integers**: `quad` type family (`quad`, `uquad`, `lequad`, `ulequad`, `bequad`, `ubequad`) --- diff --git a/docs/src/architecture.md b/docs/src/architecture.md index 65e5d7e2..8896d909 100644 --- a/docs/src/architecture.md +++ b/docs/src/architecture.md @@ -93,6 +93,7 @@ pub enum TypeKind { Byte { signed: bool }, // Single byte with explicit signedness Short { endian: Endianness, signed: bool }, Long { endian: Endianness, signed: bool }, + Quad { endian: Endianness, signed: bool }, String { max_length: Option<usize> }, } @@ -114,7 +115,7 @@ pub enum Operator { - **Serializable**: Full serde support for caching - **Self-contained**: No external dependencies in AST nodes - **Type-safe**: Rust's type system prevents invalid rule combinations -- **Explicit signedness**: `TypeKind::Byte` and integer types distinguish signed from unsigned interpretations +- **Explicit signedness**: `TypeKind::Byte` and integer types (Short, Long, Quad) distinguish signed from unsigned interpretations ### 3. Evaluator Module (`src/evaluator/`) diff --git a/docs/src/ast-structures.md b/docs/src/ast-structures.md index f821b183..1e465c9b 100644 --- a/docs/src/ast-structures.md +++ b/docs/src/ast-structures.md @@ -179,6 +179,9 @@ pub enum TypeKind { /// 32-bit integer Long { endian: Endianness, signed: bool }, + /// 64-bit integer + Quad { endian: Endianness, signed: bool }, + /// String data String { max_length: Option<usize> }, } @@ -205,6 +208,18 @@ let long_be = TypeKind::Long { signed: true }; +// 64-bit little-endian unsigned integer +let quad_le = TypeKind::Quad { + endian: Endianness::Little, + signed: false +}; + +// 64-bit big-endian signed integer +let quad_be = TypeKind::Quad { + endian: Endianness::Big, + signed: true +}; + // Null-terminated string, max 256 bytes let string_type = TypeKind::String { max_length: Some(256) @@ -402,7 +417,7 @@ let script_rule = MagicRule { ### Type Selection 1. 
**Use `Byte { signed }`** for single-byte values and flags, specifying signedness -2. **Use `Short/Long`** with explicit endianness and signedness for multi-byte integers +2. **Use `Short/Long/Quad`** with explicit endianness and signedness for multi-byte integers 3. **Use `String`** with length limits for text patterns 4. **Use `Bytes`** for exact binary sequences From 3c3f8edf035be96e91bcebc924b8c9ba5763ba34 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Mon, 2 Mar 2026 00:28:43 -0500 Subject: [PATCH 12/12] refactor(output): clarify conversion from evaluator RuleMatch to output MatchResult Signed-off-by: UncleSp1d3r --- src/output/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/output/mod.rs b/src/output/mod.rs index 7c66c40f..d3ecca7b 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -256,7 +256,7 @@ impl MatchResult { } } - /// Convert from an evaluator `MatchResult` to an output `MatchResult` + /// Convert from an evaluator [`RuleMatch`](crate::evaluator::RuleMatch) to an output `MatchResult` /// /// This adapts the internal evaluation result format to the richer output format /// used for JSON and structured output. It extracts rule paths from match messages @@ -264,7 +264,7 @@ impl MatchResult { /// /// # Arguments /// - /// * `m` - The evaluator match result to convert + /// * `m` - The evaluator rule match to convert /// * `mime_type` - Optional MIME type to associate with this match #[must_use] pub fn from_evaluator_match(m: &crate::evaluator::RuleMatch, mime_type: Option<&str>) -> Self { @@ -274,6 +274,9 @@ impl MatchResult { #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] let confidence = (m.confidence * 100.0).min(100.0) as u8; + // TODO: Numeric length is hardcoded to 4 bytes. Value::Uint/Int don't encode + // their source width, so byte/short/long/quad all report 4. Carrying TypeKind + // in RuleMatch would allow accurate lengths (1, 2, 4, 8). 
let length = match &m.value { Value::Bytes(b) => b.len(), Value::String(s) => s.len(),