From fbb750ad0f2e018456d304efda1309f4a8604ad0 Mon Sep 17 00:00:00 2001 From: Leynos Date: Fri, 8 Aug 2025 02:31:49 +0100 Subject: [PATCH 1/2] Hash actions via canonical JSON --- docs/netsuke-design.md | 29 +++---- src/hasher.rs | 79 +++---------------- src/ir.rs | 8 +- tests/hasher_tests.rs | 12 +-- ..._snapshot_tests__touch_manifest_ninja.snap | 4 +- 5 files changed, 37 insertions(+), 95 deletions(-) diff --git a/docs/netsuke-design.md b/docs/netsuke-design.md index fe78e3d4..868b116f 100644 --- a/docs/netsuke-design.md +++ b/docs/netsuke-design.md @@ -913,7 +913,8 @@ use std::path::{Path, PathBuf}; /// The complete, static build graph. pub struct BuildGraph { /// A map of all unique actions (rules) in the build. - /// The key is a hash of the action's properties to enable deduplication. + /// The key is a hash of a canonical JSON serialisation of the action's + /// properties to enable deduplication. pub actions: HashMap, /// A map of all target files to be built. The key is the output path. @@ -1109,9 +1110,10 @@ action identifier and carries the `phony` and `always` flags verbatim from the manifest. No Ninja specific placeholders are stored in the IR to keep the representation portable. -- Actions are deduplicated using a SHA-256 hash of their recipe and metadata. - Identical commands therefore share the same identifier which keeps the IR - deterministic for snapshot tests. +- Actions are deduplicated using a SHA-256 hash of a canonical JSON + serialisation of their recipe and metadata. Identical commands therefore + share the same identifier which keeps the IR deterministic for snapshot + tests. - Multiple rule references in a single target are not yet supported. The IR generator reports `IrGenError::MultipleRules` when encountered. - Duplicate output files are rejected. Attempting to define the same output @@ -1261,29 +1263,18 @@ libraries.[^27] Rust ```rust -// In src/ir.rs -use thiserror::Error; -use std::path::PathBuf; +// In src/ir.rs use thiserror::Error; use std::path::PathBuf; #[derive(Debug, Error)] pub enum IrGenError { #[error("rule not found: {rule_name} for target {target_name}")] - RuleNotFound { - target_name: String, - rule_name: String, - }, + RuleNotFound { target_name: String, rule_name: String, }, #[error("circular dependency detected: {cycle:?}")] - CircularDependency { - cycle: Vec, - }, + CircularDependency { cycle: Vec, }, #[error("dependency not found: {dependency_name} for target {target_name}")] - DependencyNotFound { - target_name: String, - dependency_name: String, - }, -} + DependencyNotFound { target_name: String, dependency_name: String, }, } ``` - `anyhow`: This crate will be used in the main application logic (`main.rs`) diff --git a/src/hasher.rs b/src/hasher.rs index fed0a546..33039465 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -1,8 +1,8 @@ //! Action hashing utilities. //! -//! This module provides the [`ActionHasher`] type used to compute a stable -//! SHA-256 digest for [`Action`] definitions. The hash is used to deduplicate -//! identical actions when generating the build graph. +//! [`ActionHasher`] computes stable SHA-256 digests for [`Action`] definitions. +//! Each action is serialised to canonical JSON before hashing, ensuring +//! identical actions map to the same digest even as the struct evolves. //! //! # Examples //! @@ -23,10 +23,8 @@ //! assert!(!hash.is_empty()); //! ``` -use itoa::Buffer; use sha2::{Digest, Sha256}; -use crate::ast::{Recipe, StringOrList}; use crate::ir::Action; /// Computes stable digests for [`Action`] definitions. @@ -34,69 +32,16 @@ pub struct ActionHasher; impl ActionHasher { /// Calculate the hash of an [`Action`]. + /// + /// # Panics + /// + /// Panics if the action cannot be serialised to JSON. #[must_use] pub fn hash(action: &Action) -> String { - let mut hasher = Sha256::new(); - Self::hash_recipe(&mut hasher, &action.recipe); - Self::hash_optional_fields(&mut hasher, action); - format!("{:x}", hasher.finalize()) - } - - fn hash_recipe(hasher: &mut Sha256, recipe: &Recipe) { - match recipe { - Recipe::Command { command } => { - hasher.update(b"cmd"); - Self::update_with_len(hasher, command.as_bytes()); - } - Recipe::Script { script } => { - hasher.update(b"scr"); - Self::update_with_len(hasher, script.as_bytes()); - } - Recipe::Rule { rule } => { - hasher.update(b"rule"); - Self::hash_rule_reference(hasher, rule); - } - } - } - - fn hash_optional_fields(hasher: &mut Sha256, action: &Action) { - Self::hash_optional_string(hasher, action.description.as_ref()); - Self::hash_optional_string(hasher, action.depfile.as_ref()); - Self::hash_optional_string(hasher, action.deps_format.as_ref()); - Self::hash_optional_string(hasher, action.pool.as_ref()); - hasher.update(if action.restat { b"1" } else { b"0" }); - } - - fn hash_rule_reference(hasher: &mut Sha256, rule: &StringOrList) { - match rule { - StringOrList::String(r) => Self::update_with_len(hasher, r.as_bytes()), - StringOrList::List(list) => { - // Preserve the original sequence so that different orders - // generate distinct hashes. - for r in list { - Self::update_with_len(hasher, r.as_bytes()); - } - } - StringOrList::Empty => {} - } - } - - fn hash_optional_string(hasher: &mut Sha256, value: Option<&String>) { - match value { - Some(v) => { - hasher.update(b"1"); - Self::update_with_len(hasher, v.as_bytes()); - } - None => hasher.update(b"0"), - } - } - - fn update_with_len(hasher: &mut Sha256, bytes: &[u8]) { - // Write the length prefix into a stack buffer to avoid heap allocation. - let mut buf = Buffer::new(); - let len_str = buf.format(bytes.len()); - hasher.update(len_str.as_bytes()); - hasher.update(b":"); - hasher.update(bytes); + // Serialise using canonical JSON so field order and absent options do + // not affect the resulting digest. + let bytes = serde_json::to_vec(action).expect("serialise action to JSON"); + let digest = Sha256::digest(bytes); + format!("{digest:x}") } } diff --git a/src/ir.rs b/src/ir.rs index 846b39ce..d1d3e651 100644 --- a/src/ir.rs +++ b/src/ir.rs @@ -39,12 +39,18 @@ pub struct BuildGraph { } /// A reusable command analogous to a Ninja rule. -#[derive(Debug, Clone, PartialEq)] +use serde::Serialize; + +#[derive(Debug, Clone, PartialEq, Serialize)] pub struct Action { pub recipe: Recipe, + #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub depfile: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub deps_format: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub pool: Option, pub restat: bool, } diff --git a/tests/hasher_tests.rs b/tests/hasher_tests.rs index b5ef2e52..90a0692d 100644 --- a/tests/hasher_tests.rs +++ b/tests/hasher_tests.rs @@ -15,7 +15,7 @@ use rstest::rstest; pool: None, restat: false, }, - "a0f6e2cd3b9b3cee0bf94a7d53bce56cf4178dfe907bb1cb7c832f47846baf38" + "b43a76a10b522e53fc0fb0fcb3354939e00d6b708252050c27100da204a811ae" )] #[case( Action { @@ -26,7 +26,7 @@ use rstest::rstest; pool: None, restat: true, }, - "cf8e97357820acf6f66037dcf977ee36c88c2811d60342db30c99507d24a0d60" + "9b0289f92ea0e374eecdaf50c8c9080547635aaff38d07fe2a278af6894c3207" )] #[case( Action { @@ -37,7 +37,7 @@ use rstest::rstest; pool: None, restat: false, }, - "69f72afccc2aa5a709af1139a9c7ef5f4f72e57cf5376e6c043e575f68f2ef8d" + "9733343b512253e636fbacfea40ef4f5771d49409fcda026aec7c7ce2f5405ec" )] #[case( Action { @@ -48,7 +48,7 @@ use rstest::rstest; pool: None, restat: false, }, - "c28b5c0b7f20bf1093cbab990976b904268f173413f54b7007166b2c02f498f3" + "9b53c477668394e59eca5b34416ef7ad7fb5799ca96dd283e81d7acda6c56006" )] #[case( Action { @@ -59,7 +59,7 @@ use rstest::rstest; pool: None, restat: false, }, - "28adc0857704aa0c54c3bc624cb2dc70c101c3936987b20ae520a20319f591c2" + "333d2b3f4f805b80c2e1aef1b5c9f1e0bbc990b77121c731f14edf3691ce120c" )] // Order of rule names influences the digest. #[case( @@ -71,7 +71,7 @@ use rstest::rstest; pool: None, restat: true, }, - "b93ff0102089f1f1a3fe9eec082b59d5aab58271a40724ccdfdaade6a68fe340" + "d5f1a262a95b75db3a7a79a5855eb27b6b430833e7ba93538502a16ebd03f50b" )] fn hash_action_is_stable(#[case] action: Action, #[case] expected: &str) { assert_eq!(ActionHasher::hash(&action), expected); diff --git a/tests/snapshots/ninja/ninja_snapshot_tests__touch_manifest_ninja.snap b/tests/snapshots/ninja/ninja_snapshot_tests__touch_manifest_ninja.snap index ca715357..96b8b56e 100644 --- a/tests/snapshots/ninja/ninja_snapshot_tests__touch_manifest_ninja.snap +++ b/tests/snapshots/ninja/ninja_snapshot_tests__touch_manifest_ninja.snap @@ -2,7 +2,7 @@ source: tests/ninja_snapshot_tests.rs expression: ninja_content --- -rule ca3067639652d0018b982cd2fc8262e3a02f4404f60148b8493de0f656d9b1a2 +rule d3cc8be04150cb4e2d9ccbdbe94cf9f2e8ade54bb4701b8faf99cafeb456a75d command = python3 -c 'import os,sys; open(sys.argv[1],"a").close()' $out -build out/a: ca3067639652d0018b982cd2fc8262e3a02f4404f60148b8493de0f656d9b1a2 in/a +build out/a: d3cc8be04150cb4e2d9ccbdbe94cf9f2e8ade54bb4701b8faf99cafeb456a75d in/a From f5629dbcf3b875e55d764dc9dfe9e03221cfa2cc Mon Sep 17 00:00:00 2001 From: Leynos Date: Fri, 8 Aug 2025 11:47:51 +0100 Subject: [PATCH 2/2] Stream canonical JSON into action hashes --- Cargo.lock | 18 ++++++++++++++++++ Cargo.toml | 1 + docs/netsuke-design.md | 19 ++++++++++++++----- src/hasher.rs | 36 +++++++++++++++++++++++++++--------- src/ir.rs | 18 +++++++++++------- tests/hasher_tests.rs | 7 ++++--- 6 files changed, 75 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 80389185..26605112 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -820,6 +820,7 @@ dependencies = [ "semver", "serde", "serde_json", + "serde_json_canonicalizer", "serde_yml", "serial_test", "sha2", @@ -1140,6 +1141,12 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "ryu-js" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15" + [[package]] name = "same-file" version = "1.0.6" @@ -1223,6 +1230,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_json_canonicalizer" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18032888bfda612a88f6b9c7c7a12e8168686936702fe584e3f1b1fc7848443a" +dependencies = [ + "ryu-js", + "serde", + "serde_json", +] + [[package]] name = "serde_yml" version = "0.0.12" diff --git a/Cargo.toml b/Cargo.toml index d85a516f..a5a97648 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ itertools = "0.12" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["fmt"] } serde_json = "1" +serde_json_canonicalizer = "0.3" tempfile = "3.8.0" [lints.clippy] diff --git a/docs/netsuke-design.md b/docs/netsuke-design.md index 868b116f..8c0fb969 100644 --- a/docs/netsuke-design.md +++ b/docs/netsuke-design.md @@ -1267,14 +1267,23 @@ Rust #[derive(Debug, Error)] pub enum IrGenError { - #[error("rule not found: {rule_name} for target {target_name}")] - RuleNotFound { target_name: String, rule_name: String, }, + #[error("rule '{rule_name}' referenced by target '{target_name}' was not found")] + RuleNotFound { target_name: String, rule_name: String }, + + #[error("multiple rules for target '{target_name}': {rules:?}")] + MultipleRules { target_name: String, rules: Vec }, + + #[error("No rules specified for target {target_name}")] + EmptyRule { target_name: String }, + + #[error("duplicate target outputs: {outputs:?}")] + DuplicateOutput { outputs: Vec }, #[error("circular dependency detected: {cycle:?}")] - CircularDependency { cycle: Vec, }, + CircularDependency { cycle: Vec }, - #[error("dependency not found: {dependency_name} for target {target_name}")] - DependencyNotFound { target_name: String, dependency_name: String, }, } + #[error("failed to serialise action: {0}")] + ActionSerialisation(#[from] serde_json::Error), } ``` - `anyhow`: This crate will be used in the main application logic (`main.rs`) diff --git a/src/hasher.rs b/src/hasher.rs index 33039465..d34a11d1 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -26,22 +26,40 @@ use sha2::{Digest, Sha256}; use crate::ir::Action; +use serde_json_canonicalizer::to_writer; +use std::io::{self, Write}; /// Computes stable digests for [`Action`] definitions. pub struct ActionHasher; +struct DigestWriter<'a, D: Digest>(&'a mut D); + +impl Write for DigestWriter<'_, D> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.0.update(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + impl ActionHasher { /// Calculate the hash of an [`Action`]. /// - /// # Panics + /// Returns a lowercase hex-encoded SHA-256 of the action's canonical JSON. + /// + /// # Errors /// - /// Panics if the action cannot be serialised to JSON. - #[must_use] - pub fn hash(action: &Action) -> String { - // Serialise using canonical JSON so field order and absent options do - // not affect the resulting digest. - let bytes = serde_json::to_vec(action).expect("serialise action to JSON"); - let digest = Sha256::digest(bytes); - format!("{digest:x}") + /// Returns an error if the action cannot be serialised to JSON. + pub fn hash(action: &Action) -> Result { + let mut hasher = Sha256::new(); + { + // Canonical JSON: compact formatting with sorted keys. + let mut writer = DigestWriter(&mut hasher); + to_writer(action, &mut writer)?; + } + Ok(format!("{:x}", hasher.finalize())) } } diff --git a/src/ir.rs b/src/ir.rs index d1d3e651..94846f53 100644 --- a/src/ir.rs +++ b/src/ir.rs @@ -102,6 +102,9 @@ pub enum IrGenError { #[error("circular dependency detected: {cycle:?}")] CircularDependency { cycle: Vec }, + + #[error("failed to serialise action: {0}")] + ActionSerialisation(#[from] serde_json::Error), } impl BuildGraph { @@ -115,7 +118,7 @@ impl BuildGraph { let mut graph = Self::default(); let mut rule_map = HashMap::new(); - Self::process_rules(manifest, &mut graph.actions, &mut rule_map); + Self::process_rules(manifest, &mut graph.actions, &mut rule_map)?; Self::process_targets(manifest, &mut graph.actions, &mut graph.targets, &rule_map)?; Self::process_defaults(manifest, &mut graph.default_targets); @@ -128,11 +131,12 @@ impl BuildGraph { manifest: &NetsukeManifest, actions: &mut HashMap, rule_map: &mut HashMap, - ) { + ) -> Result<(), IrGenError> { for rule in &manifest.rules { - let hash = register_action(actions, rule.recipe.clone(), rule.description.clone()); + let hash = register_action(actions, rule.recipe.clone(), rule.description.clone())?; rule_map.insert(rule.name.clone(), hash); } + Ok(()) } fn process_targets( @@ -147,7 +151,7 @@ impl BuildGraph { let action_id = match &target.recipe { Recipe::Rule { rule } => resolve_rule(rule, rule_map, &target_name)?, Recipe::Command { .. } | Recipe::Script { .. } => { - register_action(actions, target.recipe.clone(), None) + register_action(actions, target.recipe.clone(), None)? } }; @@ -189,7 +193,7 @@ fn register_action( actions: &mut HashMap, recipe: Recipe, description: Option, -) -> String { +) -> Result { let action = Action { recipe, description, @@ -198,9 +202,9 @@ fn register_action( pool: None, restat: false, }; - let hash = ActionHasher::hash(&action); + let hash = ActionHasher::hash(&action).map_err(IrGenError::ActionSerialisation)?; actions.entry(hash.clone()).or_insert(action); - hash + Ok(hash) } fn map_string_or_list(sol: &StringOrList, f: F) -> Vec diff --git a/tests/hasher_tests.rs b/tests/hasher_tests.rs index 90a0692d..cb01447c 100644 --- a/tests/hasher_tests.rs +++ b/tests/hasher_tests.rs @@ -15,7 +15,7 @@ use rstest::rstest; pool: None, restat: false, }, - "b43a76a10b522e53fc0fb0fcb3354939e00d6b708252050c27100da204a811ae" + "0fe3670f0746dcec34768df158d814ac099e416b6045e7e213d0aabd6aa761cb" )] #[case( Action { @@ -59,7 +59,7 @@ use rstest::rstest; pool: None, restat: false, }, - "333d2b3f4f805b80c2e1aef1b5c9f1e0bbc990b77121c731f14edf3691ce120c" + "57023b1c00f7daf410d3d2077346e38014d3612c278aadef73a8484c94bdcb77" )] // Order of rule names influences the digest. #[case( @@ -74,5 +74,6 @@ use rstest::rstest; "d5f1a262a95b75db3a7a79a5855eb27b6b430833e7ba93538502a16ebd03f50b" )] fn hash_action_is_stable(#[case] action: Action, #[case] expected: &str) { - assert_eq!(ActionHasher::hash(&action), expected); + let digest = ActionHasher::hash(&action).expect("hash action"); + assert_eq!(digest, expected); }