From c78ffad3b22d1b22dd883888aa4adb396df54dd2 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 21:05:24 +0200 Subject: [PATCH 01/11] feat(c14n): implement XML canonicalization (inclusive + exclusive) - Inclusive C14N 1.0: all in-scope namespaces, sorted by prefix - Exclusive C14N 1.0: visibly-utilized namespaces + PrefixList - Document-order tree walker on roxmltree - Text/attribute escaping per W3C spec - Attribute sorting (ns-uri + local-name) - Empty element expansion, CDATA flattening, PI normalization - Document-level comment/PI separator handling - 25 unit + 13 integration tests (xmllint reference outputs) - Update deps: roxmltree 0.21, x509-parser 0.18 Closes #5 --- .gitignore | 2 + Cargo.toml | 4 +- src/c14n/escape.rs | 66 +++++++ src/c14n/mod.rs | 228 ++++++++++++++++++++-- src/c14n/ns_exclusive.rs | 169 ++++++++++++++++ src/c14n/ns_inclusive.rs | 140 ++++++++++++++ src/c14n/serialize.rs | 397 ++++++++++++++++++++++++++++++++++++++ src/lib.rs | 15 +- tests/c14n_integration.rs | 319 ++++++++++++++++++++++++++++++ 9 files changed, 1315 insertions(+), 25 deletions(-) create mode 100644 src/c14n/escape.rs create mode 100644 src/c14n/ns_exclusive.rs create mode 100644 src/c14n/ns_inclusive.rs create mode 100644 src/c14n/serialize.rs create mode 100644 tests/c14n_integration.rs diff --git a/.gitignore b/.gitignore index b1c081e..5716ccf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ Cargo.lock .DS_Store donors/ .refs/ +arch/ +ROADMAP.md diff --git a/Cargo.toml b/Cargo.toml index 42b7c6d..a137bbc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,13 +14,13 @@ readme = "README.md" [dependencies] # XML parsing -roxmltree = "0.20" +roxmltree = "0.21" # Crypto ring = "0.17" # X.509 certificates -x509-parser = "0.16" +x509-parser = "0.18" der = "0.7" # Error handling diff --git a/src/c14n/escape.rs b/src/c14n/escape.rs new file mode 100644 index 0000000..2bd62e5 --- /dev/null +++ b/src/c14n/escape.rs @@ -0,0 +1,66 @@ +//! Text and attribute value escaping for canonical XML. + +/// Escape text node content for canonical XML. +/// +/// Replaces: `&` → `&`, `<` → `<`, `>` → `>`, `\r` → ` ` +pub(crate) fn escape_text(s: &str, output: &mut Vec) { + for b in s.bytes() { + match b { + b'&' => output.extend_from_slice(b"&"), + b'<' => output.extend_from_slice(b"<"), + b'>' => output.extend_from_slice(b">"), + b'\r' => output.extend_from_slice(b" "), + _ => output.push(b), + } + } +} + +/// Escape attribute value for canonical XML. +/// +/// Replaces: `&` → `&`, `<` → `<`, `"` → `"`, +/// `\t` → ` `, `\n` → ` `, `\r` → ` ` +pub(crate) fn escape_attr(s: &str, output: &mut Vec) { + for b in s.bytes() { + match b { + b'&' => output.extend_from_slice(b"&"), + b'<' => output.extend_from_slice(b"<"), + b'"' => output.extend_from_slice(b"""), + b'\t' => output.extend_from_slice(b" "), + b'\n' => output.extend_from_slice(b" "), + b'\r' => output.extend_from_slice(b" "), + _ => output.push(b), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn text_escaping() { + let mut out = Vec::new(); + escape_text("a < b & c > d\r\n", &mut out); + assert_eq!( + String::from_utf8(out).expect("valid utf8"), + "a < b & c > d \n" + ); + } + + #[test] + fn attr_escaping() { + let mut out = Vec::new(); + escape_attr("he said \"hi\" & \t\n\r", &mut out); + assert_eq!( + String::from_utf8(out).expect("valid utf8"), + "he said "hi" & " + ); + } + + #[test] + fn passthrough_plain_text() { + let mut out = Vec::new(); + escape_text("hello world", &mut out); + assert_eq!(String::from_utf8(out).expect("valid utf8"), "hello world"); + } +} diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index 4a65e69..4343ae7 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -1,23 +1,217 @@ //! XML Canonicalization (C14N). //! //! Implements: -//! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) -//! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) -//! - [Exclusive XML Canonicalization](https://www.w3.org/TR/xml-exc-c14n/) +//! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) (inclusive) +//! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) (inclusive) +//! - [Exclusive XML Canonicalization 1.0](https://www.w3.org/TR/xml-exc-c14n/) (exclusive) +//! +//! # Example +//! +//! ``` +//! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; +//! +//! let xml = b""; +//! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); +//! let canonical = canonicalize_xml(xml, &algo).unwrap(); +//! assert_eq!( +//! String::from_utf8(canonical).unwrap(), +//! "" +//! ); +//! ``` + +mod escape; +pub(crate) mod ns_exclusive; +pub(crate) mod ns_inclusive; +pub(crate) mod serialize; + +use std::collections::HashSet; + +use roxmltree::{Document, Node}; + +use ns_exclusive::ExclusiveNsRenderer; +use ns_inclusive::InclusiveNsRenderer; +use serialize::serialize_canonical; -/// Canonicalization algorithm selection. +/// C14N algorithm mode (without the comments flag). #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum C14nAlgorithm { - /// Canonical XML 1.0 (with comments). - Inclusive10WithComments, - /// Canonical XML 1.0 (without comments). - Inclusive10, - /// Canonical XML 1.1 (with comments). - Inclusive11WithComments, - /// Canonical XML 1.1 (without comments). - Inclusive11, - /// Exclusive Canonical XML (with comments). - Exclusive10WithComments, - /// Exclusive Canonical XML (without comments). - Exclusive10, +pub enum C14nMode { + /// Inclusive C14N 1.0 — all in-scope namespaces rendered. + Inclusive1_0, + /// Inclusive C14N 1.1 — like 1.0 with xml:id propagation and xml:base fixup. + Inclusive1_1, + /// Exclusive C14N 1.0 — only visibly-utilized namespaces rendered. + Exclusive1_0, +} + +/// Full C14N algorithm identifier. +/// +/// Constructed from algorithm URIs found in `` or +/// `` elements. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct C14nAlgorithm { + /// The canonicalization mode. + pub mode: C14nMode, + /// Whether to preserve comment nodes. + pub with_comments: bool, + /// For Exclusive C14N: prefixes to treat as inclusive. + /// Parsed from ``. + /// `"#default"` in the PrefixList is normalized to `""` (empty string). + pub inclusive_prefixes: HashSet, +} + +impl C14nAlgorithm { + /// Create a new algorithm with the given mode and comments flag. + pub fn new(mode: C14nMode, with_comments: bool) -> Self { + Self { + mode, + with_comments, + inclusive_prefixes: HashSet::new(), + } + } + + /// Parse from an algorithm URI. Returns `None` for unrecognized URIs. + pub fn from_uri(uri: &str) -> Option { + let (mode, with_comments) = match uri { + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315" => (C14nMode::Inclusive1_0, false), + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments" => { + (C14nMode::Inclusive1_0, true) + } + "http://www.w3.org/2006/12/xml-c14n11" => (C14nMode::Inclusive1_1, false), + "http://www.w3.org/2006/12/xml-c14n11#WithComments" => (C14nMode::Inclusive1_1, true), + "http://www.w3.org/2001/10/xml-exc-c14n#" => (C14nMode::Exclusive1_0, false), + "http://www.w3.org/2001/10/xml-exc-c14n#WithComments" => (C14nMode::Exclusive1_0, true), + _ => return None, + }; + Some(Self { + mode, + with_comments, + inclusive_prefixes: HashSet::new(), + }) + } + + /// Set the InclusiveNamespaces PrefixList (exclusive C14N only). + /// `"#default"` is normalized to empty string `""`. + pub fn with_prefix_list(mut self, prefix_list: &str) -> Self { + self.inclusive_prefixes = prefix_list + .split_whitespace() + .map(|p| { + if p == "#default" { + String::new() + } else { + p.to_string() + } + }) + .collect(); + self + } + + /// Get the algorithm URI for this configuration. + pub fn uri(&self) -> &'static str { + match (self.mode, self.with_comments) { + (C14nMode::Inclusive1_0, false) => "http://www.w3.org/TR/2001/REC-xml-c14n-20010315", + (C14nMode::Inclusive1_0, true) => { + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments" + } + (C14nMode::Inclusive1_1, false) => "http://www.w3.org/2006/12/xml-c14n11", + (C14nMode::Inclusive1_1, true) => "http://www.w3.org/2006/12/xml-c14n11#WithComments", + (C14nMode::Exclusive1_0, false) => "http://www.w3.org/2001/10/xml-exc-c14n#", + (C14nMode::Exclusive1_0, true) => "http://www.w3.org/2001/10/xml-exc-c14n#WithComments", + } + } +} + +/// Error type for C14N operations. +#[derive(Debug, thiserror::Error)] +pub enum C14nError { + /// XML parsing error. + #[error("XML parse error: {0}")] + Parse(String), + /// Invalid node reference. + #[error("invalid node reference")] + InvalidNode, + /// I/O error. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), +} + +/// Canonicalize an XML document or document subset. +/// +/// - `doc`: parsed roxmltree document (read-only DOM). +/// - `node_set`: optional predicate controlling which nodes appear in output. +/// `None` means the entire document. +/// - `algo`: algorithm parameters (mode, comments, prefix list). +/// - `output`: byte buffer receiving canonical XML. +pub fn canonicalize( + doc: &Document, + node_set: Option<&dyn Fn(Node) -> bool>, + algo: &C14nAlgorithm, + output: &mut Vec, +) -> Result<(), C14nError> { + match algo.mode { + C14nMode::Inclusive1_0 | C14nMode::Inclusive1_1 => { + let renderer = InclusiveNsRenderer; + serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) + } + C14nMode::Exclusive1_0 => { + let renderer = ExclusiveNsRenderer::new(&algo.inclusive_prefixes); + serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) + } + } +} + +/// Convenience: parse XML bytes and canonicalize the whole document. +pub fn canonicalize_xml(xml: &[u8], algo: &C14nAlgorithm) -> Result, C14nError> { + let xml_str = std::str::from_utf8(xml).map_err(|e| C14nError::Parse(e.to_string()))?; + let doc = Document::parse(xml_str).map_err(|e| C14nError::Parse(e.to_string()))?; + let mut output = Vec::new(); + canonicalize(&doc, None, algo, &mut output)?; + Ok(output) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_uri_roundtrip() { + let uris = [ + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315", + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments", + "http://www.w3.org/2006/12/xml-c14n11", + "http://www.w3.org/2006/12/xml-c14n11#WithComments", + "http://www.w3.org/2001/10/xml-exc-c14n#", + "http://www.w3.org/2001/10/xml-exc-c14n#WithComments", + ]; + for uri in uris { + let algo = C14nAlgorithm::from_uri(uri).expect(uri); + assert_eq!(algo.uri(), uri); + } + } + + #[test] + fn unknown_uri_returns_none() { + assert!(C14nAlgorithm::from_uri("http://example.com/unknown").is_none()); + } + + #[test] + fn prefix_list_parsing() { + let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false) + .with_prefix_list("foo bar #default baz"); + assert!(algo.inclusive_prefixes.contains("foo")); + assert!(algo.inclusive_prefixes.contains("bar")); + assert!(algo.inclusive_prefixes.contains("baz")); + assert!(algo.inclusive_prefixes.contains("")); // #default → "" + assert_eq!(algo.inclusive_prefixes.len(), 4); + } + + #[test] + fn canonicalize_xml_basic() { + let xml = b""; + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + let result = canonicalize_xml(xml, &algo).expect("c14n"); + assert_eq!( + String::from_utf8(result).expect("utf8"), + r#""# + ); + } } diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs new file mode 100644 index 0000000..5d37294 --- /dev/null +++ b/src/c14n/ns_exclusive.rs @@ -0,0 +1,169 @@ +//! Exclusive namespace rendering for Exclusive C14N 1.0. +//! +//! In exclusive mode, only **visibly utilized** namespace prefixes are emitted +//! on each element (plus any prefixes forced by the InclusiveNamespaces PrefixList). + +use std::collections::{HashMap, HashSet}; + +use roxmltree::Node; + +use super::serialize::NsRenderer; + +/// Exclusive C14N namespace renderer. +/// +/// Only emits namespace declarations for prefixes that are visibly utilized +/// by the element's tag name or attributes, plus any forced prefixes from +/// the `InclusiveNamespaces PrefixList`. +pub(crate) struct ExclusiveNsRenderer<'a> { + inclusive_prefixes: &'a HashSet, +} + +impl<'a> ExclusiveNsRenderer<'a> { + pub(crate) fn new(inclusive_prefixes: &'a HashSet) -> Self { + Self { inclusive_prefixes } + } +} + +impl NsRenderer for ExclusiveNsRenderer<'_> { + fn render_namespaces<'n>( + &self, + node: Node<'n, '_>, + parent_rendered: &HashMap, + ) -> (Vec<(String, String)>, HashMap) { + let utilized = visibly_utilized_prefixes(node); + let mut rendered = parent_rendered.clone(); + let mut decls: Vec<(String, String)> = Vec::new(); + + for ns in node.namespaces() { + let prefix = ns.name().unwrap_or(""); + let uri = ns.uri(); + + // The `xml` prefix is never declared. + if prefix == "xml" { + continue; + } + + // Include if visibly utilized OR in the forced prefix list. + let dominated = utilized.contains(prefix) || self.inclusive_prefixes.contains(prefix); + if !dominated { + continue; + } + + // Only render if different from nearest output ancestor. + if parent_rendered.get(prefix).map(|s| s.as_str()) == Some(uri) { + continue; + } + + // Don't emit xmlns="" if no default ns was in scope. + if prefix.is_empty() && uri.is_empty() && !parent_rendered.contains_key("") { + continue; + } + + decls.push((prefix.to_string(), uri.to_string())); + } + + // Sort by prefix. + decls.sort_by(|a, b| a.0.cmp(&b.0)); + + for (prefix, uri) in &decls { + rendered.insert(prefix.clone(), uri.clone()); + } + + (decls, rendered) + } +} + +/// Determine which namespace prefixes are visibly utilized by an element. +/// +/// A prefix is visibly utilized if: +/// 1. The element's tag name uses that prefix, OR +/// 2. Any attribute on the element uses that prefix. +fn visibly_utilized_prefixes<'a>(node: Node<'a, '_>) -> HashSet<&'a str> { + let mut utilized = HashSet::new(); + + // Element's own prefix. + if let Some(ns_uri) = node.tag_name().namespace() { + match node.lookup_prefix(ns_uri) { + Some(prefix) if !prefix.is_empty() => { + utilized.insert(prefix); + } + _ if !ns_uri.is_empty() => { + // Element uses default namespace. + utilized.insert(""); + } + _ => {} + } + } + + // Attribute prefixes. + for attr in node.attributes() { + if let Some(ns_uri) = attr.namespace() { + if let Some(prefix) = node.lookup_prefix(ns_uri) { + if !prefix.is_empty() { + utilized.insert(prefix); + } + } + } + } + + utilized +} + +#[cfg(test)] +mod tests { + use super::super::serialize::serialize_canonical; + use super::*; + use roxmltree::Document; + use std::collections::HashSet; + + fn exc_c14n(xml: &str, prefix_list: &HashSet) -> String { + let doc = Document::parse(xml).expect("parse"); + let renderer = ExclusiveNsRenderer::new(prefix_list); + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + String::from_utf8(out).expect("utf8") + } + + #[test] + fn only_utilized_ns_rendered() { + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + // root uses no prefix → no ns decls on root. + // a:child uses a: → xmlns:a on child. + // b: is not utilized anywhere → not rendered. + assert!(!result.contains("xmlns:b")); + assert!(result.contains(r#""#)); + } + + #[test] + fn forced_prefix_via_prefix_list() { + let xml = r#""#; + let mut forced = HashSet::new(); + forced.insert("b".to_string()); + let result = exc_c14n(xml, &forced); + // b: is forced via PrefixList → should appear on root. + assert!(result.contains(r#"xmlns:b="http://b.com""#)); + } + + #[test] + fn sibling_elements_redeclare() { + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + // In exclusive mode, each sibling must independently declare a:. + assert!(result.contains(r#""#)); + assert!(result.contains(r#""#)); + } + + #[test] + fn default_ns_utilized() { + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + // Both root and child use the default ns. + assert!(result.contains(r#""#)); + // child inherits → parent already rendered → not redeclared. + assert_eq!( + result, + r#""# + ); + } +} diff --git a/src/c14n/ns_inclusive.rs b/src/c14n/ns_inclusive.rs new file mode 100644 index 0000000..d917aaf --- /dev/null +++ b/src/c14n/ns_inclusive.rs @@ -0,0 +1,140 @@ +//! Inclusive namespace rendering for C14N 1.0 and 1.1. +//! +//! In inclusive mode, ALL in-scope namespace declarations are emitted on each +//! element, even if the namespace prefix is not visibly used by that element's +//! tag or attributes. + +use std::collections::HashMap; + +use roxmltree::Node; + +use super::serialize::NsRenderer; + +/// Inclusive C14N namespace renderer. +/// +/// Emits all in-scope namespace bindings that differ from what the nearest +/// output ancestor already rendered. +pub(crate) struct InclusiveNsRenderer; + +impl NsRenderer for InclusiveNsRenderer { + fn render_namespaces<'a>( + &self, + node: Node<'a, '_>, + parent_rendered: &HashMap, + ) -> (Vec<(String, String)>, HashMap) { + let mut rendered = parent_rendered.clone(); + let mut decls: Vec<(String, String)> = Vec::new(); + + // Collect all in-scope namespace declarations from roxmltree. + // node.namespaces() yields every ns declared on this element or + // inherited from ancestors. + for ns in node.namespaces() { + let prefix = ns.name().unwrap_or(""); // None = default namespace + let uri = ns.uri(); + + // The `xml` prefix namespace is never declared in canonical XML. + if prefix == "xml" { + continue; + } + + // Emit only if different from what the nearest output ancestor rendered. + if parent_rendered.get(prefix).map(|u| u.as_str()) == Some(uri) { + continue; + } + + // Suppress xmlns="" when no default namespace was previously in scope. + // Only emit xmlns="" to undeclare a previously declared default ns. + if prefix.is_empty() && uri.is_empty() && !parent_rendered.contains_key("") { + continue; + } + + decls.push((prefix.to_string(), uri.to_string())); + } + + // Sort namespace declarations by prefix (lexicographic, "" sorts first). + decls.sort_by(|a, b| a.0.cmp(&b.0)); + + // Update the rendered map. + for (prefix, uri) in &decls { + rendered.insert(prefix.clone(), uri.clone()); + } + + (decls, rendered) + } +} + +#[cfg(test)] +mod tests { + use super::super::serialize::serialize_canonical; + use super::*; + use roxmltree::Document; + + #[test] + fn namespaces_rendered_on_first_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // Default ns and a: ns should appear on root. + assert!(result.contains(r#"xmlns="http://example.com""#)); + assert!(result.contains(r#"xmlns:a="http://a.com""#)); + } + + #[test] + fn inherited_ns_not_redeclared() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // xmlns:a should appear only once (on root), not redeclared on child. + assert_eq!( + result, + r#""# + ); + } + + #[test] + fn overridden_ns_is_redeclared() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // child should redeclare a: with different URI. + assert!(result.contains(r#""#)); + } + + #[test] + fn default_ns_undeclared() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // child should have xmlns="" to undeclare the default namespace. + assert!(result.contains(r#""#)); + } + + #[test] + fn ns_decls_sorted_by_prefix() { + let xml = + r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // Order should be: xmlns="" (default), xmlns:a, xmlns:z. + let idx_default = result.find(r#"xmlns=""#).expect("default ns"); + let idx_a = result.find(r#"xmlns:a="#).expect("a ns"); + let idx_z = result.find(r#"xmlns:z="#).expect("z ns"); + assert!(idx_default < idx_a, "default before a"); + assert!(idx_a < idx_z, "a before z"); + } +} diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs new file mode 100644 index 0000000..e0f4cf3 --- /dev/null +++ b/src/c14n/serialize.rs @@ -0,0 +1,397 @@ +//! Document-order serialization for canonical XML. +//! +//! Walks an XML document tree in document order, emitting canonical bytes. +//! Namespace rendering is delegated to the caller via [`NsRenderer`] trait. + +use std::collections::HashMap; + +use roxmltree::{Document, Node, NodeType}; + +use super::escape::{escape_attr, escape_text}; +use super::C14nError; + +/// Trait for namespace rendering strategies (inclusive vs exclusive). +pub(crate) trait NsRenderer { + /// Compute namespace declarations to emit for this element. + /// Returns (sorted_ns_decls, updated_rendered_map). + /// + /// `parent_rendered` maps prefix → URI for what the nearest output ancestor + /// already declared in the canonical form. + fn render_namespaces<'a>( + &self, + node: Node<'a, '_>, + parent_rendered: &HashMap, + ) -> (Vec<(String, String)>, HashMap); +} + +/// Canonicalize a document (or subset) to the output buffer. +/// +/// - `doc`: parsed roxmltree document +/// - `node_set`: optional predicate — if `Some`, only nodes where predicate +/// returns `true` are included in the output +/// - `with_comments`: whether to preserve comment nodes +/// - `ns_renderer`: namespace rendering strategy +/// - `output`: destination buffer +pub(crate) fn serialize_canonical( + doc: &Document, + node_set: Option<&dyn Fn(Node) -> bool>, + with_comments: bool, + ns_renderer: &dyn NsRenderer, + output: &mut Vec, +) -> Result<(), C14nError> { + let root = doc.root(); + serialize_children( + root, + node_set, + with_comments, + ns_renderer, + &HashMap::new(), + output, + ); + Ok(()) +} + +/// Serialize children of a node in document order. +fn serialize_children( + parent: Node, + node_set: Option<&dyn Fn(Node) -> bool>, + with_comments: bool, + ns_renderer: &dyn NsRenderer, + parent_rendered: &HashMap, + output: &mut Vec, +) { + let is_doc_root = parent.node_type() == NodeType::Root; + + for child in parent.children() { + // Node-set filtering: skip nodes not in the set. + let in_set = node_set.map_or(true, |pred| pred(child)); + + match child.node_type() { + NodeType::Element => { + if in_set { + // Before root element: emit \n if there was output before. + if is_doc_root && !output.is_empty() { + output.push(b'\n'); + } + serialize_element( + child, + node_set, + with_comments, + ns_renderer, + parent_rendered, + output, + ); + } else { + // Element not in set, but descendants might be — walk children. + serialize_children( + child, + node_set, + with_comments, + ns_renderer, + parent_rendered, + output, + ); + } + } + NodeType::Text => { + if in_set { + // Document-level text nodes are ignored by C14N. + // Only text inside elements is serialized. + if !is_doc_root { + if let Some(text) = child.text() { + escape_text(text, output); + } + } + } + } + NodeType::Comment => { + if with_comments && in_set { + if is_doc_root { + write_doc_level_separator(&child, output); + } + output.extend_from_slice(b""); + } + } + NodeType::PI => { + if in_set { + if let Some(pi) = child.pi() { + if is_doc_root { + write_doc_level_separator(&child, output); + } + output.extend_from_slice(b""); + } + } + } + NodeType::Root => { + // Should not happen as a child. + } + } + } +} + +/// Serialize a single element node (start tag + children + end tag). +fn serialize_element( + node: Node, + node_set: Option<&dyn Fn(Node) -> bool>, + with_comments: bool, + ns_renderer: &dyn NsRenderer, + parent_rendered: &HashMap, + output: &mut Vec, +) { + let (ns_decls, rendered) = ns_renderer.render_namespaces(node, parent_rendered); + + // Start tag: = node.attributes().collect(); + + // Filter attributes by node-set if applicable. + if let Some(pred) = node_set { + // For document subsets, only include attributes that are "in the set". + // roxmltree doesn't give attribute nodes separate Node identity, + // so when we have a node_set, all attributes of an included element + // are included (matching xmlsec1 behavior for typical use cases). + let _ = pred; + // All attributes included if the element is in the set. + } + + attrs.sort_by(|a, b| { + let a_key = (a.namespace().unwrap_or(""), a.name()); + let b_key = (b.namespace().unwrap_or(""), b.name()); + a_key.cmp(&b_key) + }); + + for attr in &attrs { + output.push(b' '); + write_attribute_name(node, attr, output); + output.extend_from_slice(b"=\""); + escape_attr(attr.value(), output); + output.push(b'"'); + } + + // Always use form, never self-closing. + output.push(b'>'); + + // Children. + serialize_children( + node, + node_set, + with_comments, + ns_renderer, + &rendered, + output, + ); + + // End tag. + output.extend_from_slice(b"'); +} + +/// Write `\n` separator for document-level nodes. +/// +/// C14N spec: document-level comments and PIs get `\n` between them and the +/// root element. Specifically: +/// - Before root element: comment/PI followed by `\n` +/// - After root element: `\n` followed by comment/PI +/// +/// This function emits `\n` either before or after the node, depending on +/// whether the root element has already been emitted. +fn write_doc_level_separator(node: &Node, output: &mut Vec) { + let root_elem_seen = has_preceding_element_sibling(node); + if root_elem_seen { + // After root element: \n before this node. + output.push(b'\n'); + } else if !output.is_empty() { + // Before root element but not first output: \n after previous node. + output.push(b'\n'); + } +} + +/// Check if a node has a preceding sibling that is an element. +fn has_preceding_element_sibling(node: &Node) -> bool { + let mut prev = node.prev_sibling(); + while let Some(p) = prev { + if p.is_element() { + return true; + } + prev = p.prev_sibling(); + } + false +} + +/// Write the qualified name (prefix:localname or just localname) of an element. +fn write_qualified_name(node: Node, output: &mut Vec) { + if let Some(ns_uri) = node.tag_name().namespace() { + if let Some(prefix) = node.lookup_prefix(ns_uri) { + if !prefix.is_empty() { + output.extend_from_slice(prefix.as_bytes()); + output.push(b':'); + } + } + } + output.extend_from_slice(node.tag_name().name().as_bytes()); +} + +/// Write an attribute's qualified name (prefix:localname or just localname). +fn write_attribute_name(element: Node, attr: &roxmltree::Attribute, output: &mut Vec) { + if let Some(ns_uri) = attr.namespace() { + if let Some(prefix) = element.lookup_prefix(ns_uri) { + if !prefix.is_empty() { + output.extend_from_slice(prefix.as_bytes()); + output.push(b':'); + } + } + } + output.extend_from_slice(attr.name().as_bytes()); +} + +#[cfg(test)] +mod tests { + use super::super::ns_inclusive::InclusiveNsRenderer; + use super::*; + + #[test] + fn empty_element_expanded() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "" + ); + } + + #[test] + fn text_preserved() { + let xml = " hello & world "; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + " hello & world " + ); + } + + #[test] + fn comments_stripped_by_default() { + let xml = "text"; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!(String::from_utf8(out).expect("utf8"), "text"); + } + + #[test] + fn comments_preserved_with_flag() { + let xml = "text"; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, true, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "text" + ); + } + + #[test] + fn attribute_sorting() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + r#""# + ); + } + + #[test] + fn pi_serialization() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + // XML declaration is omitted by roxmltree parsing. + // PI inside root is preserved. + assert_eq!( + String::from_utf8(out).expect("utf8"), + "" + ); + } + + #[test] + fn nested_elements_document_order() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "" + ); + } + + #[test] + fn document_level_comments() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, true, &renderer, &mut out).expect("c14n"); + // C14N spec: \n between document-level nodes. + // Before root: comment + \n + root + // After root: root + \n + comment + assert_eq!( + String::from_utf8(out).expect("utf8"), + "\n\n" + ); + } + + #[test] + fn document_level_pi_before_root() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "\n" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 992babe..aa900e6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,13 +11,16 @@ //! //! ## Quick Start //! -//! ```rust,no_run -//! use xml_sec::{XmlSigner, XmlVerifier}; +//! ```rust +//! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; //! -//! // Verify a signed XML document -//! let doc = std::fs::read_to_string("signed.xml").unwrap(); -//! let cert = std::fs::read("cert.pem").unwrap(); -//! let valid = XmlVerifier::new(&cert).verify(&doc).unwrap(); +//! let xml = b""; +//! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); +//! let canonical = canonicalize_xml(xml, &algo).unwrap(); +//! assert_eq!( +//! String::from_utf8(canonical).unwrap(), +//! "" +//! ); //! ``` #![deny(unsafe_code)] diff --git a/tests/c14n_integration.rs b/tests/c14n_integration.rs new file mode 100644 index 0000000..4a9fa6b --- /dev/null +++ b/tests/c14n_integration.rs @@ -0,0 +1,319 @@ +//! Integration tests for C14N against xmllint reference outputs. +//! +//! Reference outputs generated by: `xmllint --c14n ` and `xmllint --exc-c14n ` +//! using libxml2 (system xmllint). These are the authoritative C14N implementations. + +use xml_sec::c14n::{canonicalize_xml, C14nAlgorithm, C14nMode}; + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +fn assert_c14n(xml: &[u8], mode: C14nMode, with_comments: bool, expected: &str) { + let algo = C14nAlgorithm::new(mode, with_comments); + let result = canonicalize_xml(xml, &algo).expect("canonicalize failed"); + let result_str = String::from_utf8(result).expect("invalid utf8"); + assert_eq!( + result_str, expected, + "\n--- GOT ---\n{result_str}\n--- EXPECTED ---\n{expected}" + ); +} + +fn assert_exc_c14n(xml: &[u8], prefix_list: &str, expected: &str) { + let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false).with_prefix_list(prefix_list); + let result = canonicalize_xml(xml, &algo).expect("canonicalize failed"); + let result_str = String::from_utf8(result).expect("invalid utf8"); + assert_eq!( + result_str, expected, + "\n--- GOT ---\n{result_str}\n--- EXPECTED ---\n{expected}" + ); +} + +// ─── Simple namespace document (inclusive) ────────────────────────────────── +// Source: tests/vectors/simple-ns.xml +// Reference: xmllint --c14n + +const SIMPLE_NS_XML: &[u8] = + br#" + + + + +"#; + +#[test] +fn simple_ns_inclusive() { + // Reference from xmllint --c14n: + // - ns decls sorted: xmlns= first, then xmlns:a, xmlns:z + // - attrs sorted: a before b (no namespace, lexicographic) + // - child: ns inherited, not redeclared; attrs sorted: plain, z:attr + // - z:other: xmlns:a overridden, must redeclare + assert_c14n( + SIMPLE_NS_XML, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#""#, + ), + ); +} + +#[test] +fn simple_ns_exclusive() { + // Reference from xmllint --exc-c14n: + // - root: only xmlns= (default ns used by root tag) + // - a:child: xmlns:a, xmlns:z (used by a: prefix and z:attr) + // - inner: no ns (no prefix used, inherits default from parent) + // - z:other: xmlns:z only (a: not used by z:other) + assert_exc_c14n( + SIMPLE_NS_XML, + "", + concat!( + r#""#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#""#, + ), + ); +} + +// ─── Merlin-like document ─────────────────────────────────────────────────── +// Structure from merlin-c14n-three/signature.xml (data portion only) +// Reference from xmllint --c14n / --exc-c14n + +const MERLIN_DATA_XML: &[u8] = br#" + + + + + + + + + + + + + + + + +"#; + +#[test] +fn merlin_data_inclusive() { + // All 4 ns prefixes + default ns + xml:lang on root. + // Inner elements inherit everything, no redeclarations. + assert_c14n( + MERLIN_DATA_XML, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "", + ), + ); +} + +#[test] +fn merlin_data_exclusive() { + // Exclusive: each element only declares what it visibly uses. + // foo:Root uses foo: prefix → xmlns:foo + // xml:lang is an attribute, always present on root + // bar:Something uses bar: → xmlns:bar + // foo:Nothing uses foo: → already rendered by parent? No — exc mode + // tracks per output ancestor, foo:Root rendered foo:, so foo:Nothing inherits + // baz:Something uses baz: → xmlns:baz + // Default ns (xmlns="http://example.org/") not used by any element prefix → NOT rendered + assert_exc_c14n( + MERLIN_DATA_XML, + "", + concat!( + r#""#, + "\n", + r#" "#, + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + r#" "#, + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "", + ), + ); +} + +// ─── Attribute sorting with namespaced attributes ─────────────────────────── +// Reference from xmllint --c14n + +const ATTR_SORT_XML: &[u8] = br#" + +"#; + +#[test] +fn attr_sort_inclusive() { + // Attributes sorted by (namespace-uri, local-name): + // - plain (no ns, "") → first + // - ns1:a (http://ns1, a) + // - ns1:b (http://ns1, b) + // - ns2:a (http://ns2, a) + // - ns2:b (http://ns2, b) + assert_c14n( + ATTR_SORT_XML, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + "\n", + r#" "#, + "\n", + "", + ), + ); +} + +#[test] +fn attr_sort_exclusive() { + // In exclusive mode: root uses no ns → no decls on root. + // elem uses ns1: and ns2: → declares both. + assert_exc_c14n( + ATTR_SORT_XML, + "", + concat!( + "\n", + r#" "#, + "\n", + "", + ), + ); +} + +// ─── Comments and Processing Instructions ─────────────────────────────────── + +const COMMENTS_PI_XML: &[u8] = b"\n\n\n \n \n\n"; + +#[test] +fn comments_stripped() { + // Comments are stripped entirely. PI is first output, no leading \n. + // Trailing comment also stripped, no trailing \n. + // The whitespace text node between stripped comment and PI is at + // document level → ignored by C14N. + assert_c14n( + COMMENTS_PI_XML, + C14nMode::Inclusive1_0, + false, + concat!( + "\n", + "\n", + " \n", + " \n", + "", + ), + ); +} + +#[test] +fn comments_preserved() { + assert_c14n( + COMMENTS_PI_XML, + C14nMode::Inclusive1_0, + true, + concat!( + "\n\n", + "\n", + " \n", + " \n", + "\n", + ), + ); +} + +// ─── Idempotency ──────────────────────────────────────────────────────────── + +#[test] +fn idempotency_inclusive() { + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + let pass1 = canonicalize_xml(MERLIN_DATA_XML, &algo).expect("pass1"); + let pass2 = canonicalize_xml(&pass1, &algo).expect("pass2"); + assert_eq!(pass1, pass2, "C14N must be idempotent"); +} + +#[test] +fn idempotency_exclusive() { + let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false); + let pass1 = canonicalize_xml(MERLIN_DATA_XML, &algo).expect("pass1"); + let pass2 = canonicalize_xml(&pass1, &algo).expect("pass2"); + assert_eq!(pass1, pass2, "Exclusive C14N must be idempotent"); +} + +// ─── Empty document edge case ─────────────────────────────────────────────── + +#[test] +fn minimal_document() { + assert_c14n(b"", C14nMode::Inclusive1_0, false, ""); +} + +#[test] +fn cdata_flattened() { + // CDATA sections should be replaced with escaped text content. + assert_c14n( + b"", + C14nMode::Inclusive1_0, + false, + "a < b & c", + ); +} + +// ─── Default namespace undeclaration ──────────────────────────────────────── + +#[test] +fn default_ns_undeclaration() { + let xml = br#""#; + assert_c14n( + xml, + C14nMode::Inclusive1_0, + false, + r#""#, + ); +} From d4326d772144f9e6c3a8696ca6b1bfb1094e270a Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 21:24:48 +0200 Subject: [PATCH 02/11] fix(c14n): escape CR in PI/node content, reject C14N 1.1 - Escape \r to in PI and XML node content (C14N spec section 2.3) - C14N 1.1 returns UnsupportedAlgorithm instead of silent 1.0 fallback - Allow clippy::unwrap_used in test modules - Document lookup_prefix() limitation for aliased namespace prefixes --- src/c14n/escape.rs | 13 +++++++++++++ src/c14n/mod.rs | 25 ++++++++++++++++++++++++- src/c14n/ns_exclusive.rs | 8 +++++++- src/c14n/ns_inclusive.rs | 4 +++- src/c14n/serialize.rs | 13 ++++++++++--- 5 files changed, 57 insertions(+), 6 deletions(-) diff --git a/src/c14n/escape.rs b/src/c14n/escape.rs index 2bd62e5..0586945 100644 --- a/src/c14n/escape.rs +++ b/src/c14n/escape.rs @@ -33,7 +33,20 @@ pub(crate) fn escape_attr(s: &str, output: &mut Vec) { } } +/// Escape only carriage returns in comment/PI content for canonical XML. +/// +/// C14N spec section 2.3: `\r` in comments and PIs → ` ` +pub(crate) fn escape_cr(s: &str, output: &mut Vec) { + for b in s.bytes() { + match b { + b'\r' => output.extend_from_slice(b" "), + _ => output.push(b), + } + } +} + #[cfg(test)] +#[allow(clippy::unwrap_used)] mod tests { use super::*; diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index 4343ae7..bcf87f6 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -129,6 +129,9 @@ pub enum C14nError { /// Invalid node reference. #[error("invalid node reference")] InvalidNode, + /// Algorithm not yet implemented. + #[error("unsupported algorithm: {0}")] + UnsupportedAlgorithm(String), /// I/O error. #[error("I/O error: {0}")] Io(#[from] std::io::Error), @@ -148,10 +151,16 @@ pub fn canonicalize( output: &mut Vec, ) -> Result<(), C14nError> { match algo.mode { - C14nMode::Inclusive1_0 | C14nMode::Inclusive1_1 => { + C14nMode::Inclusive1_0 => { let renderer = InclusiveNsRenderer; serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) } + // C14N 1.1 has observable differences (xml:id propagation, xml:base fixup) + // that are not yet implemented. Fail explicitly rather than silently + // producing 1.0 output. See ROADMAP P1-007. + C14nMode::Inclusive1_1 => Err(C14nError::UnsupportedAlgorithm( + "C14N 1.1 is not yet implemented (tracked in P1-007)".to_string(), + )), C14nMode::Exclusive1_0 => { let renderer = ExclusiveNsRenderer::new(&algo.inclusive_prefixes); serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) @@ -169,6 +178,7 @@ pub fn canonicalize_xml(xml: &[u8], algo: &C14nAlgorithm) -> Result, C14 } #[cfg(test)] +#[allow(clippy::unwrap_used)] mod tests { use super::*; @@ -214,4 +224,17 @@ mod tests { r#""# ); } + + #[test] + fn c14n_1_1_returns_error() { + let xml = b""; + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_1, false); + let result = canonicalize_xml(xml, &algo); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + matches!(err, C14nError::UnsupportedAlgorithm(_)), + "expected UnsupportedAlgorithm, got: {err:?}" + ); + } } diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs index 5d37294..4c01112 100644 --- a/src/c14n/ns_exclusive.rs +++ b/src/c14n/ns_exclusive.rs @@ -78,10 +78,15 @@ impl NsRenderer for ExclusiveNsRenderer<'_> { /// A prefix is visibly utilized if: /// 1. The element's tag name uses that prefix, OR /// 2. Any attribute on the element uses that prefix. +// NOTE: roxmltree does not expose the lexical prefix from parsed QNames. +// We reverse-map via lookup_prefix(namespace_uri). This is ambiguous when +// multiple prefixes bind the same URI (e.g., xmlns:a="u" xmlns:b="u"). +// In practice this is extremely rare in SAML/XMLDSig documents. +// A proper fix requires a parser that preserves lexical prefixes. fn visibly_utilized_prefixes<'a>(node: Node<'a, '_>) -> HashSet<&'a str> { let mut utilized = HashSet::new(); - // Element's own prefix. + // Element's own prefix (reverse-mapped from namespace URI). if let Some(ns_uri) = node.tag_name().namespace() { match node.lookup_prefix(ns_uri) { Some(prefix) if !prefix.is_empty() => { @@ -110,6 +115,7 @@ fn visibly_utilized_prefixes<'a>(node: Node<'a, '_>) -> HashSet<&'a str> { } #[cfg(test)] +#[allow(clippy::unwrap_used)] mod tests { use super::super::serialize::serialize_canonical; use super::*; diff --git a/src/c14n/ns_inclusive.rs b/src/c14n/ns_inclusive.rs index d917aaf..06bb7ec 100644 --- a/src/c14n/ns_inclusive.rs +++ b/src/c14n/ns_inclusive.rs @@ -64,6 +64,7 @@ impl NsRenderer for InclusiveNsRenderer { } #[cfg(test)] +#[allow(clippy::unwrap_used)] mod tests { use super::super::serialize::serialize_canonical; use super::*; @@ -130,7 +131,8 @@ mod tests { let mut out = Vec::new(); serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); let result = String::from_utf8(out).expect("utf8"); - // Order should be: xmlns="" (default), xmlns:a, xmlns:z. + // Order should be: xmlns="..." (default), xmlns:a, xmlns:z. + // find(r#"xmlns=""#) matches the start of xmlns="http://default.com" let idx_default = result.find(r#"xmlns=""#).expect("default ns"); let idx_a = result.find(r#"xmlns:a="#).expect("a ns"); let idx_z = result.find(r#"xmlns:z="#).expect("z ns"); diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs index e0f4cf3..94e488b 100644 --- a/src/c14n/serialize.rs +++ b/src/c14n/serialize.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use roxmltree::{Document, Node, NodeType}; -use super::escape::{escape_attr, escape_text}; +use super::escape::{escape_attr, escape_cr, escape_text}; use super::C14nError; /// Trait for namespace rendering strategies (inclusive vs exclusive). @@ -111,7 +111,8 @@ fn serialize_children( } output.extend_from_slice(b""); } @@ -126,7 +127,8 @@ fn serialize_children( output.extend_from_slice(pi.target.as_bytes()); if let Some(value) = pi.value { output.push(b' '); - output.extend_from_slice(value.as_bytes()); + // C14N spec: \r in PI content must be escaped to + escape_cr(value, output); } output.extend_from_slice(b"?>"); } @@ -246,6 +248,10 @@ fn has_preceding_element_sibling(node: &Node) -> bool { } /// Write the qualified name (prefix:localname or just localname) of an element. +/// +/// Uses `lookup_prefix()` to reverse-map namespace URI → prefix. +/// This is ambiguous when multiple prefixes bind the same URI; see +/// ns_exclusive.rs comment on `visibly_utilized_prefixes()`. fn write_qualified_name(node: Node, output: &mut Vec) { if let Some(ns_uri) = node.tag_name().namespace() { if let Some(prefix) = node.lookup_prefix(ns_uri) { @@ -272,6 +278,7 @@ fn write_attribute_name(element: Node, attr: &roxmltree::Attribute, output: &mut } #[cfg(test)] +#[allow(clippy::unwrap_used)] mod tests { use super::super::ns_inclusive::InclusiveNsRenderer; use super::*; From e4274642e32037ca798ec4c8730a5d5e9c6f757b Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 21:31:10 +0200 Subject: [PATCH 03/11] refactor(c14n): encapsulate C14nAlgorithm fields, remove dead code - Make C14nAlgorithm fields private, add accessor methods - Remove dead attribute filter block with let _ = pred suppression - Replace with doc explaining roxmltree attribute node limitation --- src/c14n/mod.rs | 35 ++++++++++++++++++++++++----------- src/c14n/serialize.rs | 14 +++----------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index bcf87f6..792dfd2 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -2,7 +2,8 @@ //! //! Implements: //! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) (inclusive) -//! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) (inclusive) +//! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) — URI parsing only; +//! canonicalization returns `UnsupportedAlgorithm` (1.1-specific rules not yet implemented) //! - [Exclusive XML Canonicalization 1.0](https://www.w3.org/TR/xml-exc-c14n/) (exclusive) //! //! # Example @@ -49,17 +50,29 @@ pub enum C14nMode { /// `` elements. #[derive(Debug, Clone, PartialEq, Eq)] pub struct C14nAlgorithm { - /// The canonicalization mode. - pub mode: C14nMode, - /// Whether to preserve comment nodes. - pub with_comments: bool, - /// For Exclusive C14N: prefixes to treat as inclusive. - /// Parsed from ``. - /// `"#default"` in the PrefixList is normalized to `""` (empty string). - pub inclusive_prefixes: HashSet, + mode: C14nMode, + with_comments: bool, + /// For Exclusive C14N: prefixes forced via InclusiveNamespaces PrefixList. + /// `"#default"` is normalized to `""` (empty string) by `with_prefix_list()`. + inclusive_prefixes: HashSet, } impl C14nAlgorithm { + /// The canonicalization mode. + pub fn mode(&self) -> C14nMode { + self.mode + } + + /// Whether comment nodes are preserved. + pub fn with_comments(&self) -> bool { + self.with_comments + } + + /// Prefixes forced via InclusiveNamespaces PrefixList (exclusive C14N). + pub fn inclusive_prefixes(&self) -> &HashSet { + &self.inclusive_prefixes + } + /// Create a new algorithm with the given mode and comments flag. pub fn new(mode: C14nMode, with_comments: bool) -> Self { Self { @@ -157,9 +170,9 @@ pub fn canonicalize( } // C14N 1.1 has observable differences (xml:id propagation, xml:base fixup) // that are not yet implemented. Fail explicitly rather than silently - // producing 1.0 output. See ROADMAP P1-007. + // producing 1.0 output. C14nMode::Inclusive1_1 => Err(C14nError::UnsupportedAlgorithm( - "C14N 1.1 is not yet implemented (tracked in P1-007)".to_string(), + "C14N 1.1 is not yet implemented".to_string(), )), C14nMode::Exclusive1_0 => { let renderer = ExclusiveNsRenderer::new(&algo.inclusive_prefixes); diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs index 94e488b..530e24f 100644 --- a/src/c14n/serialize.rs +++ b/src/c14n/serialize.rs @@ -170,18 +170,10 @@ fn serialize_element( } // Regular attributes, sorted by (namespace-uri, local-name). + // Note: roxmltree doesn't expose attribute nodes with separate Node identity, + // so node_set filtering cannot distinguish individual attributes. When an + // element is in the set, all its attributes are included (matches xmlsec1). let mut attrs: Vec<_> = node.attributes().collect(); - - // Filter attributes by node-set if applicable. - if let Some(pred) = node_set { - // For document subsets, only include attributes that are "in the set". - // roxmltree doesn't give attribute nodes separate Node identity, - // so when we have a node_set, all attributes of an included element - // are included (matching xmlsec1 behavior for typical use cases). - let _ = pred; - // All attributes included if the element is in the set. - } - attrs.sort_by(|a, b| { let a_key = (a.namespace().unwrap_or(""), a.name()); let b_key = (b.namespace().unwrap_or(""), b.name()); From 0f8899230e64a1a95cdb68fe54d6d0d1820b352b Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 21:43:37 +0200 Subject: [PATCH 04/11] fix(c14n): clarify UTF-8 error, fix inclusive ns doc, document subset edge case - UTF-8 decode error now says "invalid UTF-8:" instead of "XML parse error" - Inclusive ns doc: clarify redundant redeclarations are suppressed - Document doc-level newline limitation with subset filtering --- src/c14n/mod.rs | 3 ++- src/c14n/ns_inclusive.rs | 10 ++++++---- src/c14n/serialize.rs | 7 ++++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index 792dfd2..9d7ca2c 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -183,7 +183,8 @@ pub fn canonicalize( /// Convenience: parse XML bytes and canonicalize the whole document. pub fn canonicalize_xml(xml: &[u8], algo: &C14nAlgorithm) -> Result, C14nError> { - let xml_str = std::str::from_utf8(xml).map_err(|e| C14nError::Parse(e.to_string()))?; + let xml_str = + std::str::from_utf8(xml).map_err(|e| C14nError::Parse(format!("invalid UTF-8: {e}")))?; let doc = Document::parse(xml_str).map_err(|e| C14nError::Parse(e.to_string()))?; let mut output = Vec::new(); canonicalize(&doc, None, algo, &mut output)?; diff --git a/src/c14n/ns_inclusive.rs b/src/c14n/ns_inclusive.rs index 06bb7ec..f0a10ca 100644 --- a/src/c14n/ns_inclusive.rs +++ b/src/c14n/ns_inclusive.rs @@ -1,8 +1,10 @@ -//! Inclusive namespace rendering for C14N 1.0 and 1.1. +//! Inclusive namespace rendering for C14N 1.0. //! -//! In inclusive mode, ALL in-scope namespace declarations are emitted on each -//! element, even if the namespace prefix is not visibly used by that element's -//! tag or attributes. +//! In inclusive mode, all in-scope namespace declarations are rendered on each +//! element unless the same binding was already emitted by the nearest output +//! ancestor (suppressing redundant redeclarations). Unlike exclusive C14N, +//! namespaces are rendered even if not visibly used by the element's tag or +//! attributes. use std::collections::HashMap; diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs index 530e24f..bac8899 100644 --- a/src/c14n/serialize.rs +++ b/src/c14n/serialize.rs @@ -69,7 +69,12 @@ fn serialize_children( match child.node_type() { NodeType::Element => { if in_set { - // Before root element: emit \n if there was output before. + // Before root element: emit \n separator if there was output before + // (e.g., a preceding comment/PI). Note: when node_set excludes the + // root element but includes preceding comments, those comments won't + // get a trailing \n — this is acceptable because document subsets + // that exclude the root element are only used with XPath transforms, + // which are not yet implemented. if is_doc_root && !output.is_empty() { output.push(b'\n'); } From 55f78206f3565a91a2622ab0d6dbebbd3337bbeb Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 21:45:52 +0200 Subject: [PATCH 05/11] docs(c14n): clarify with_prefix_list is exclusive-mode only --- src/c14n/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index 9d7ca2c..f33b85b 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -104,6 +104,9 @@ impl C14nAlgorithm { /// Set the InclusiveNamespaces PrefixList (exclusive C14N only). /// `"#default"` is normalized to empty string `""`. + /// + /// Only meaningful for [`C14nMode::Exclusive1_0`]. For inclusive modes, + /// the prefix list is ignored during canonicalization. pub fn with_prefix_list(mut self, prefix_list: &str) -> Self { self.inclusive_prefixes = prefix_list .split_whitespace() From 74f2e9d30aff8bebdba337303b4ec610f7f60bf3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 22:02:32 +0200 Subject: [PATCH 06/11] docs(c14n): add UTF-8 requirement doc, remove stale file reference --- src/c14n/mod.rs | 4 ++++ src/c14n/serialize.rs | 4 ++++ tests/c14n_integration.rs | 3 +-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index f33b85b..f815348 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -185,6 +185,10 @@ pub fn canonicalize( } /// Convenience: parse XML bytes and canonicalize the whole document. +/// +/// Input must be valid UTF-8 (XML 1.0 documents are UTF-8 or declare their +/// encoding; roxmltree only accepts UTF-8). Returns `C14nError::Parse` for +/// invalid UTF-8 or malformed XML. pub fn canonicalize_xml(xml: &[u8], algo: &C14nAlgorithm) -> Result, C14nError> { let xml_str = std::str::from_utf8(xml).map_err(|e| C14nError::Parse(format!("invalid UTF-8: {e}")))?; diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs index bac8899..bae71a1 100644 --- a/src/c14n/serialize.rs +++ b/src/c14n/serialize.rs @@ -221,6 +221,10 @@ fn serialize_element( /// /// This function emits `\n` either before or after the node, depending on /// whether the root element has already been emitted. +/// +/// Limitation: when `node_set` excludes the root element, the `\n` logic may +/// be incorrect for preceding comments/PIs. This only affects XPath-selected +/// subsets that exclude the root — not relevant for SAML enveloped signatures. fn write_doc_level_separator(node: &Node, output: &mut Vec) { let root_elem_seen = has_preceding_element_sibling(node); if root_elem_seen { diff --git a/tests/c14n_integration.rs b/tests/c14n_integration.rs index 4a9fa6b..d2d906a 100644 --- a/tests/c14n_integration.rs +++ b/tests/c14n_integration.rs @@ -28,8 +28,7 @@ fn assert_exc_c14n(xml: &[u8], prefix_list: &str, expected: &str) { } // ─── Simple namespace document (inclusive) ────────────────────────────────── -// Source: tests/vectors/simple-ns.xml -// Reference: xmllint --c14n +// Reference output verified against: xmllint --c14n const SIMPLE_NS_XML: &[u8] = br#" From a9fb11b776bfb8df4667d352eff15cc37c7aabbc Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 15 Mar 2026 22:24:50 +0200 Subject: [PATCH 07/11] perf(c14n): document clone() cost and deferred optimization path --- src/c14n/ns_exclusive.rs | 2 ++ src/c14n/ns_inclusive.rs | 2 ++ src/c14n/serialize.rs | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs index 4c01112..53f4623 100644 --- a/src/c14n/ns_exclusive.rs +++ b/src/c14n/ns_exclusive.rs @@ -31,6 +31,8 @@ impl NsRenderer for ExclusiveNsRenderer<'_> { parent_rendered: &HashMap, ) -> (Vec<(String, String)>, HashMap) { let utilized = visibly_utilized_prefixes(node); + // Clone is O(n) per element. Acceptable for typical XML depths (<20 levels). + // Optimization: pass &mut and restore on backtrack — deferred to perf phase. let mut rendered = parent_rendered.clone(); let mut decls: Vec<(String, String)> = Vec::new(); diff --git a/src/c14n/ns_inclusive.rs b/src/c14n/ns_inclusive.rs index f0a10ca..5959c06 100644 --- a/src/c14n/ns_inclusive.rs +++ b/src/c14n/ns_inclusive.rs @@ -24,6 +24,8 @@ impl NsRenderer for InclusiveNsRenderer { node: Node<'a, '_>, parent_rendered: &HashMap, ) -> (Vec<(String, String)>, HashMap) { + // Clone is O(n) per element. Acceptable for typical XML depths (<20 levels). + // Optimization: pass &mut and restore on backtrack — deferred to perf phase. let mut rendered = parent_rendered.clone(); let mut decls: Vec<(String, String)> = Vec::new(); diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs index bae71a1..33e1e37 100644 --- a/src/c14n/serialize.rs +++ b/src/c14n/serialize.rs @@ -266,6 +266,8 @@ fn write_qualified_name(node: Node, output: &mut Vec) { } /// Write an attribute's qualified name (prefix:localname or just localname). +/// +/// Uses `lookup_prefix()` reverse-mapping (see `write_qualified_name` doc). fn write_attribute_name(element: Node, attr: &roxmltree::Attribute, output: &mut Vec) { if let Some(ns_uri) = attr.namespace() { if let Some(prefix) = element.lookup_prefix(ns_uri) { From 7f8ca368e72dfc81ccba4db294687d9f41057c1d Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 16 Mar 2026 00:47:06 +0200 Subject: [PATCH 08/11] fix(c14n): use lexical prefixes from source XML instead of lookup_prefix() Replace ambiguous lookup_prefix(namespace_uri) reverse-mapping with direct extraction of lexical prefixes from source XML byte positions using roxmltree's `positions` feature. - Enable roxmltree `positions` feature for byte-range access - Add prefix.rs: element_prefix() and attribute_prefix() extract the actual prefix used in the source QName via Node::range() and Attribute::range_qname() - Replace lookup_prefix() in serialize.rs write_qualified_name and write_attribute_name - Replace lookup_prefix() in ns_exclusive.rs visibly_utilized_prefixes - Add regression tests for aliased prefixes (same URI, different prefix) --- Cargo.toml | 2 +- src/c14n/mod.rs | 1 + src/c14n/ns_exclusive.rs | 39 +++++------- src/c14n/prefix.rs | 122 ++++++++++++++++++++++++++++++++++++++ src/c14n/serialize.rs | 30 ++++------ tests/c14n_integration.rs | 34 +++++++++++ 6 files changed, 185 insertions(+), 43 deletions(-) create mode 100644 src/c14n/prefix.rs diff --git a/Cargo.toml b/Cargo.toml index a137bbc..ebdbbbd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ readme = "README.md" [dependencies] # XML parsing -roxmltree = "0.21" +roxmltree = { version = "0.21", features = ["positions"] } # Crypto ring = "0.17" diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index f815348..eef0ea6 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -23,6 +23,7 @@ mod escape; pub(crate) mod ns_exclusive; pub(crate) mod ns_inclusive; +mod prefix; pub(crate) mod serialize; use std::collections::HashSet; diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs index 53f4623..4b5fa7a 100644 --- a/src/c14n/ns_exclusive.rs +++ b/src/c14n/ns_exclusive.rs @@ -7,6 +7,7 @@ use std::collections::{HashMap, HashSet}; use roxmltree::Node; +use super::prefix::{attribute_prefix, element_prefix}; use super::serialize::NsRenderer; /// Exclusive C14N namespace renderer. @@ -80,36 +81,26 @@ impl NsRenderer for ExclusiveNsRenderer<'_> { /// A prefix is visibly utilized if: /// 1. The element's tag name uses that prefix, OR /// 2. Any attribute on the element uses that prefix. -// NOTE: roxmltree does not expose the lexical prefix from parsed QNames. -// We reverse-map via lookup_prefix(namespace_uri). This is ambiguous when -// multiple prefixes bind the same URI (e.g., xmlns:a="u" xmlns:b="u"). -// In practice this is extremely rare in SAML/XMLDSig documents. -// A proper fix requires a parser that preserves lexical prefixes. +/// +/// Uses lexical prefixes extracted from source XML byte positions, +/// avoiding ambiguity when multiple prefixes bind the same namespace URI. fn visibly_utilized_prefixes<'a>(node: Node<'a, '_>) -> HashSet<&'a str> { let mut utilized = HashSet::new(); - // Element's own prefix (reverse-mapped from namespace URI). - if let Some(ns_uri) = node.tag_name().namespace() { - match node.lookup_prefix(ns_uri) { - Some(prefix) if !prefix.is_empty() => { - utilized.insert(prefix); - } - _ if !ns_uri.is_empty() => { - // Element uses default namespace. - utilized.insert(""); - } - _ => {} - } + // Element's own lexical prefix from source XML. + let el_prefix = element_prefix(node); + if !el_prefix.is_empty() { + utilized.insert(el_prefix); + } else if node.tag_name().namespace().is_some() { + // Unprefixed element with namespace = default namespace utilized. + utilized.insert(""); } - // Attribute prefixes. + // Attribute lexical prefixes from source XML. for attr in node.attributes() { - if let Some(ns_uri) = attr.namespace() { - if let Some(prefix) = node.lookup_prefix(ns_uri) { - if !prefix.is_empty() { - utilized.insert(prefix); - } - } + let attr_prefix = attribute_prefix(node, &attr); + if !attr_prefix.is_empty() { + utilized.insert(attr_prefix); } } diff --git a/src/c14n/prefix.rs b/src/c14n/prefix.rs new file mode 100644 index 0000000..1ec1a84 --- /dev/null +++ b/src/c14n/prefix.rs @@ -0,0 +1,122 @@ +//! Lexical prefix extraction from roxmltree nodes. +//! +//! roxmltree's DOM stores only `(namespace_uri, local_name)` — the lexical +//! prefix is discarded during parsing. We recover it from the source XML +//! using byte-range positions (`roxmltree` `positions` feature). +//! +//! This avoids ambiguity when multiple prefixes bind the same namespace URI +//! (e.g., `xmlns:a="u" xmlns:b="u"`), where `lookup_prefix()` would return +//! an arbitrary match. + +use roxmltree::{Attribute, Node}; + +/// Extract the lexical prefix of an element from the source XML. +/// +/// Returns `""` for unprefixed elements, or the prefix string (e.g., `"foo"` +/// for ``). +pub(crate) fn element_prefix<'a>(node: Node<'a, '_>) -> &'a str { + let input = node.document().input_text(); + let range = node.range(); + + // range starts at '<', skip it to get to the QName + let tag_start = range.start + 1; + let tag_bytes = input.as_bytes(); + + // Find end of QName: first space, '>', or '/' + let mut qname_end = tag_start; + while qname_end < input.len() { + match tag_bytes[qname_end] { + b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/' => break, + _ => qname_end += 1, + } + } + + let qname = &input[tag_start..qname_end]; + match qname.find(':') { + Some(pos) => &qname[..pos], + None => "", + } +} + +/// Extract the lexical prefix of an attribute from the source XML. +/// +/// Returns `""` for unprefixed attributes, or the prefix string (e.g., `"xml"` +/// for `xml:lang="en"`). +pub(crate) fn attribute_prefix<'a>(node: Node<'a, '_>, attr: &Attribute<'a, '_>) -> &'a str { + let input = node.document().input_text(); + let qname_range = attr.range_qname(); + let qname = &input[qname_range]; + match qname.find(':') { + Some(pos) => &qname[..pos], + None => "", + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use roxmltree::Document; + + #[test] + fn prefixed_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + assert_eq!(element_prefix(root), "foo"); + } + + #[test] + fn unprefixed_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + assert_eq!(element_prefix(root), ""); + } + + #[test] + fn nested_prefixed_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let child = doc.root_element().first_element_child().expect("child"); + assert_eq!(element_prefix(child), "b"); + } + + #[test] + fn prefixed_attribute() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let attr = root.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(root, &attr), "ns"); + } + + #[test] + fn unprefixed_attribute() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let attr = root.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(root, &attr), ""); + } + + #[test] + fn xml_lang_attribute() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let attr = root.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(root, &attr), "xml"); + } + + #[test] + fn aliased_prefixes_same_uri() { + // Two prefixes bound to same URI — element uses "b", not "a" + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let child = doc.root_element().first_element_child().expect("child"); + assert_eq!(element_prefix(child), "b"); + let attr = child.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(child, &attr), "a"); + } +} diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs index 33e1e37..35e9dc2 100644 --- a/src/c14n/serialize.rs +++ b/src/c14n/serialize.rs @@ -8,6 +8,7 @@ use std::collections::HashMap; use roxmltree::{Document, Node, NodeType}; use super::escape::{escape_attr, escape_cr, escape_text}; +use super::prefix::{attribute_prefix, element_prefix}; use super::C14nError; /// Trait for namespace rendering strategies (inclusive vs exclusive). @@ -250,32 +251,25 @@ fn has_preceding_element_sibling(node: &Node) -> bool { /// Write the qualified name (prefix:localname or just localname) of an element. /// -/// Uses `lookup_prefix()` to reverse-map namespace URI → prefix. -/// This is ambiguous when multiple prefixes bind the same URI; see -/// ns_exclusive.rs comment on `visibly_utilized_prefixes()`. +/// Extracts the lexical prefix from the source XML via byte-range positions, +/// avoiding ambiguity when multiple prefixes bind the same namespace URI. fn write_qualified_name(node: Node, output: &mut Vec) { - if let Some(ns_uri) = node.tag_name().namespace() { - if let Some(prefix) = node.lookup_prefix(ns_uri) { - if !prefix.is_empty() { - output.extend_from_slice(prefix.as_bytes()); - output.push(b':'); - } - } + let prefix = element_prefix(node); + if !prefix.is_empty() { + output.extend_from_slice(prefix.as_bytes()); + output.push(b':'); } output.extend_from_slice(node.tag_name().name().as_bytes()); } /// Write an attribute's qualified name (prefix:localname or just localname). /// -/// Uses `lookup_prefix()` reverse-mapping (see `write_qualified_name` doc). +/// Extracts the lexical prefix from the source XML via byte-range positions. fn write_attribute_name(element: Node, attr: &roxmltree::Attribute, output: &mut Vec) { - if let Some(ns_uri) = attr.namespace() { - if let Some(prefix) = element.lookup_prefix(ns_uri) { - if !prefix.is_empty() { - output.extend_from_slice(prefix.as_bytes()); - output.push(b':'); - } - } + let prefix = attribute_prefix(element, attr); + if !prefix.is_empty() { + output.extend_from_slice(prefix.as_bytes()); + output.push(b':'); } output.extend_from_slice(attr.name().as_bytes()); } diff --git a/tests/c14n_integration.rs b/tests/c14n_integration.rs index d2d906a..4b56b93 100644 --- a/tests/c14n_integration.rs +++ b/tests/c14n_integration.rs @@ -304,6 +304,40 @@ fn cdata_flattened() { ); } +// ─── Prefix aliasing (multiple prefixes for same URI) ─────────────────────── + +#[test] +fn aliased_prefixes_exclusive() { + // Two prefixes bound to the same URI. Each element/attribute must preserve + // the lexical prefix actually used in the source, not an arbitrary one + // returned by lookup_prefix(). + let xml = br#""#; + assert_exc_c14n( + xml, + "", + concat!( + "", + r#""#, + "", + ), + ); +} + +#[test] +fn aliased_prefixes_inclusive() { + let xml = br#""#; + assert_c14n( + xml, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + r#""#, + "", + ), + ); +} + // ─── Default namespace undeclaration ──────────────────────────────────────── #[test] From 026a410906e4329f2a19d5acd0eb599a05602bf3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 16 Mar 2026 01:06:41 +0200 Subject: [PATCH 09/11] fix(c14n): correct xmlns="" suppression for document subsets When canonicalizing a document subset where ancestor elements are excluded, xmlns="" must still be emitted if the source tree has a non-empty default namespace that needs undeclaring. Previously, suppression was based only on parent_rendered (output ancestors), which missed inherited default namespaces from non-output ancestors. - Add has_in_scope_default_namespace() to check ancestor chain - Use it in both inclusive and exclusive ns renderers - Add subset integration tests for xmlns="" edge cases --- src/c14n/ns_exclusive.rs | 13 +++++--- src/c14n/ns_inclusive.rs | 18 ++++++++--- src/c14n/prefix.rs | 60 ++++++++++++++++++++++++++++++++++ tests/c14n_integration.rs | 68 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 150 insertions(+), 9 deletions(-) diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs index 4b5fa7a..dc04b36 100644 --- a/src/c14n/ns_exclusive.rs +++ b/src/c14n/ns_exclusive.rs @@ -7,7 +7,7 @@ use std::collections::{HashMap, HashSet}; use roxmltree::Node; -use super::prefix::{attribute_prefix, element_prefix}; +use super::prefix::{attribute_prefix, element_prefix, has_in_scope_default_namespace}; use super::serialize::NsRenderer; /// Exclusive C14N namespace renderer. @@ -57,9 +57,14 @@ impl NsRenderer for ExclusiveNsRenderer<'_> { continue; } - // Don't emit xmlns="" if no default ns was in scope. - if prefix.is_empty() && uri.is_empty() && !parent_rendered.contains_key("") { - continue; + // Suppress xmlns="" when no non-empty default namespace is in scope. + // Check source tree for subsets where output ancestors may be absent. + if prefix.is_empty() && uri.is_empty() { + let has_rendered_default = parent_rendered.contains_key(""); + let has_in_scope_default = has_in_scope_default_namespace(node); + if !has_rendered_default && !has_in_scope_default { + continue; + } } decls.push((prefix.to_string(), uri.to_string())); diff --git a/src/c14n/ns_inclusive.rs b/src/c14n/ns_inclusive.rs index 5959c06..90780a4 100644 --- a/src/c14n/ns_inclusive.rs +++ b/src/c14n/ns_inclusive.rs @@ -10,6 +10,7 @@ use std::collections::HashMap; use roxmltree::Node; +use super::prefix::has_in_scope_default_namespace; use super::serialize::NsRenderer; /// Inclusive C14N namespace renderer. @@ -46,10 +47,19 @@ impl NsRenderer for InclusiveNsRenderer { continue; } - // Suppress xmlns="" when no default namespace was previously in scope. - // Only emit xmlns="" to undeclare a previously declared default ns. - if prefix.is_empty() && uri.is_empty() && !parent_rendered.contains_key("") { - continue; + // Suppress xmlns="" when no non-empty default namespace is in scope. + // Check the source tree (not just parent_rendered) to handle document + // subsets where output ancestors may be absent. + if prefix.is_empty() && uri.is_empty() { + // Only emit xmlns="" if there's actually a default ns to undeclare. + // parent_rendered tracks what output ancestors emitted; but for subsets, + // a non-output ancestor may have declared xmlns="uri" which needs + // undeclaring. Fall back to source tree inspection. + let has_rendered_default = parent_rendered.contains_key(""); + let has_in_scope_default = has_in_scope_default_namespace(node); + if !has_rendered_default && !has_in_scope_default { + continue; + } } decls.push((prefix.to_string(), uri.to_string())); diff --git a/src/c14n/prefix.rs b/src/c14n/prefix.rs index 1ec1a84..88303e9 100644 --- a/src/c14n/prefix.rs +++ b/src/c14n/prefix.rs @@ -52,6 +52,34 @@ pub(crate) fn attribute_prefix<'a>(node: Node<'a, '_>, attr: &Attribute<'a, '_>) } } +/// Check whether `xmlns=""` on `node` would be meaningful — i.e., whether +/// any ancestor in the source tree has a non-empty default namespace that +/// this `xmlns=""` would undeclare. +/// +/// This is needed for correct `xmlns=""` suppression in C14N document subsets: +/// when output ancestors are absent, `parent_rendered` alone cannot determine +/// whether `xmlns=""` is meaningful. We check the source tree ancestors. +/// +/// Returns `true` if any ancestor element has `xmlns=""`, +/// meaning `xmlns=""` on `node` is an active undeclaration. +pub(crate) fn has_in_scope_default_namespace(node: Node) -> bool { + // Walk ancestors to find any default namespace declaration. + let mut current = node.parent(); + while let Some(n) = current { + if n.is_element() { + for ns in n.namespaces() { + if ns.name().is_none() { + // Found a default namespace on an ancestor. + // Non-empty URI means xmlns="" would undeclare it. + return !ns.uri().is_empty(); + } + } + } + current = n.parent(); + } + false +} + #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { @@ -119,4 +147,36 @@ mod tests { let attr = child.attributes().next().expect("attr"); assert_eq!(attribute_prefix(child, &attr), "a"); } + + #[test] + fn has_default_ns_from_ancestor() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let child = root.first_element_child().expect("child"); + // root has no ancestor with default ns → false + assert!(!has_in_scope_default_namespace(root)); + // child's parent (root) has xmlns="http://example.com" → true + // (child's own xmlns="" means it needs to undeclare it) + assert!(has_in_scope_default_namespace(child)); + } + + #[test] + fn no_default_ns() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let child = root.first_element_child().expect("child"); + assert!(!has_in_scope_default_namespace(root)); + assert!(!has_in_scope_default_namespace(child)); + } + + #[test] + fn inherited_default_ns() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let child = doc.root_element().first_element_child().expect("child"); + // child's parent (root) has xmlns="http://example.com" → true + assert!(has_in_scope_default_namespace(child)); + } } diff --git a/tests/c14n_integration.rs b/tests/c14n_integration.rs index 4b56b93..c05956a 100644 --- a/tests/c14n_integration.rs +++ b/tests/c14n_integration.rs @@ -3,7 +3,7 @@ //! Reference outputs generated by: `xmllint --c14n ` and `xmllint --exc-c14n ` //! using libxml2 (system xmllint). These are the authoritative C14N implementations. -use xml_sec::c14n::{canonicalize_xml, C14nAlgorithm, C14nMode}; +use xml_sec::c14n::{canonicalize, canonicalize_xml, C14nAlgorithm, C14nMode}; // ─── Helpers ──────────────────────────────────────────────────────────────── @@ -350,3 +350,69 @@ fn default_ns_undeclaration() { r#""#, ); } + +// ─── Document subset: xmlns="" with excluded ancestor ─────────────────────── + +#[test] +fn subset_xmlns_empty_with_excluded_default_ns_ancestor() { + // Scenario: root declares xmlns="http://u", child undeclares with xmlns="". + // Node-set includes only child (root excluded). + // parent_rendered is empty (root wasn't rendered), but child MUST emit + // xmlns="" to undeclare the inherited default namespace from source tree. + let xml = r#""#; + let doc = roxmltree::Document::parse(xml).expect("parse"); + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + + let child_id = doc + .root_element() + .first_element_child() + .expect("child") + .id(); + + let mut output = Vec::new(); + canonicalize( + &doc, + Some(&|node| node.id() == child_id), + &algo, + &mut output, + ) + .expect("c14n"); + + let result = String::from_utf8(output).expect("utf8"); + // child must emit xmlns="" even though parent_rendered has no default ns, + // because the source tree has xmlns="http://example.com" in scope. + assert!( + result.contains(r#"xmlns="""#), + "xmlns=\"\" must be emitted to undeclare inherited default ns. Got: {result}" + ); +} + +#[test] +fn subset_no_spurious_xmlns_empty() { + // Scenario: no default namespace anywhere in document. + // Node-set includes only child. xmlns="" must NOT appear. + let xml = r#""#; + let doc = roxmltree::Document::parse(xml).expect("parse"); + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + + let child_id = doc + .root_element() + .first_element_child() + .expect("child") + .id(); + + let mut output = Vec::new(); + canonicalize( + &doc, + Some(&|node| node.id() == child_id), + &algo, + &mut output, + ) + .expect("c14n"); + + let result = String::from_utf8(output).expect("utf8"); + assert!( + !result.contains("xmlns="), + "xmlns=\"\" must NOT appear when no default ns in scope. Got: {result}" + ); +} From 04fc3a212d9cae5896d0a8732873c63109d5e0b3 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 16 Mar 2026 01:15:54 +0200 Subject: [PATCH 10/11] fix(c14n): use Result in doctests, fix has_in_scope_default_namespace doc - Replace .unwrap() with ? in lib.rs and c14n/mod.rs doctests to comply with deny(clippy::unwrap_used) on doctest targets - Fix has_in_scope_default_namespace doc: says "nearest" not "any" ancestor, matching the actual implementation behavior --- src/c14n/mod.rs | 7 +++++-- src/c14n/prefix.rs | 14 ++++++++------ src/lib.rs | 7 +++++-- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index eef0ea6..a129011 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -9,15 +9,18 @@ //! # Example //! //! ``` +//! # fn main() -> Result<(), Box> { //! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; //! //! let xml = b""; //! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); -//! let canonical = canonicalize_xml(xml, &algo).unwrap(); +//! let canonical = canonicalize_xml(xml, &algo)?; //! assert_eq!( -//! String::from_utf8(canonical).unwrap(), +//! String::from_utf8(canonical)?, //! "" //! ); +//! # Ok(()) +//! # } //! ``` mod escape; diff --git a/src/c14n/prefix.rs b/src/c14n/prefix.rs index 88303e9..037eec0 100644 --- a/src/c14n/prefix.rs +++ b/src/c14n/prefix.rs @@ -53,17 +53,19 @@ pub(crate) fn attribute_prefix<'a>(node: Node<'a, '_>, attr: &Attribute<'a, '_>) } /// Check whether `xmlns=""` on `node` would be meaningful — i.e., whether -/// any ancestor in the source tree has a non-empty default namespace that -/// this `xmlns=""` would undeclare. +/// the in-scope default namespace (as determined by the nearest ancestor +/// `xmlns` declaration) is non-empty and would be undeclared. /// /// This is needed for correct `xmlns=""` suppression in C14N document subsets: /// when output ancestors are absent, `parent_rendered` alone cannot determine -/// whether `xmlns=""` is meaningful. We check the source tree ancestors. +/// whether `xmlns=""` is meaningful. We walk the source tree ancestors to +/// find the effective in-scope default namespace. /// -/// Returns `true` if any ancestor element has `xmlns=""`, -/// meaning `xmlns=""` on `node` is an active undeclaration. +/// Returns `true` if the nearest ancestor element with a default-namespace +/// declaration has `xmlns=""`, meaning `xmlns=""` on `node` +/// would actively undeclare the in-scope default namespace. pub(crate) fn has_in_scope_default_namespace(node: Node) -> bool { - // Walk ancestors to find any default namespace declaration. + // Walk ancestors to find the nearest default namespace declaration. let mut current = node.parent(); while let Some(n) = current { if n.is_element() { diff --git a/src/lib.rs b/src/lib.rs index aa900e6..9d273c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,15 +12,18 @@ //! ## Quick Start //! //! ```rust +//! # fn main() -> Result<(), Box> { //! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; //! //! let xml = b""; //! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); -//! let canonical = canonicalize_xml(xml, &algo).unwrap(); +//! let canonical = canonicalize_xml(xml, &algo)?; //! assert_eq!( -//! String::from_utf8(canonical).unwrap(), +//! String::from_utf8(canonical)?, //! "" //! ); +//! # Ok(()) +//! # } //! ``` #![deny(unsafe_code)] From 9965969f35f7d14bebb4d80900a9c3ee2329f06f Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Mon, 16 Mar 2026 01:28:03 +0200 Subject: [PATCH 11/11] fix(c14n): treat default ns as visibly utilized for unprefixed elements In exclusive C14N, an unprefixed element relies on the current default-namespace binding (including xmlns="" undeclaration). Without marking the default namespace as visibly utilized, xmlns="" would not be emitted, placing the element in the wrong namespace context. --- src/c14n/ns_exclusive.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs index dc04b36..a2e562d 100644 --- a/src/c14n/ns_exclusive.rs +++ b/src/c14n/ns_exclusive.rs @@ -96,8 +96,11 @@ fn visibly_utilized_prefixes<'a>(node: Node<'a, '_>) -> HashSet<&'a str> { let el_prefix = element_prefix(node); if !el_prefix.is_empty() { utilized.insert(el_prefix); - } else if node.tag_name().namespace().is_some() { - // Unprefixed element with namespace = default namespace utilized. + } else { + // Unprefixed element relies on the current default-namespace binding + // (including xmlns="" undeclaration), so the default namespace is + // visibly utilized. This ensures xmlns="" is emitted when needed + // to undeclare an inherited default namespace in exclusive C14N. utilized.insert(""); } @@ -170,4 +173,16 @@ mod tests { r#""# ); } + + #[test] + fn unprefixed_element_undeclares_default_ns() { + // child undeclares default ns with xmlns="". In exclusive C14N, + // the default namespace must be visibly utilized so xmlns="" is emitted. + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + assert!( + result.contains(r#""#), + "xmlns=\"\" must be emitted for undeclaration. Got: {result}" + ); + } }