diff --git a/.gitignore b/.gitignore index b1c081e..5716ccf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ Cargo.lock .DS_Store donors/ .refs/ +arch/ +ROADMAP.md diff --git a/Cargo.toml b/Cargo.toml index 42b7c6d..ebdbbbd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,13 +14,13 @@ readme = "README.md" [dependencies] # XML parsing -roxmltree = "0.20" +roxmltree = { version = "0.21", features = ["positions"] } # Crypto ring = "0.17" # X.509 certificates -x509-parser = "0.16" +x509-parser = "0.18" der = "0.7" # Error handling diff --git a/src/c14n/escape.rs b/src/c14n/escape.rs new file mode 100644 index 0000000..0586945 --- /dev/null +++ b/src/c14n/escape.rs @@ -0,0 +1,79 @@ +//! Text and attribute value escaping for canonical XML. + +/// Escape text node content for canonical XML. +/// +/// Replaces: `&` → `&`, `<` → `<`, `>` → `>`, `\r` → ` ` +pub(crate) fn escape_text(s: &str, output: &mut Vec) { + for b in s.bytes() { + match b { + b'&' => output.extend_from_slice(b"&"), + b'<' => output.extend_from_slice(b"<"), + b'>' => output.extend_from_slice(b">"), + b'\r' => output.extend_from_slice(b" "), + _ => output.push(b), + } + } +} + +/// Escape attribute value for canonical XML. +/// +/// Replaces: `&` → `&`, `<` → `<`, `"` → `"`, +/// `\t` → ` `, `\n` → ` `, `\r` → ` ` +pub(crate) fn escape_attr(s: &str, output: &mut Vec) { + for b in s.bytes() { + match b { + b'&' => output.extend_from_slice(b"&"), + b'<' => output.extend_from_slice(b"<"), + b'"' => output.extend_from_slice(b"""), + b'\t' => output.extend_from_slice(b" "), + b'\n' => output.extend_from_slice(b" "), + b'\r' => output.extend_from_slice(b" "), + _ => output.push(b), + } + } +} + +/// Escape only carriage returns in comment/PI content for canonical XML. +/// +/// C14N spec section 2.3: `\r` in comments and PIs → ` ` +pub(crate) fn escape_cr(s: &str, output: &mut Vec) { + for b in s.bytes() { + match b { + b'\r' => output.extend_from_slice(b" "), + _ => output.push(b), + } + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + + #[test] + fn text_escaping() { + let mut out = Vec::new(); + escape_text("a < b & c > d\r\n", &mut out); + assert_eq!( + String::from_utf8(out).expect("valid utf8"), + "a < b & c > d \n" + ); + } + + #[test] + fn attr_escaping() { + let mut out = Vec::new(); + escape_attr("he said \"hi\" & \t\n\r", &mut out); + assert_eq!( + String::from_utf8(out).expect("valid utf8"), + "he said "hi" & " + ); + } + + #[test] + fn passthrough_plain_text() { + let mut out = Vec::new(); + escape_text("hello world", &mut out); + assert_eq!(String::from_utf8(out).expect("valid utf8"), "hello world"); + } +} diff --git a/src/c14n/mod.rs b/src/c14n/mod.rs index 4a65e69..a129011 100644 --- a/src/c14n/mod.rs +++ b/src/c14n/mod.rs @@ -1,23 +1,265 @@ //! XML Canonicalization (C14N). //! //! Implements: -//! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) -//! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) -//! - [Exclusive XML Canonicalization](https://www.w3.org/TR/xml-exc-c14n/) +//! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) (inclusive) +//! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) — URI parsing only; +//! canonicalization returns `UnsupportedAlgorithm` (1.1-specific rules not yet implemented) +//! - [Exclusive XML Canonicalization 1.0](https://www.w3.org/TR/xml-exc-c14n/) (exclusive) +//! +//! # Example +//! +//! ``` +//! # fn main() -> Result<(), Box> { +//! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; +//! +//! let xml = b""; +//! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); +//! let canonical = canonicalize_xml(xml, &algo)?; +//! assert_eq!( +//! String::from_utf8(canonical)?, +//! "" +//! ); +//! # Ok(()) +//! # } +//! ``` + +mod escape; +pub(crate) mod ns_exclusive; +pub(crate) mod ns_inclusive; +mod prefix; +pub(crate) mod serialize; -/// Canonicalization algorithm selection. +use std::collections::HashSet; + +use roxmltree::{Document, Node}; + +use ns_exclusive::ExclusiveNsRenderer; +use ns_inclusive::InclusiveNsRenderer; +use serialize::serialize_canonical; + +/// C14N algorithm mode (without the comments flag). #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum C14nAlgorithm { - /// Canonical XML 1.0 (with comments). - Inclusive10WithComments, - /// Canonical XML 1.0 (without comments). - Inclusive10, - /// Canonical XML 1.1 (with comments). - Inclusive11WithComments, - /// Canonical XML 1.1 (without comments). - Inclusive11, - /// Exclusive Canonical XML (with comments). - Exclusive10WithComments, - /// Exclusive Canonical XML (without comments). - Exclusive10, +pub enum C14nMode { + /// Inclusive C14N 1.0 — all in-scope namespaces rendered. + Inclusive1_0, + /// Inclusive C14N 1.1 — like 1.0 with xml:id propagation and xml:base fixup. + Inclusive1_1, + /// Exclusive C14N 1.0 — only visibly-utilized namespaces rendered. + Exclusive1_0, +} + +/// Full C14N algorithm identifier. +/// +/// Constructed from algorithm URIs found in `` or +/// `` elements. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct C14nAlgorithm { + mode: C14nMode, + with_comments: bool, + /// For Exclusive C14N: prefixes forced via InclusiveNamespaces PrefixList. + /// `"#default"` is normalized to `""` (empty string) by `with_prefix_list()`. + inclusive_prefixes: HashSet, +} + +impl C14nAlgorithm { + /// The canonicalization mode. + pub fn mode(&self) -> C14nMode { + self.mode + } + + /// Whether comment nodes are preserved. + pub fn with_comments(&self) -> bool { + self.with_comments + } + + /// Prefixes forced via InclusiveNamespaces PrefixList (exclusive C14N). + pub fn inclusive_prefixes(&self) -> &HashSet { + &self.inclusive_prefixes + } + + /// Create a new algorithm with the given mode and comments flag. + pub fn new(mode: C14nMode, with_comments: bool) -> Self { + Self { + mode, + with_comments, + inclusive_prefixes: HashSet::new(), + } + } + + /// Parse from an algorithm URI. Returns `None` for unrecognized URIs. + pub fn from_uri(uri: &str) -> Option { + let (mode, with_comments) = match uri { + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315" => (C14nMode::Inclusive1_0, false), + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments" => { + (C14nMode::Inclusive1_0, true) + } + "http://www.w3.org/2006/12/xml-c14n11" => (C14nMode::Inclusive1_1, false), + "http://www.w3.org/2006/12/xml-c14n11#WithComments" => (C14nMode::Inclusive1_1, true), + "http://www.w3.org/2001/10/xml-exc-c14n#" => (C14nMode::Exclusive1_0, false), + "http://www.w3.org/2001/10/xml-exc-c14n#WithComments" => (C14nMode::Exclusive1_0, true), + _ => return None, + }; + Some(Self { + mode, + with_comments, + inclusive_prefixes: HashSet::new(), + }) + } + + /// Set the InclusiveNamespaces PrefixList (exclusive C14N only). + /// `"#default"` is normalized to empty string `""`. + /// + /// Only meaningful for [`C14nMode::Exclusive1_0`]. For inclusive modes, + /// the prefix list is ignored during canonicalization. + pub fn with_prefix_list(mut self, prefix_list: &str) -> Self { + self.inclusive_prefixes = prefix_list + .split_whitespace() + .map(|p| { + if p == "#default" { + String::new() + } else { + p.to_string() + } + }) + .collect(); + self + } + + /// Get the algorithm URI for this configuration. + pub fn uri(&self) -> &'static str { + match (self.mode, self.with_comments) { + (C14nMode::Inclusive1_0, false) => "http://www.w3.org/TR/2001/REC-xml-c14n-20010315", + (C14nMode::Inclusive1_0, true) => { + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments" + } + (C14nMode::Inclusive1_1, false) => "http://www.w3.org/2006/12/xml-c14n11", + (C14nMode::Inclusive1_1, true) => "http://www.w3.org/2006/12/xml-c14n11#WithComments", + (C14nMode::Exclusive1_0, false) => "http://www.w3.org/2001/10/xml-exc-c14n#", + (C14nMode::Exclusive1_0, true) => "http://www.w3.org/2001/10/xml-exc-c14n#WithComments", + } + } +} + +/// Error type for C14N operations. +#[derive(Debug, thiserror::Error)] +pub enum C14nError { + /// XML parsing error. + #[error("XML parse error: {0}")] + Parse(String), + /// Invalid node reference. + #[error("invalid node reference")] + InvalidNode, + /// Algorithm not yet implemented. + #[error("unsupported algorithm: {0}")] + UnsupportedAlgorithm(String), + /// I/O error. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), +} + +/// Canonicalize an XML document or document subset. +/// +/// - `doc`: parsed roxmltree document (read-only DOM). +/// - `node_set`: optional predicate controlling which nodes appear in output. +/// `None` means the entire document. +/// - `algo`: algorithm parameters (mode, comments, prefix list). +/// - `output`: byte buffer receiving canonical XML. +pub fn canonicalize( + doc: &Document, + node_set: Option<&dyn Fn(Node) -> bool>, + algo: &C14nAlgorithm, + output: &mut Vec, +) -> Result<(), C14nError> { + match algo.mode { + C14nMode::Inclusive1_0 => { + let renderer = InclusiveNsRenderer; + serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) + } + // C14N 1.1 has observable differences (xml:id propagation, xml:base fixup) + // that are not yet implemented. Fail explicitly rather than silently + // producing 1.0 output. + C14nMode::Inclusive1_1 => Err(C14nError::UnsupportedAlgorithm( + "C14N 1.1 is not yet implemented".to_string(), + )), + C14nMode::Exclusive1_0 => { + let renderer = ExclusiveNsRenderer::new(&algo.inclusive_prefixes); + serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) + } + } +} + +/// Convenience: parse XML bytes and canonicalize the whole document. +/// +/// Input must be valid UTF-8 (XML 1.0 documents are UTF-8 or declare their +/// encoding; roxmltree only accepts UTF-8). Returns `C14nError::Parse` for +/// invalid UTF-8 or malformed XML. +pub fn canonicalize_xml(xml: &[u8], algo: &C14nAlgorithm) -> Result, C14nError> { + let xml_str = + std::str::from_utf8(xml).map_err(|e| C14nError::Parse(format!("invalid UTF-8: {e}")))?; + let doc = Document::parse(xml_str).map_err(|e| C14nError::Parse(e.to_string()))?; + let mut output = Vec::new(); + canonicalize(&doc, None, algo, &mut output)?; + Ok(output) +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + + #[test] + fn from_uri_roundtrip() { + let uris = [ + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315", + "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments", + "http://www.w3.org/2006/12/xml-c14n11", + "http://www.w3.org/2006/12/xml-c14n11#WithComments", + "http://www.w3.org/2001/10/xml-exc-c14n#", + "http://www.w3.org/2001/10/xml-exc-c14n#WithComments", + ]; + for uri in uris { + let algo = C14nAlgorithm::from_uri(uri).expect(uri); + assert_eq!(algo.uri(), uri); + } + } + + #[test] + fn unknown_uri_returns_none() { + assert!(C14nAlgorithm::from_uri("http://example.com/unknown").is_none()); + } + + #[test] + fn prefix_list_parsing() { + let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false) + .with_prefix_list("foo bar #default baz"); + assert!(algo.inclusive_prefixes.contains("foo")); + assert!(algo.inclusive_prefixes.contains("bar")); + assert!(algo.inclusive_prefixes.contains("baz")); + assert!(algo.inclusive_prefixes.contains("")); // #default → "" + assert_eq!(algo.inclusive_prefixes.len(), 4); + } + + #[test] + fn canonicalize_xml_basic() { + let xml = b""; + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + let result = canonicalize_xml(xml, &algo).expect("c14n"); + assert_eq!( + String::from_utf8(result).expect("utf8"), + r#""# + ); + } + + #[test] + fn c14n_1_1_returns_error() { + let xml = b""; + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_1, false); + let result = canonicalize_xml(xml, &algo); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + matches!(err, C14nError::UnsupportedAlgorithm(_)), + "expected UnsupportedAlgorithm, got: {err:?}" + ); + } } diff --git a/src/c14n/ns_exclusive.rs b/src/c14n/ns_exclusive.rs new file mode 100644 index 0000000..a2e562d --- /dev/null +++ b/src/c14n/ns_exclusive.rs @@ -0,0 +1,188 @@ +//! Exclusive namespace rendering for Exclusive C14N 1.0. +//! +//! In exclusive mode, only **visibly utilized** namespace prefixes are emitted +//! on each element (plus any prefixes forced by the InclusiveNamespaces PrefixList). + +use std::collections::{HashMap, HashSet}; + +use roxmltree::Node; + +use super::prefix::{attribute_prefix, element_prefix, has_in_scope_default_namespace}; +use super::serialize::NsRenderer; + +/// Exclusive C14N namespace renderer. +/// +/// Only emits namespace declarations for prefixes that are visibly utilized +/// by the element's tag name or attributes, plus any forced prefixes from +/// the `InclusiveNamespaces PrefixList`. +pub(crate) struct ExclusiveNsRenderer<'a> { + inclusive_prefixes: &'a HashSet, +} + +impl<'a> ExclusiveNsRenderer<'a> { + pub(crate) fn new(inclusive_prefixes: &'a HashSet) -> Self { + Self { inclusive_prefixes } + } +} + +impl NsRenderer for ExclusiveNsRenderer<'_> { + fn render_namespaces<'n>( + &self, + node: Node<'n, '_>, + parent_rendered: &HashMap, + ) -> (Vec<(String, String)>, HashMap) { + let utilized = visibly_utilized_prefixes(node); + // Clone is O(n) per element. Acceptable for typical XML depths (<20 levels). + // Optimization: pass &mut and restore on backtrack — deferred to perf phase. + let mut rendered = parent_rendered.clone(); + let mut decls: Vec<(String, String)> = Vec::new(); + + for ns in node.namespaces() { + let prefix = ns.name().unwrap_or(""); + let uri = ns.uri(); + + // The `xml` prefix is never declared. + if prefix == "xml" { + continue; + } + + // Include if visibly utilized OR in the forced prefix list. + let dominated = utilized.contains(prefix) || self.inclusive_prefixes.contains(prefix); + if !dominated { + continue; + } + + // Only render if different from nearest output ancestor. + if parent_rendered.get(prefix).map(|s| s.as_str()) == Some(uri) { + continue; + } + + // Suppress xmlns="" when no non-empty default namespace is in scope. + // Check source tree for subsets where output ancestors may be absent. + if prefix.is_empty() && uri.is_empty() { + let has_rendered_default = parent_rendered.contains_key(""); + let has_in_scope_default = has_in_scope_default_namespace(node); + if !has_rendered_default && !has_in_scope_default { + continue; + } + } + + decls.push((prefix.to_string(), uri.to_string())); + } + + // Sort by prefix. + decls.sort_by(|a, b| a.0.cmp(&b.0)); + + for (prefix, uri) in &decls { + rendered.insert(prefix.clone(), uri.clone()); + } + + (decls, rendered) + } +} + +/// Determine which namespace prefixes are visibly utilized by an element. +/// +/// A prefix is visibly utilized if: +/// 1. The element's tag name uses that prefix, OR +/// 2. Any attribute on the element uses that prefix. +/// +/// Uses lexical prefixes extracted from source XML byte positions, +/// avoiding ambiguity when multiple prefixes bind the same namespace URI. +fn visibly_utilized_prefixes<'a>(node: Node<'a, '_>) -> HashSet<&'a str> { + let mut utilized = HashSet::new(); + + // Element's own lexical prefix from source XML. + let el_prefix = element_prefix(node); + if !el_prefix.is_empty() { + utilized.insert(el_prefix); + } else { + // Unprefixed element relies on the current default-namespace binding + // (including xmlns="" undeclaration), so the default namespace is + // visibly utilized. This ensures xmlns="" is emitted when needed + // to undeclare an inherited default namespace in exclusive C14N. + utilized.insert(""); + } + + // Attribute lexical prefixes from source XML. + for attr in node.attributes() { + let attr_prefix = attribute_prefix(node, &attr); + if !attr_prefix.is_empty() { + utilized.insert(attr_prefix); + } + } + + utilized +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::super::serialize::serialize_canonical; + use super::*; + use roxmltree::Document; + use std::collections::HashSet; + + fn exc_c14n(xml: &str, prefix_list: &HashSet) -> String { + let doc = Document::parse(xml).expect("parse"); + let renderer = ExclusiveNsRenderer::new(prefix_list); + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + String::from_utf8(out).expect("utf8") + } + + #[test] + fn only_utilized_ns_rendered() { + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + // root uses no prefix → no ns decls on root. + // a:child uses a: → xmlns:a on child. + // b: is not utilized anywhere → not rendered. + assert!(!result.contains("xmlns:b")); + assert!(result.contains(r#""#)); + } + + #[test] + fn forced_prefix_via_prefix_list() { + let xml = r#""#; + let mut forced = HashSet::new(); + forced.insert("b".to_string()); + let result = exc_c14n(xml, &forced); + // b: is forced via PrefixList → should appear on root. + assert!(result.contains(r#"xmlns:b="http://b.com""#)); + } + + #[test] + fn sibling_elements_redeclare() { + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + // In exclusive mode, each sibling must independently declare a:. + assert!(result.contains(r#""#)); + assert!(result.contains(r#""#)); + } + + #[test] + fn default_ns_utilized() { + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + // Both root and child use the default ns. + assert!(result.contains(r#""#)); + // child inherits → parent already rendered → not redeclared. + assert_eq!( + result, + r#""# + ); + } + + #[test] + fn unprefixed_element_undeclares_default_ns() { + // child undeclares default ns with xmlns="". In exclusive C14N, + // the default namespace must be visibly utilized so xmlns="" is emitted. + let xml = r#""#; + let result = exc_c14n(xml, &HashSet::new()); + assert!( + result.contains(r#""#), + "xmlns=\"\" must be emitted for undeclaration. Got: {result}" + ); + } +} diff --git a/src/c14n/ns_inclusive.rs b/src/c14n/ns_inclusive.rs new file mode 100644 index 0000000..90780a4 --- /dev/null +++ b/src/c14n/ns_inclusive.rs @@ -0,0 +1,156 @@ +//! Inclusive namespace rendering for C14N 1.0. +//! +//! In inclusive mode, all in-scope namespace declarations are rendered on each +//! element unless the same binding was already emitted by the nearest output +//! ancestor (suppressing redundant redeclarations). Unlike exclusive C14N, +//! namespaces are rendered even if not visibly used by the element's tag or +//! attributes. + +use std::collections::HashMap; + +use roxmltree::Node; + +use super::prefix::has_in_scope_default_namespace; +use super::serialize::NsRenderer; + +/// Inclusive C14N namespace renderer. +/// +/// Emits all in-scope namespace bindings that differ from what the nearest +/// output ancestor already rendered. +pub(crate) struct InclusiveNsRenderer; + +impl NsRenderer for InclusiveNsRenderer { + fn render_namespaces<'a>( + &self, + node: Node<'a, '_>, + parent_rendered: &HashMap, + ) -> (Vec<(String, String)>, HashMap) { + // Clone is O(n) per element. Acceptable for typical XML depths (<20 levels). + // Optimization: pass &mut and restore on backtrack — deferred to perf phase. + let mut rendered = parent_rendered.clone(); + let mut decls: Vec<(String, String)> = Vec::new(); + + // Collect all in-scope namespace declarations from roxmltree. + // node.namespaces() yields every ns declared on this element or + // inherited from ancestors. + for ns in node.namespaces() { + let prefix = ns.name().unwrap_or(""); // None = default namespace + let uri = ns.uri(); + + // The `xml` prefix namespace is never declared in canonical XML. + if prefix == "xml" { + continue; + } + + // Emit only if different from what the nearest output ancestor rendered. + if parent_rendered.get(prefix).map(|u| u.as_str()) == Some(uri) { + continue; + } + + // Suppress xmlns="" when no non-empty default namespace is in scope. + // Check the source tree (not just parent_rendered) to handle document + // subsets where output ancestors may be absent. + if prefix.is_empty() && uri.is_empty() { + // Only emit xmlns="" if there's actually a default ns to undeclare. + // parent_rendered tracks what output ancestors emitted; but for subsets, + // a non-output ancestor may have declared xmlns="uri" which needs + // undeclaring. Fall back to source tree inspection. + let has_rendered_default = parent_rendered.contains_key(""); + let has_in_scope_default = has_in_scope_default_namespace(node); + if !has_rendered_default && !has_in_scope_default { + continue; + } + } + + decls.push((prefix.to_string(), uri.to_string())); + } + + // Sort namespace declarations by prefix (lexicographic, "" sorts first). + decls.sort_by(|a, b| a.0.cmp(&b.0)); + + // Update the rendered map. + for (prefix, uri) in &decls { + rendered.insert(prefix.clone(), uri.clone()); + } + + (decls, rendered) + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::super::serialize::serialize_canonical; + use super::*; + use roxmltree::Document; + + #[test] + fn namespaces_rendered_on_first_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // Default ns and a: ns should appear on root. + assert!(result.contains(r#"xmlns="http://example.com""#)); + assert!(result.contains(r#"xmlns:a="http://a.com""#)); + } + + #[test] + fn inherited_ns_not_redeclared() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // xmlns:a should appear only once (on root), not redeclared on child. + assert_eq!( + result, + r#""# + ); + } + + #[test] + fn overridden_ns_is_redeclared() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // child should redeclare a: with different URI. + assert!(result.contains(r#""#)); + } + + #[test] + fn default_ns_undeclared() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // child should have xmlns="" to undeclare the default namespace. + assert!(result.contains(r#""#)); + } + + #[test] + fn ns_decls_sorted_by_prefix() { + let xml = + r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + let result = String::from_utf8(out).expect("utf8"); + // Order should be: xmlns="..." (default), xmlns:a, xmlns:z. + // find(r#"xmlns=""#) matches the start of xmlns="http://default.com" + let idx_default = result.find(r#"xmlns=""#).expect("default ns"); + let idx_a = result.find(r#"xmlns:a="#).expect("a ns"); + let idx_z = result.find(r#"xmlns:z="#).expect("z ns"); + assert!(idx_default < idx_a, "default before a"); + assert!(idx_a < idx_z, "a before z"); + } +} diff --git a/src/c14n/prefix.rs b/src/c14n/prefix.rs new file mode 100644 index 0000000..037eec0 --- /dev/null +++ b/src/c14n/prefix.rs @@ -0,0 +1,184 @@ +//! Lexical prefix extraction from roxmltree nodes. +//! +//! roxmltree's DOM stores only `(namespace_uri, local_name)` — the lexical +//! prefix is discarded during parsing. We recover it from the source XML +//! using byte-range positions (`roxmltree` `positions` feature). +//! +//! This avoids ambiguity when multiple prefixes bind the same namespace URI +//! (e.g., `xmlns:a="u" xmlns:b="u"`), where `lookup_prefix()` would return +//! an arbitrary match. + +use roxmltree::{Attribute, Node}; + +/// Extract the lexical prefix of an element from the source XML. +/// +/// Returns `""` for unprefixed elements, or the prefix string (e.g., `"foo"` +/// for ``). +pub(crate) fn element_prefix<'a>(node: Node<'a, '_>) -> &'a str { + let input = node.document().input_text(); + let range = node.range(); + + // range starts at '<', skip it to get to the QName + let tag_start = range.start + 1; + let tag_bytes = input.as_bytes(); + + // Find end of QName: first space, '>', or '/' + let mut qname_end = tag_start; + while qname_end < input.len() { + match tag_bytes[qname_end] { + b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/' => break, + _ => qname_end += 1, + } + } + + let qname = &input[tag_start..qname_end]; + match qname.find(':') { + Some(pos) => &qname[..pos], + None => "", + } +} + +/// Extract the lexical prefix of an attribute from the source XML. +/// +/// Returns `""` for unprefixed attributes, or the prefix string (e.g., `"xml"` +/// for `xml:lang="en"`). +pub(crate) fn attribute_prefix<'a>(node: Node<'a, '_>, attr: &Attribute<'a, '_>) -> &'a str { + let input = node.document().input_text(); + let qname_range = attr.range_qname(); + let qname = &input[qname_range]; + match qname.find(':') { + Some(pos) => &qname[..pos], + None => "", + } +} + +/// Check whether `xmlns=""` on `node` would be meaningful — i.e., whether +/// the in-scope default namespace (as determined by the nearest ancestor +/// `xmlns` declaration) is non-empty and would be undeclared. +/// +/// This is needed for correct `xmlns=""` suppression in C14N document subsets: +/// when output ancestors are absent, `parent_rendered` alone cannot determine +/// whether `xmlns=""` is meaningful. We walk the source tree ancestors to +/// find the effective in-scope default namespace. +/// +/// Returns `true` if the nearest ancestor element with a default-namespace +/// declaration has `xmlns=""`, meaning `xmlns=""` on `node` +/// would actively undeclare the in-scope default namespace. +pub(crate) fn has_in_scope_default_namespace(node: Node) -> bool { + // Walk ancestors to find the nearest default namespace declaration. + let mut current = node.parent(); + while let Some(n) = current { + if n.is_element() { + for ns in n.namespaces() { + if ns.name().is_none() { + // Found a default namespace on an ancestor. + // Non-empty URI means xmlns="" would undeclare it. + return !ns.uri().is_empty(); + } + } + } + current = n.parent(); + } + false +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + use roxmltree::Document; + + #[test] + fn prefixed_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + assert_eq!(element_prefix(root), "foo"); + } + + #[test] + fn unprefixed_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + assert_eq!(element_prefix(root), ""); + } + + #[test] + fn nested_prefixed_element() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let child = doc.root_element().first_element_child().expect("child"); + assert_eq!(element_prefix(child), "b"); + } + + #[test] + fn prefixed_attribute() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let attr = root.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(root, &attr), "ns"); + } + + #[test] + fn unprefixed_attribute() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let attr = root.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(root, &attr), ""); + } + + #[test] + fn xml_lang_attribute() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let attr = root.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(root, &attr), "xml"); + } + + #[test] + fn aliased_prefixes_same_uri() { + // Two prefixes bound to same URI — element uses "b", not "a" + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let child = doc.root_element().first_element_child().expect("child"); + assert_eq!(element_prefix(child), "b"); + let attr = child.attributes().next().expect("attr"); + assert_eq!(attribute_prefix(child, &attr), "a"); + } + + #[test] + fn has_default_ns_from_ancestor() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let child = root.first_element_child().expect("child"); + // root has no ancestor with default ns → false + assert!(!has_in_scope_default_namespace(root)); + // child's parent (root) has xmlns="http://example.com" → true + // (child's own xmlns="" means it needs to undeclare it) + assert!(has_in_scope_default_namespace(child)); + } + + #[test] + fn no_default_ns() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let root = doc.root_element(); + let child = root.first_element_child().expect("child"); + assert!(!has_in_scope_default_namespace(root)); + assert!(!has_in_scope_default_namespace(child)); + } + + #[test] + fn inherited_default_ns() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let child = doc.root_element().first_element_child().expect("child"); + // child's parent (root) has xmlns="http://example.com" → true + assert!(has_in_scope_default_namespace(child)); + } +} diff --git a/src/c14n/serialize.rs b/src/c14n/serialize.rs new file mode 100644 index 0000000..35e9dc2 --- /dev/null +++ b/src/c14n/serialize.rs @@ -0,0 +1,401 @@ +//! Document-order serialization for canonical XML. +//! +//! Walks an XML document tree in document order, emitting canonical bytes. +//! Namespace rendering is delegated to the caller via [`NsRenderer`] trait. + +use std::collections::HashMap; + +use roxmltree::{Document, Node, NodeType}; + +use super::escape::{escape_attr, escape_cr, escape_text}; +use super::prefix::{attribute_prefix, element_prefix}; +use super::C14nError; + +/// Trait for namespace rendering strategies (inclusive vs exclusive). +pub(crate) trait NsRenderer { + /// Compute namespace declarations to emit for this element. + /// Returns (sorted_ns_decls, updated_rendered_map). + /// + /// `parent_rendered` maps prefix → URI for what the nearest output ancestor + /// already declared in the canonical form. + fn render_namespaces<'a>( + &self, + node: Node<'a, '_>, + parent_rendered: &HashMap, + ) -> (Vec<(String, String)>, HashMap); +} + +/// Canonicalize a document (or subset) to the output buffer. +/// +/// - `doc`: parsed roxmltree document +/// - `node_set`: optional predicate — if `Some`, only nodes where predicate +/// returns `true` are included in the output +/// - `with_comments`: whether to preserve comment nodes +/// - `ns_renderer`: namespace rendering strategy +/// - `output`: destination buffer +pub(crate) fn serialize_canonical( + doc: &Document, + node_set: Option<&dyn Fn(Node) -> bool>, + with_comments: bool, + ns_renderer: &dyn NsRenderer, + output: &mut Vec, +) -> Result<(), C14nError> { + let root = doc.root(); + serialize_children( + root, + node_set, + with_comments, + ns_renderer, + &HashMap::new(), + output, + ); + Ok(()) +} + +/// Serialize children of a node in document order. +fn serialize_children( + parent: Node, + node_set: Option<&dyn Fn(Node) -> bool>, + with_comments: bool, + ns_renderer: &dyn NsRenderer, + parent_rendered: &HashMap, + output: &mut Vec, +) { + let is_doc_root = parent.node_type() == NodeType::Root; + + for child in parent.children() { + // Node-set filtering: skip nodes not in the set. + let in_set = node_set.map_or(true, |pred| pred(child)); + + match child.node_type() { + NodeType::Element => { + if in_set { + // Before root element: emit \n separator if there was output before + // (e.g., a preceding comment/PI). Note: when node_set excludes the + // root element but includes preceding comments, those comments won't + // get a trailing \n — this is acceptable because document subsets + // that exclude the root element are only used with XPath transforms, + // which are not yet implemented. + if is_doc_root && !output.is_empty() { + output.push(b'\n'); + } + serialize_element( + child, + node_set, + with_comments, + ns_renderer, + parent_rendered, + output, + ); + } else { + // Element not in set, but descendants might be — walk children. + serialize_children( + child, + node_set, + with_comments, + ns_renderer, + parent_rendered, + output, + ); + } + } + NodeType::Text => { + if in_set { + // Document-level text nodes are ignored by C14N. + // Only text inside elements is serialized. + if !is_doc_root { + if let Some(text) = child.text() { + escape_text(text, output); + } + } + } + } + NodeType::Comment => { + if with_comments && in_set { + if is_doc_root { + write_doc_level_separator(&child, output); + } + output.extend_from_slice(b""); + } + } + NodeType::PI => { + if in_set { + if let Some(pi) = child.pi() { + if is_doc_root { + write_doc_level_separator(&child, output); + } + output.extend_from_slice(b""); + } + } + } + NodeType::Root => { + // Should not happen as a child. + } + } + } +} + +/// Serialize a single element node (start tag + children + end tag). +fn serialize_element( + node: Node, + node_set: Option<&dyn Fn(Node) -> bool>, + with_comments: bool, + ns_renderer: &dyn NsRenderer, + parent_rendered: &HashMap, + output: &mut Vec, +) { + let (ns_decls, rendered) = ns_renderer.render_namespaces(node, parent_rendered); + + // Start tag: = node.attributes().collect(); + attrs.sort_by(|a, b| { + let a_key = (a.namespace().unwrap_or(""), a.name()); + let b_key = (b.namespace().unwrap_or(""), b.name()); + a_key.cmp(&b_key) + }); + + for attr in &attrs { + output.push(b' '); + write_attribute_name(node, attr, output); + output.extend_from_slice(b"=\""); + escape_attr(attr.value(), output); + output.push(b'"'); + } + + // Always use form, never self-closing. + output.push(b'>'); + + // Children. + serialize_children( + node, + node_set, + with_comments, + ns_renderer, + &rendered, + output, + ); + + // End tag. + output.extend_from_slice(b"'); +} + +/// Write `\n` separator for document-level nodes. +/// +/// C14N spec: document-level comments and PIs get `\n` between them and the +/// root element. Specifically: +/// - Before root element: comment/PI followed by `\n` +/// - After root element: `\n` followed by comment/PI +/// +/// This function emits `\n` either before or after the node, depending on +/// whether the root element has already been emitted. +/// +/// Limitation: when `node_set` excludes the root element, the `\n` logic may +/// be incorrect for preceding comments/PIs. This only affects XPath-selected +/// subsets that exclude the root — not relevant for SAML enveloped signatures. +fn write_doc_level_separator(node: &Node, output: &mut Vec) { + let root_elem_seen = has_preceding_element_sibling(node); + if root_elem_seen { + // After root element: \n before this node. + output.push(b'\n'); + } else if !output.is_empty() { + // Before root element but not first output: \n after previous node. + output.push(b'\n'); + } +} + +/// Check if a node has a preceding sibling that is an element. +fn has_preceding_element_sibling(node: &Node) -> bool { + let mut prev = node.prev_sibling(); + while let Some(p) = prev { + if p.is_element() { + return true; + } + prev = p.prev_sibling(); + } + false +} + +/// Write the qualified name (prefix:localname or just localname) of an element. +/// +/// Extracts the lexical prefix from the source XML via byte-range positions, +/// avoiding ambiguity when multiple prefixes bind the same namespace URI. +fn write_qualified_name(node: Node, output: &mut Vec) { + let prefix = element_prefix(node); + if !prefix.is_empty() { + output.extend_from_slice(prefix.as_bytes()); + output.push(b':'); + } + output.extend_from_slice(node.tag_name().name().as_bytes()); +} + +/// Write an attribute's qualified name (prefix:localname or just localname). +/// +/// Extracts the lexical prefix from the source XML via byte-range positions. +fn write_attribute_name(element: Node, attr: &roxmltree::Attribute, output: &mut Vec) { + let prefix = attribute_prefix(element, attr); + if !prefix.is_empty() { + output.extend_from_slice(prefix.as_bytes()); + output.push(b':'); + } + output.extend_from_slice(attr.name().as_bytes()); +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::super::ns_inclusive::InclusiveNsRenderer; + use super::*; + + #[test] + fn empty_element_expanded() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "" + ); + } + + #[test] + fn text_preserved() { + let xml = " hello & world "; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + " hello & world " + ); + } + + #[test] + fn comments_stripped_by_default() { + let xml = "text"; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!(String::from_utf8(out).expect("utf8"), "text"); + } + + #[test] + fn comments_preserved_with_flag() { + let xml = "text"; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, true, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "text" + ); + } + + #[test] + fn attribute_sorting() { + let xml = r#""#; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + r#""# + ); + } + + #[test] + fn pi_serialization() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + // XML declaration is omitted by roxmltree parsing. + // PI inside root is preserved. + assert_eq!( + String::from_utf8(out).expect("utf8"), + "" + ); + } + + #[test] + fn nested_elements_document_order() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "" + ); + } + + #[test] + fn document_level_comments() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, true, &renderer, &mut out).expect("c14n"); + // C14N spec: \n between document-level nodes. + // Before root: comment + \n + root + // After root: root + \n + comment + assert_eq!( + String::from_utf8(out).expect("utf8"), + "\n\n" + ); + } + + #[test] + fn document_level_pi_before_root() { + let xml = ""; + let doc = Document::parse(xml).expect("parse"); + let renderer = InclusiveNsRenderer; + let mut out = Vec::new(); + serialize_canonical(&doc, None, false, &renderer, &mut out).expect("c14n"); + assert_eq!( + String::from_utf8(out).expect("utf8"), + "\n" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 992babe..9d273c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,13 +11,19 @@ //! //! ## Quick Start //! -//! ```rust,no_run -//! use xml_sec::{XmlSigner, XmlVerifier}; +//! ```rust +//! # fn main() -> Result<(), Box> { +//! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; //! -//! // Verify a signed XML document -//! let doc = std::fs::read_to_string("signed.xml").unwrap(); -//! let cert = std::fs::read("cert.pem").unwrap(); -//! let valid = XmlVerifier::new(&cert).verify(&doc).unwrap(); +//! let xml = b""; +//! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); +//! let canonical = canonicalize_xml(xml, &algo)?; +//! assert_eq!( +//! String::from_utf8(canonical)?, +//! "" +//! ); +//! # Ok(()) +//! # } //! ``` #![deny(unsafe_code)] diff --git a/tests/c14n_integration.rs b/tests/c14n_integration.rs new file mode 100644 index 0000000..c05956a --- /dev/null +++ b/tests/c14n_integration.rs @@ -0,0 +1,418 @@ +//! Integration tests for C14N against xmllint reference outputs. +//! +//! Reference outputs generated by: `xmllint --c14n ` and `xmllint --exc-c14n ` +//! using libxml2 (system xmllint). These are the authoritative C14N implementations. + +use xml_sec::c14n::{canonicalize, canonicalize_xml, C14nAlgorithm, C14nMode}; + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +fn assert_c14n(xml: &[u8], mode: C14nMode, with_comments: bool, expected: &str) { + let algo = C14nAlgorithm::new(mode, with_comments); + let result = canonicalize_xml(xml, &algo).expect("canonicalize failed"); + let result_str = String::from_utf8(result).expect("invalid utf8"); + assert_eq!( + result_str, expected, + "\n--- GOT ---\n{result_str}\n--- EXPECTED ---\n{expected}" + ); +} + +fn assert_exc_c14n(xml: &[u8], prefix_list: &str, expected: &str) { + let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false).with_prefix_list(prefix_list); + let result = canonicalize_xml(xml, &algo).expect("canonicalize failed"); + let result_str = String::from_utf8(result).expect("invalid utf8"); + assert_eq!( + result_str, expected, + "\n--- GOT ---\n{result_str}\n--- EXPECTED ---\n{expected}" + ); +} + +// ─── Simple namespace document (inclusive) ────────────────────────────────── +// Reference output verified against: xmllint --c14n + +const SIMPLE_NS_XML: &[u8] = + br#" + + + + +"#; + +#[test] +fn simple_ns_inclusive() { + // Reference from xmllint --c14n: + // - ns decls sorted: xmlns= first, then xmlns:a, xmlns:z + // - attrs sorted: a before b (no namespace, lexicographic) + // - child: ns inherited, not redeclared; attrs sorted: plain, z:attr + // - z:other: xmlns:a overridden, must redeclare + assert_c14n( + SIMPLE_NS_XML, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#""#, + ), + ); +} + +#[test] +fn simple_ns_exclusive() { + // Reference from xmllint --exc-c14n: + // - root: only xmlns= (default ns used by root tag) + // - a:child: xmlns:a, xmlns:z (used by a: prefix and z:attr) + // - inner: no ns (no prefix used, inherits default from parent) + // - z:other: xmlns:z only (a: not used by z:other) + assert_exc_c14n( + SIMPLE_NS_XML, + "", + concat!( + r#""#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#" "#, + "\n", + r#""#, + ), + ); +} + +// ─── Merlin-like document ─────────────────────────────────────────────────── +// Structure from merlin-c14n-three/signature.xml (data portion only) +// Reference from xmllint --c14n / --exc-c14n + +const MERLIN_DATA_XML: &[u8] = br#" + + + + + + + + + + + + + + + + +"#; + +#[test] +fn merlin_data_inclusive() { + // All 4 ns prefixes + default ns + xml:lang on root. + // Inner elements inherit everything, no redeclarations. + assert_c14n( + MERLIN_DATA_XML, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "", + ), + ); +} + +#[test] +fn merlin_data_exclusive() { + // Exclusive: each element only declares what it visibly uses. + // foo:Root uses foo: prefix → xmlns:foo + // xml:lang is an attribute, always present on root + // bar:Something uses bar: → xmlns:bar + // foo:Nothing uses foo: → already rendered by parent? No — exc mode + // tracks per output ancestor, foo:Root rendered foo:, so foo:Nothing inherits + // baz:Something uses baz: → xmlns:baz + // Default ns (xmlns="http://example.org/") not used by any element prefix → NOT rendered + assert_exc_c14n( + MERLIN_DATA_XML, + "", + concat!( + r#""#, + "\n", + r#" "#, + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + r#" "#, + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "", + ), + ); +} + +// ─── Attribute sorting with namespaced attributes ─────────────────────────── +// Reference from xmllint --c14n + +const ATTR_SORT_XML: &[u8] = br#" + +"#; + +#[test] +fn attr_sort_inclusive() { + // Attributes sorted by (namespace-uri, local-name): + // - plain (no ns, "") → first + // - ns1:a (http://ns1, a) + // - ns1:b (http://ns1, b) + // - ns2:a (http://ns2, a) + // - ns2:b (http://ns2, b) + assert_c14n( + ATTR_SORT_XML, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + "\n", + r#" "#, + "\n", + "", + ), + ); +} + +#[test] +fn attr_sort_exclusive() { + // In exclusive mode: root uses no ns → no decls on root. + // elem uses ns1: and ns2: → declares both. + assert_exc_c14n( + ATTR_SORT_XML, + "", + concat!( + "\n", + r#" "#, + "\n", + "", + ), + ); +} + +// ─── Comments and Processing Instructions ─────────────────────────────────── + +const COMMENTS_PI_XML: &[u8] = b"\n\n\n \n \n\n"; + +#[test] +fn comments_stripped() { + // Comments are stripped entirely. PI is first output, no leading \n. + // Trailing comment also stripped, no trailing \n. + // The whitespace text node between stripped comment and PI is at + // document level → ignored by C14N. + assert_c14n( + COMMENTS_PI_XML, + C14nMode::Inclusive1_0, + false, + concat!( + "\n", + "\n", + " \n", + " \n", + "", + ), + ); +} + +#[test] +fn comments_preserved() { + assert_c14n( + COMMENTS_PI_XML, + C14nMode::Inclusive1_0, + true, + concat!( + "\n\n", + "\n", + " \n", + " \n", + "\n", + ), + ); +} + +// ─── Idempotency ──────────────────────────────────────────────────────────── + +#[test] +fn idempotency_inclusive() { + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + let pass1 = canonicalize_xml(MERLIN_DATA_XML, &algo).expect("pass1"); + let pass2 = canonicalize_xml(&pass1, &algo).expect("pass2"); + assert_eq!(pass1, pass2, "C14N must be idempotent"); +} + +#[test] +fn idempotency_exclusive() { + let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false); + let pass1 = canonicalize_xml(MERLIN_DATA_XML, &algo).expect("pass1"); + let pass2 = canonicalize_xml(&pass1, &algo).expect("pass2"); + assert_eq!(pass1, pass2, "Exclusive C14N must be idempotent"); +} + +// ─── Empty document edge case ─────────────────────────────────────────────── + +#[test] +fn minimal_document() { + assert_c14n(b"", C14nMode::Inclusive1_0, false, ""); +} + +#[test] +fn cdata_flattened() { + // CDATA sections should be replaced with escaped text content. + assert_c14n( + b"", + C14nMode::Inclusive1_0, + false, + "a < b & c", + ); +} + +// ─── Prefix aliasing (multiple prefixes for same URI) ─────────────────────── + +#[test] +fn aliased_prefixes_exclusive() { + // Two prefixes bound to the same URI. Each element/attribute must preserve + // the lexical prefix actually used in the source, not an arbitrary one + // returned by lookup_prefix(). + let xml = br#""#; + assert_exc_c14n( + xml, + "", + concat!( + "", + r#""#, + "", + ), + ); +} + +#[test] +fn aliased_prefixes_inclusive() { + let xml = br#""#; + assert_c14n( + xml, + C14nMode::Inclusive1_0, + false, + concat!( + r#""#, + r#""#, + "", + ), + ); +} + +// ─── Default namespace undeclaration ──────────────────────────────────────── + +#[test] +fn default_ns_undeclaration() { + let xml = br#""#; + assert_c14n( + xml, + C14nMode::Inclusive1_0, + false, + r#""#, + ); +} + +// ─── Document subset: xmlns="" with excluded ancestor ─────────────────────── + +#[test] +fn subset_xmlns_empty_with_excluded_default_ns_ancestor() { + // Scenario: root declares xmlns="http://u", child undeclares with xmlns="". + // Node-set includes only child (root excluded). + // parent_rendered is empty (root wasn't rendered), but child MUST emit + // xmlns="" to undeclare the inherited default namespace from source tree. + let xml = r#""#; + let doc = roxmltree::Document::parse(xml).expect("parse"); + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + + let child_id = doc + .root_element() + .first_element_child() + .expect("child") + .id(); + + let mut output = Vec::new(); + canonicalize( + &doc, + Some(&|node| node.id() == child_id), + &algo, + &mut output, + ) + .expect("c14n"); + + let result = String::from_utf8(output).expect("utf8"); + // child must emit xmlns="" even though parent_rendered has no default ns, + // because the source tree has xmlns="http://example.com" in scope. + assert!( + result.contains(r#"xmlns="""#), + "xmlns=\"\" must be emitted to undeclare inherited default ns. Got: {result}" + ); +} + +#[test] +fn subset_no_spurious_xmlns_empty() { + // Scenario: no default namespace anywhere in document. + // Node-set includes only child. xmlns="" must NOT appear. + let xml = r#""#; + let doc = roxmltree::Document::parse(xml).expect("parse"); + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + + let child_id = doc + .root_element() + .first_element_child() + .expect("child") + .id(); + + let mut output = Vec::new(); + canonicalize( + &doc, + Some(&|node| node.id() == child_id), + &algo, + &mut output, + ) + .expect("c14n"); + + let result = String::from_utf8(output).expect("utf8"); + assert!( + !result.contains("xmlns="), + "xmlns=\"\" must NOT appear when no default ns in scope. Got: {result}" + ); +}