diff --git a/src/xmldsig/mod.rs b/src/xmldsig/mod.rs index 04a4156..c68dd30 100644 --- a/src/xmldsig/mod.rs +++ b/src/xmldsig/mod.rs @@ -1,3 +1,14 @@ //! XML Digital Signatures (XMLDSig). //! //! Implements [XML Signature Syntax and Processing](https://www.w3.org/TR/xmldsig-core1/). +//! +//! ## Current Status +//! +//! - URI dereference: same-document references (`""`, `#id`, `#xpointer(/)`, `#xpointer(id('...'))`) +//! - ID attribute resolution with configurable attribute names +//! - Node set types for the transform pipeline + +pub mod types; +pub mod uri; + +pub use types::{NodeSet, TransformData, TransformError}; diff --git a/src/xmldsig/types.rs b/src/xmldsig/types.rs new file mode 100644 index 0000000..8513bd6 --- /dev/null +++ b/src/xmldsig/types.rs @@ -0,0 +1,227 @@ +//! Core types for the XMLDSig transform pipeline. +//! +//! These types flow between URI dereference, transforms, and digest computation. +//! +//! These types are consumed by URI dereference, the transform chain (P1-014, +//! P1-015), and reference processing (P1-018). + +use std::collections::HashSet; + +use roxmltree::{Document, Node, NodeId}; + +// roxmltree 0.21 uses `Node<'a, 'input: 'a>`. We tie both lifetimes together +// with a single `'a` by requiring `'input = 'a` at every use site (`Node<'a, 'a>`). +// This is safe because our NodeSet borrows the Document which owns the input. + +/// Data flowing between transforms in the verification/signing pipeline. +/// +/// Transforms consume and produce either a node set (XML-level) or raw bytes +/// (after canonicalization or base64 decode). +pub enum TransformData<'a> { + /// A set of nodes from the parsed XML document. + NodeSet(NodeSet<'a>), + /// Raw bytes (e.g., after canonicalization). + Binary(Vec), +} + +impl std::fmt::Debug for TransformData<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NodeSet(_) => f.debug_tuple("NodeSet").field(&"...").finish(), + Self::Binary(b) => f.debug_tuple("Binary").field(&b.len()).finish(), + } + } +} + +impl<'a> TransformData<'a> { + /// Convert to `NodeSet`, returning an error if this is `Binary` data. + pub fn into_node_set(self) -> Result, TransformError> { + match self { + Self::NodeSet(ns) => Ok(ns), + Self::Binary(_) => Err(TransformError::TypeMismatch { + expected: "NodeSet", + got: "Binary", + }), + } + } + + /// Convert to binary bytes, returning an error if this is a `NodeSet`. + pub fn into_binary(self) -> Result, TransformError> { + match self { + Self::Binary(b) => Ok(b), + Self::NodeSet(_) => Err(TransformError::TypeMismatch { + expected: "Binary", + got: "NodeSet", + }), + } + } +} + +/// A set of nodes from a roxmltree document. +/// +/// Represents "which nodes are included" for canonicalization and transforms. +/// Two modes: +/// - **Whole document**: `included` is `None`, meaning all nodes are in the set +/// (minus any in `excluded`). +/// - **Subset**: `included` is `Some(ids)`, meaning only those node IDs are in +/// the set (minus any in `excluded`). +pub struct NodeSet<'a> { + /// Reference to the parsed document. + doc: &'a Document<'a>, + /// If `None`, all nodes are included. If `Some`, only these nodes. + included: Option>, + /// Nodes explicitly excluded (e.g., `` subtree for enveloped transform). + excluded: HashSet, + /// Whether comment nodes are included. For empty URI dereference (whole + /// document), comments are excluded per XMLDSig spec. + with_comments: bool, +} + +impl<'a> NodeSet<'a> { + /// Create a node set representing the entire document without comments. + /// + /// Per XMLDSig §4.3.3.2: "An empty URI [...] is a reference to the document + /// [...] and the comment nodes are not included." + pub fn entire_document_without_comments(doc: &'a Document<'a>) -> Self { + Self { + doc, + included: None, + excluded: HashSet::new(), + with_comments: false, + } + } + + /// Create a node set representing the entire document with comments. + /// + /// Used for `#xpointer(/)` which, unlike empty URI, includes comment nodes. + pub fn entire_document_with_comments(doc: &'a Document<'a>) -> Self { + Self { + doc, + included: None, + excluded: HashSet::new(), + with_comments: true, + } + } + + /// Create a node set rooted at `element`, containing that element and all + /// of its descendant nodes (elements, text, and, for this constructor, + /// comment nodes). + /// + /// Note: in `roxmltree`, attributes and namespaces are not separate nodes + /// and therefore are not tracked individually in this `NodeSet`. During + /// canonicalization, any attributes and namespace declarations belonging to + /// the included elements are serialized as part of those elements. + pub fn subtree(element: Node<'a, 'a>) -> Self { + let mut ids = HashSet::new(); + collect_subtree_ids(element, &mut ids); + Self { + doc: element.document(), + included: Some(ids), + excluded: HashSet::new(), + with_comments: true, + } + } + + /// Reference to the underlying document. + pub fn document(&self) -> &'a Document<'a> { + self.doc + } + + /// Check whether a node is in this set. + /// + /// Returns `false` for nodes from a different document than this set's + /// owning document (prevents cross-document NodeId collisions). + pub fn contains(&self, node: Node<'_, '_>) -> bool { + // Guard: reject nodes from a different document. NodeIds are + // per-document indices — the same index from another document + // would reference a completely different node. + if !std::ptr::eq(node.document() as *const _, self.doc as *const _) { + return false; + } + + let id = node.id(); + + // Check exclusion first + if self.excluded.contains(&id) { + return false; + } + + // Filter comments if not included + if !self.with_comments && node.is_comment() { + return false; + } + + // Check inclusion + match &self.included { + None => true, + Some(ids) => ids.contains(&id), + } + } + + /// Exclude a node and all its descendants from this set. + /// + /// No-op for nodes from a different document. + pub fn exclude_subtree(&mut self, node: Node<'_, '_>) { + // Guard: only exclude nodes from our document + if !std::ptr::eq(node.document() as *const _, self.doc as *const _) { + return; + } + collect_subtree_ids(node, &mut self.excluded); + } + + /// Whether comments are included in this node set. + pub fn with_comments(&self) -> bool { + self.with_comments + } +} + +/// Collect a node and all its descendants into a set of `NodeId`s. +/// +/// Uses an explicit stack instead of recursion to avoid stack overflow +/// on deeply nested XML (attacker-controlled input in SAML contexts). +fn collect_subtree_ids(node: Node<'_, '_>, ids: &mut HashSet) { + let mut stack = vec![node]; + while let Some(current) = stack.pop() { + ids.insert(current.id()); + for child in current.children() { + stack.push(child); + } + } + // In roxmltree, attributes and namespaces are not nodes and do not + // appear in `children()` traversal; they're accessed via + // node.attributes(). We therefore track the NodeIds of all descendant + // nodes reachable via `children()` (elements, text, comments, + // processing instructions, etc.). During C14N, the serializer checks + // whether an element is in the node set and then serializes all of + // that element's attributes/namespaces as part of the element, so + // separate attribute/namespace identifiers are unnecessary. +} + +/// Errors during transform processing. +#[derive(Debug, thiserror::Error)] +pub enum TransformError { + /// Data type mismatch between transforms. + #[error("type mismatch: expected {expected}, got {got}")] + TypeMismatch { + /// Expected data type. + expected: &'static str, + /// Actual data type. + got: &'static str, + }, + + /// Element not found by ID. + #[error("element not found by ID: {0}")] + ElementNotFound(String), + + /// Unsupported URI scheme or format. + #[error("unsupported URI: {0}")] + UnsupportedUri(String), + + /// Unsupported transform algorithm. + #[error("unsupported transform: {0}")] + UnsupportedTransform(String), + + /// Canonicalization error during transform. + #[error("C14N error: {0}")] + C14n(String), +} diff --git a/src/xmldsig/uri.rs b/src/xmldsig/uri.rs new file mode 100644 index 0000000..539cf83 --- /dev/null +++ b/src/xmldsig/uri.rs @@ -0,0 +1,699 @@ +//! URI dereference for XMLDSig `` elements. +//! +//! Implements same-document URI resolution per +//! [XMLDSig §4.3.3.2](https://www.w3.org/TR/xmldsig-core1/#sec-Same-Document): +//! +//! - **Empty URI** (`""` or absent): the entire document, excluding comments. +//! - **Bare-name `#id`**: the element whose ID attribute matches `id`, as a subtree. +//! - **`#xpointer(/)`**: the entire document, including comments. +//! - **`#xpointer(id('id'))` / `#xpointer(id("id"))`**: element by ID (equivalent to bare-name). +//! +//! External URIs (http://, file://, etc.) are not supported — only same-document +//! references are needed for SAML signature verification. + +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; + +use roxmltree::{Document, Node}; + +use super::types::{NodeSet, TransformData, TransformError}; + +/// Default ID attribute names to scan when building the ID index. +/// +/// These cover the most common conventions: +/// - `ID` — SAML 2.0 (``) +/// - `Id` — XMLDSig (``) +/// - `id` — general XML +const DEFAULT_ID_ATTRS: &[&str] = &["ID", "Id", "id"]; + +/// Resolves same-document URI references against a parsed XML document. +/// +/// Builds a `HashMap<&str, Node>` index on construction for O(1) fragment +/// lookups. Supports caller-provided ID attribute names (important for SAML +/// which uses `ID` rather than the xml:id mechanism). +/// +/// # Example +/// +/// ``` +/// # fn main() -> Result<(), Box> { +/// use xml_sec::xmldsig::uri::UriReferenceResolver; +/// +/// let xml = r#"content"#; +/// let doc = roxmltree::Document::parse(xml)?; +/// let resolver = UriReferenceResolver::new(&doc); +/// +/// assert!(resolver.has_id("abc")); +/// assert_eq!(resolver.id_count(), 1); +/// # Ok(()) +/// # } +/// ``` +pub struct UriReferenceResolver<'a> { + doc: &'a Document<'a>, + /// ID → element node mapping for O(1) fragment lookups. + id_map: HashMap<&'a str, Node<'a, 'a>>, +} + +impl<'a> UriReferenceResolver<'a> { + /// Build a resolver with default ID attribute names (`ID`, `Id`, `id`). + pub fn new(doc: &'a Document<'a>) -> Self { + Self::with_id_attrs(doc, DEFAULT_ID_ATTRS) + } + + /// Build a resolver scanning additional ID attribute names beyond the defaults. + /// + /// The defaults (`ID`, `Id`, `id`) are always included; `extra_attrs` + /// adds to them (does not replace). Pass an empty slice to use only defaults. + /// + /// Attribute names are matched using `roxmltree`'s *local-name* view of + /// attributes: any namespace prefix is stripped before comparison. For + /// example, an attribute written as `wsu:Id="..."` in the XML is seen as + /// simply `Id` by `roxmltree`, so callers **must** pass `"Id"`, not + /// `"wsu:Id"` or `"{namespace}Id"`. + pub fn with_id_attrs(doc: &'a Document<'a>, extra_attrs: &[&str]) -> Self { + let mut id_map = HashMap::new(); + // Track IDs seen more than once so they are never reinserted + // after being removed (handles 3+ occurrences correctly). + let mut duplicate_ids: HashSet<&'a str> = HashSet::new(); + + // Merge default + extra attribute names, dedup + let mut attr_names: Vec<&str> = DEFAULT_ID_ATTRS.to_vec(); + for name in extra_attrs { + if !attr_names.contains(name) { + attr_names.push(name); + } + } + + // Scan all elements for ID attributes + for node in doc.descendants() { + if node.is_element() { + for attr_name in &attr_names { + if let Some(value) = node.attribute(*attr_name) { + // Skip IDs already marked as duplicate + if duplicate_ids.contains(value) { + continue; + } + + // Duplicate IDs are invalid per XML spec and can enable + // signature-wrapping attacks. Remove the entry so that + // lookups for ambiguous IDs fail with ElementNotFound + // rather than silently picking an arbitrary node. + match id_map.entry(value) { + Entry::Vacant(v) => { + v.insert(node); + } + Entry::Occupied(o) => { + // Only treat as duplicate if a *different* element + // maps the same ID value. The same element can + // expose the same value via multiple scanned attrs + // (e.g., both `ID="x"` and `Id="x"`). + if o.get().id() != node.id() { + o.remove(); + duplicate_ids.insert(value); + } + } + } + } + } + } + } + + Self { doc, id_map } + } + + /// Dereference a URI string to a [`TransformData`]. + /// + /// # URI forms + /// + /// | URI | Result | + /// |-----|--------| + /// | `""` (empty) | Entire document, comments excluded | + /// | `"#foo"` | Subtree rooted at element with ID `foo` | + /// | `"#xpointer(/)"` | Entire document, comments included | + /// | `"#xpointer(id('foo'))"` | Subtree rooted at element with ID `foo` | + /// | other | `Err(UnsupportedUri)` | + pub fn dereference(&self, uri: &str) -> Result, TransformError> { + if uri.is_empty() { + // Empty URI = entire document without comments + // XMLDSig §4.3.3.2: "the reference is to the document [...], + // and the comment nodes are not included" + Ok(TransformData::NodeSet( + NodeSet::entire_document_without_comments(self.doc), + )) + } else if let Some(fragment) = uri.strip_prefix('#') { + // Note: we intentionally do NOT percent-decode the fragment. + // XMLDSig ID values are XML Name tokens (no spaces/special chars), + // and real-world SAML never uses percent-encoded fragments. + // xmlsec1 also passes fragments through without decoding. + self.dereference_fragment(fragment) + } else { + Err(TransformError::UnsupportedUri(uri.to_string())) + } + } + + /// Resolve a URI fragment (the part after `#`). + /// + /// Handles: + /// - `xpointer(/)` → entire document (with comments, per XPointer spec) + /// - `xpointer(id('foo'))` → element by ID (equivalent to bare-name `#foo`) + /// - bare name `foo` → element by ID attribute + fn dereference_fragment(&self, fragment: &str) -> Result, TransformError> { + if fragment.is_empty() { + // Bare "#" is not a valid same-document reference + return Err(TransformError::UnsupportedUri("#".to_string())); + } + + if fragment == "xpointer(/)" { + // XPointer root: entire document WITH comments (unlike empty URI). + // Per XMLDSig §4.3.3.3: "the XPointer expression [...] includes + // comment nodes" + Ok(TransformData::NodeSet( + NodeSet::entire_document_with_comments(self.doc), + )) + } else if let Some(id) = parse_xpointer_id(fragment) { + // xpointer(id('foo')) → same as bare-name #foo + // Reject empty parsed ID (e.g., xpointer(id(''))) — not a valid XML Name + if id.is_empty() { + return Err(TransformError::UnsupportedUri(format!("#{fragment}"))); + } + self.resolve_id(id) + } else if fragment.starts_with("xpointer(") { + // Any other XPointer expression is unsupported + Err(TransformError::UnsupportedUri(format!("#{fragment}"))) + } else { + // Bare-name fragment: #foo → element by ID + self.resolve_id(fragment) + } + } + + /// Look up an element by its ID attribute value and return a subtree node set. + fn resolve_id(&self, id: &str) -> Result, TransformError> { + match self.id_map.get(id) { + Some(&element) => Ok(TransformData::NodeSet(NodeSet::subtree(element))), + None => Err(TransformError::ElementNotFound(id.to_string())), + } + } + + /// Check if an ID is registered in the resolver's index. + pub fn has_id(&self, id: &str) -> bool { + self.id_map.contains_key(id) + } + + /// Get the number of registered IDs. + pub fn id_count(&self) -> usize { + self.id_map.len() + } +} + +/// Parse `xpointer(id('value'))` or `xpointer(id("value"))` and return the ID value. +/// Returns `None` if the fragment doesn't match this pattern. +fn parse_xpointer_id(fragment: &str) -> Option<&str> { + let inner = fragment.strip_prefix("xpointer(id(")?.strip_suffix("))")?; + + // Strip single or double quotes using safe helpers to avoid panics + // on malformed input (e.g., `xpointer(id('))` where inner is `'`) + if let Some(stripped) = inner.strip_prefix('\'').and_then(|s| s.strip_suffix('\'')) { + Some(stripped) + } else if let Some(stripped) = inner.strip_prefix('"').and_then(|s| s.strip_suffix('"')) { + Some(stripped) + } else { + None + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::super::types::NodeSet; + use super::*; + + #[test] + fn empty_uri_returns_whole_document() { + let xml = "text"; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("").unwrap(); + let node_set = data.into_node_set().unwrap(); + + // Whole document: root and child should be in the set + let root = doc.root_element(); + assert!(node_set.contains(root)); + let child = root.first_child().unwrap(); + assert!(node_set.contains(child)); + } + + #[test] + fn empty_uri_excludes_comments() { + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("").unwrap(); + let node_set = data.into_node_set().unwrap(); + + // Comment should be excluded + for node in doc.descendants() { + if node.is_comment() { + assert!( + !node_set.contains(node), + "comment should be excluded for empty URI" + ); + } + } + // Element should still be included + assert!(node_set.contains(doc.root_element())); + } + + #[test] + fn fragment_uri_resolves_by_id_attr() { + let xml = r#"contentother"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("#abc").unwrap(); + let node_set = data.into_node_set().unwrap(); + + // The element with ID="abc" and its children should be in the set + let abc_elem = doc + .descendants() + .find(|n| n.attribute("ID") == Some("abc")) + .unwrap(); + assert!(node_set.contains(abc_elem)); + + // The text child "content" should also be in the set + let text_child = abc_elem.first_child().unwrap(); + assert!(node_set.contains(text_child)); + + // The root element should NOT be in the set (subtree only) + assert!(!node_set.contains(doc.root_element())); + + // The element with ID="def" should NOT be in the set + let def_elem = doc + .descendants() + .find(|n| n.attribute("ID") == Some("def")) + .unwrap(); + assert!(!node_set.contains(def_elem)); + } + + #[test] + fn fragment_uri_resolves_lowercase_id() { + let xml = r#"text"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("#lower").unwrap(); + let node_set = data.into_node_set().unwrap(); + + let elem = doc + .descendants() + .find(|n| n.attribute("id") == Some("lower")) + .unwrap(); + assert!(node_set.contains(elem)); + } + + #[test] + fn fragment_uri_resolves_mixed_case_id() { + let xml = r#""#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + assert!(resolver.has_id("sig1")); + let data = resolver.dereference("#sig1").unwrap(); + assert!(data.into_node_set().is_ok()); + } + + #[test] + fn fragment_uri_not_found() { + let xml = "text"; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let result = resolver.dereference("#nonexistent"); + assert!(result.is_err()); + match result.unwrap_err() { + TransformError::ElementNotFound(id) => assert_eq!(id, "nonexistent"), + other => panic!("expected ElementNotFound, got: {other:?}"), + } + } + + #[test] + fn unsupported_external_uri() { + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let result = resolver.dereference("http://example.com/doc.xml"); + assert!(result.is_err()); + match result.unwrap_err() { + TransformError::UnsupportedUri(uri) => { + assert_eq!(uri, "http://example.com/doc.xml") + } + other => panic!("expected UnsupportedUri, got: {other:?}"), + } + } + + #[test] + fn unsupported_xpointer_expression() { + // XPointer expressions other than xpointer(/) and xpointer(id(...)) + // should return UnsupportedUri, not fall through to ID lookup + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let result = resolver.dereference("#xpointer(foo())"); + assert!(result.is_err()); + match result.unwrap_err() { + TransformError::UnsupportedUri(uri) => { + assert_eq!(uri, "#xpointer(foo())") + } + other => panic!("expected UnsupportedUri, got: {other:?}"), + } + + // Generic XPointer with XPath should also be unsupported + let result = resolver.dereference("#xpointer(//element)"); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + TransformError::UnsupportedUri(_) + )); + } + + #[test] + fn empty_fragment_rejected() { + // Bare "#" (empty fragment) is not a valid same-document reference + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let result = resolver.dereference("#"); + assert!(result.is_err()); + match result.unwrap_err() { + TransformError::UnsupportedUri(uri) => assert_eq!(uri, "#"), + other => panic!("expected UnsupportedUri, got: {other:?}"), + } + } + + #[test] + fn foreign_document_node_rejected() { + // NodeSet.contains() must reject nodes from a different document + let xml1 = ""; + let xml2 = ""; + let doc1 = Document::parse(xml1).unwrap(); + let doc2 = Document::parse(xml2).unwrap(); + + let node_set = NodeSet::entire_document_without_comments(&doc1); + + // Node from doc2 should NOT be in doc1's node set + let foreign_node = doc2.root_element(); + assert!( + !node_set.contains(foreign_node), + "foreign document node should be rejected" + ); + + // Node from doc1 should be in the set + let own_node = doc1.root_element(); + assert!(node_set.contains(own_node)); + } + + #[test] + fn custom_id_attr_name() { + // roxmltree stores `wsu:Id` with local name "Id" — already in DEFAULT_ID_ATTRS. + // Test with a truly custom attribute name instead. + let xml = r#"data"#; + let doc = Document::parse(xml).unwrap(); + + // Default resolver doesn't know about "myid" + let resolver_default = UriReferenceResolver::new(&doc); + assert!(!resolver_default.has_id("custom1")); + + // Custom resolver with "myid" added + let resolver_custom = UriReferenceResolver::with_id_attrs(&doc, &["myid"]); + assert!(resolver_custom.has_id("custom1")); + + let data = resolver_custom.dereference("#custom1").unwrap(); + assert!(data.into_node_set().is_ok()); + } + + #[test] + fn namespaced_id_attr_found_by_local_name() { + // roxmltree strips prefix: `wsu:Id` → local name "Id", which is in DEFAULT_ID_ATTRS + let xml = + r#"data"#; + let doc = Document::parse(xml).unwrap(); + + let resolver = UriReferenceResolver::new(&doc); + assert!(resolver.has_id("ts1")); + } + + #[test] + fn id_count_reports_unique_ids() { + let xml = r#""#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + // 4 elements with ID-like attributes + assert_eq!(resolver.id_count(), 4); + } + + #[test] + fn duplicate_ids_are_rejected() { + // Duplicate IDs are removed from the index to prevent signature-wrapping + // attacks — lookups for ambiguous IDs fail instead of picking arbitrarily. + let xml = r#"firstsecond"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + // "dup" appears twice → removed from index + assert!(!resolver.has_id("dup")); + let result = resolver.dereference("#dup"); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + TransformError::ElementNotFound(_) + )); + } + + #[test] + fn triple_duplicate_ids_stay_rejected() { + // Verify that 3+ occurrences don't re-insert (the HashSet tracks + // permanently removed IDs so Entry::Vacant after remove doesn't re-add) + let xml = r#"123"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + assert!(!resolver.has_id("dup")); + assert!(resolver.dereference("#dup").is_err()); + } + + #[test] + fn node_set_exclude_subtree() { + let xml = r#"yesno"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("").unwrap(); + let mut node_set = data.into_node_set().unwrap(); + + // Find and exclude the subtree + let remove_elem = doc + .descendants() + .find(|n| n.is_element() && n.has_tag_name("remove")) + .unwrap(); + node_set.exclude_subtree(remove_elem); + + // should still be in the set + let keep_elem = doc + .descendants() + .find(|n| n.is_element() && n.has_tag_name("keep")) + .unwrap(); + assert!(node_set.contains(keep_elem)); + + // and its children should be excluded + assert!(!node_set.contains(remove_elem)); + let deep_elem = doc + .descendants() + .find(|n| n.is_element() && n.has_tag_name("deep")) + .unwrap(); + assert!(!node_set.contains(deep_elem)); + } + + #[test] + fn subtree_includes_comments() { + // Subtree dereference (via #id) includes comments, unlike empty URI + let xml = r#""#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("#x").unwrap(); + let node_set = data.into_node_set().unwrap(); + + for node in doc.descendants() { + if node.is_comment() { + assert!( + node_set.contains(node), + "comment should be included in #id subtree" + ); + } + } + } + + #[test] + fn xpointer_root_returns_whole_document_with_comments() { + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("#xpointer(/)").unwrap(); + let node_set = data.into_node_set().unwrap(); + + // Unlike empty URI, xpointer(/) includes comments + for node in doc.descendants() { + if node.is_comment() { + assert!( + node_set.contains(node), + "comment should be included for #xpointer(/)" + ); + } + } + assert!(node_set.contains(doc.root_element())); + } + + #[test] + fn xpointer_id_single_quotes() { + let xml = r#"content"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("#xpointer(id('abc'))").unwrap(); + let node_set = data.into_node_set().unwrap(); + + let elem = doc + .descendants() + .find(|n| n.attribute("ID") == Some("abc")) + .unwrap(); + assert!(node_set.contains(elem)); + } + + #[test] + fn xpointer_id_double_quotes() { + let xml = r#"content"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference(r#"#xpointer(id("xyz"))"#).unwrap(); + let node_set = data.into_node_set().unwrap(); + + let elem = doc + .descendants() + .find(|n| n.attribute("ID") == Some("xyz")) + .unwrap(); + assert!(node_set.contains(elem)); + } + + #[test] + fn xpointer_id_not_found() { + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let result = resolver.dereference("#xpointer(id('missing'))"); + assert!(result.is_err()); + match result.unwrap_err() { + TransformError::ElementNotFound(id) => assert_eq!(id, "missing"), + other => panic!("expected ElementNotFound, got: {other:?}"), + } + } + + #[test] + fn xpointer_id_empty_value_rejected() { + // xpointer(id('')) parses to empty string — reject as UnsupportedUri + let xml = ""; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + let result = resolver.dereference("#xpointer(id(''))"); + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + TransformError::UnsupportedUri(_) + )); + } + + #[test] + fn parse_xpointer_id_variants() { + // Valid forms + assert_eq!(super::parse_xpointer_id("xpointer(id('foo'))"), Some("foo")); + assert_eq!( + super::parse_xpointer_id(r#"xpointer(id("bar"))"#), + Some("bar") + ); + + // Invalid forms + assert_eq!(super::parse_xpointer_id("xpointer(/)"), None); + assert_eq!(super::parse_xpointer_id("xpointer(id(foo))"), None); // no quotes + assert_eq!(super::parse_xpointer_id("not-xpointer"), None); + assert_eq!(super::parse_xpointer_id(""), None); + + // Malformed: single quote char — must not panic (was slicing bug) + assert_eq!(super::parse_xpointer_id("xpointer(id('))"), None); + assert_eq!(super::parse_xpointer_id(r#"xpointer(id("))"#), None); + } + + #[test] + fn same_element_multiple_id_attrs_not_duplicate() { + // An element with both ID="x" and Id="x" should NOT be treated as + // duplicate — it's the same element exposing the same value via + // different scanned attribute names. + let xml = r#"data"#; + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + assert!(resolver.has_id("x")); + assert!(resolver.dereference("#x").is_ok()); + } + + #[test] + fn saml_style_document() { + // Realistic SAML-like structure + let xml = r#" + + user@example.com + + + + + "#; + + let doc = Document::parse(xml).unwrap(); + let resolver = UriReferenceResolver::new(&doc); + + // Should find all three IDs + assert!(resolver.has_id("_resp1")); + assert!(resolver.has_id("_assert1")); + assert!(resolver.has_id("sig1")); + assert_eq!(resolver.id_count(), 3); + + // Dereference the assertion + let data = resolver.dereference("#_assert1").unwrap(); + let node_set = data.into_node_set().unwrap(); + + // Assertion element should be in the set + let assertion = doc + .descendants() + .find(|n| n.attribute("ID") == Some("_assert1")) + .unwrap(); + assert!(node_set.contains(assertion)); + + // Subject (child of assertion) should be in the set + let subject = assertion + .children() + .find(|n| n.is_element() && n.has_tag_name("Subject")) + .unwrap(); + assert!(node_set.contains(subject)); + + // Response (parent) should NOT be in the set + assert!(!node_set.contains(doc.root_element())); + } +} diff --git a/tests/uri_integration.rs b/tests/uri_integration.rs new file mode 100644 index 0000000..b4776fd --- /dev/null +++ b/tests/uri_integration.rs @@ -0,0 +1,218 @@ +//! Integration tests: URI dereference → NodeSet → C14N canonicalization. +//! +//! Verifies that dereferencing a URI produces a NodeSet that, when used as +//! a predicate for C14N, produces the correct canonical output. + +use xml_sec::c14n::{canonicalize, C14nAlgorithm, C14nMode}; +use xml_sec::xmldsig::uri::UriReferenceResolver; + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +/// Dereference `uri`, build a C14N predicate from the resulting NodeSet, +/// canonicalize with inclusive C14N 1.0, and return the canonical string. +fn deref_and_canonicalize(xml: &str, uri: &str) -> String { + deref_and_canonicalize_impl(xml, uri, false) +} + +/// Same but with comments enabled (for xpointer(/) which includes comments). +fn deref_and_canonicalize_with_comments(xml: &str, uri: &str) -> String { + deref_and_canonicalize_impl(xml, uri, true) +} + +/// Shared implementation for dereferencing and canonicalizing, parameterized +/// by whether comments should be included. +fn deref_and_canonicalize_impl(xml: &str, uri: &str, with_comments: bool) -> String { + let doc = roxmltree::Document::parse(xml).expect("parse"); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference(uri).expect("dereference"); + let node_set = data.into_node_set().expect("into_node_set"); + + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, with_comments); + let predicate = |n: roxmltree::Node| node_set.contains(n); + let mut output = Vec::new(); + canonicalize(&doc, Some(&predicate), &algo, &mut output).expect("canonicalize"); + String::from_utf8(output).expect("utf8") +} + +// ─── Empty URI: whole document without comments ───────────────────────────── + +#[test] +fn empty_uri_canonicalizes_whole_document() { + let xml = r#""#; + let result = deref_and_canonicalize(xml, ""); + // C14N sorts attributes and expands empty elements + assert_eq!(result, r#""#); +} + +#[test] +fn empty_uri_strips_comments() { + let xml = "text"; + let result = deref_and_canonicalize(xml, ""); + // Comment should be stripped for empty URI dereference + assert_eq!(result, "text"); +} + +#[test] +fn empty_uri_with_namespaces() { + let xml = r#""#; + let result = deref_and_canonicalize(xml, ""); + // Inclusive C14N: root declares both ns, child suppresses redundant redeclarations + assert_eq!( + result, + r#""# + ); +} + +// ─── #id: subtree by ID ───────────────────────────────────────────────────── + +#[test] +fn fragment_id_canonicalizes_subtree_only() { + let xml = r#"skiptextskip"#; + let result = deref_and_canonicalize(xml, "#t1"); + // Only the target subtree should appear; root, before, after excluded + assert_eq!( + result, + r#"text"# + ); +} + +#[test] +fn fragment_id_includes_comments_in_subtree() { + // Unlike empty URI, #id subtrees include comments + let xml = r#""#; + let result = deref_and_canonicalize_with_comments(xml, "#x"); + assert_eq!( + result, + r#""# + ); +} + +#[test] +fn fragment_id_inherits_ancestor_namespaces() { + // When canonicalizing a subtree, inclusive C14N emits in-scope namespaces + // from ancestor elements even though those ancestors are not in the node set + let xml = + r#""#; + let result = deref_and_canonicalize(xml, "#sub"); + // ns:item declares xmlns:ns (inherited from root ancestor outside subset). + // ns:child suppresses redundant redeclaration since parent ns:item already declared it. + assert_eq!( + result, + r#""# + ); +} + +// ─── #xpointer(/) : whole document WITH comments ─────────────────────────── + +#[test] +fn xpointer_root_includes_comments() { + let xml = ""; + let result = deref_and_canonicalize_with_comments(xml, "#xpointer(/)"); + // xpointer(/) includes comments, unlike empty URI + assert_eq!(result, ""); +} + +#[test] +fn xpointer_root_vs_empty_uri_comment_difference() { + let xml = ""; + + let empty_uri = deref_and_canonicalize(xml, ""); + let xpointer_root = deref_and_canonicalize_with_comments(xml, "#xpointer(/)"); + + // Empty URI: comments stripped + assert_eq!(empty_uri, ""); + // xpointer(/): comments preserved + assert_eq!( + xpointer_root, + "" + ); +} + +// ─── #xpointer(id('...')) : equivalent to bare-name ──────────────────────── + +#[test] +fn xpointer_id_canonicalizes_same_as_bare_name() { + let xml = r#"data"#; + + let bare_name = deref_and_canonicalize(xml, "#abc"); + let xpointer = deref_and_canonicalize(xml, "#xpointer(id('abc'))"); + + assert_eq!(bare_name, xpointer); +} + +// ─── SAML-like scenario ───────────────────────────────────────────────────── + +#[test] +fn saml_assertion_subtree_canonicalization() { + // Realistic SAML: dereference assertion by ID, canonicalize the subtree + let xml = r#" + + user@example.com + +"#; + + let result = deref_and_canonicalize(xml, "#_a1"); + + // Assertion subtree with inherited namespace declarations + assert!( + result.contains("xmlns:saml=\"urn:oasis:names:tc:SAML:2.0:assertion\""), + "should inherit saml namespace from ancestor: {result}" + ); + assert!( + result.contains("user@example.com"), + "should include Subject child: {result}" + ); + // Response element should NOT appear + assert!( + !result.contains("samlp:Response"), + "Response should not be in subtree: {result}" + ); +} + +#[test] +fn saml_enveloped_signature_exclusion() { + // Simulate enveloped signature: dereference whole doc, then exclude Signature subtree + // This is what P1-014 (enveloped transform) will do, but we can test the + // NodeSet.exclude_subtree() + C14N combination here + let xml = r#" + data + + digest + +"#; + + let doc = roxmltree::Document::parse(xml).expect("parse"); + let resolver = UriReferenceResolver::new(&doc); + + let data = resolver.dereference("").expect("dereference"); + let mut node_set = data.into_node_set().expect("into_node_set"); + + // Exclude the Signature subtree (simulating enveloped-signature transform) + let sig_elem = doc + .descendants() + .find(|n| n.is_element() && n.has_tag_name("Signature")) + .expect("Signature element"); + node_set.exclude_subtree(sig_elem); + + let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); + let predicate = |n: roxmltree::Node| node_set.contains(n); + let mut output = Vec::new(); + canonicalize(&doc, Some(&predicate), &algo, &mut output).expect("canonicalize"); + let result = String::from_utf8(output).expect("utf8"); + + // Signature and its children should be gone + assert!( + !result.contains("Signature"), + "Signature should be excluded: {result}" + ); + assert!( + !result.contains("SignedInfo"), + "SignedInfo should be excluded: {result}" + ); + // Assertion should remain + assert!( + result.contains("data"), + "Assertion should remain: {result}" + ); +}