-
Notifications
You must be signed in to change notification settings - Fork 0
feat(c14n): implement XML canonicalization (inclusive + exclusive) #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
c78ffad
feat(c14n): implement XML canonicalization (inclusive + exclusive)
polaz d4326d7
fix(c14n): escape CR in PI/node content, reject C14N 1.1
polaz e427464
refactor(c14n): encapsulate C14nAlgorithm fields, remove dead code
polaz 0f88992
fix(c14n): clarify UTF-8 error, fix inclusive ns doc, document subset…
polaz 55f7820
docs(c14n): clarify with_prefix_list is exclusive-mode only
polaz 74f2e9d
docs(c14n): add UTF-8 requirement doc, remove stale file reference
polaz a9fb11b
perf(c14n): document clone() cost and deferred optimization path
polaz 7f8ca36
fix(c14n): use lexical prefixes from source XML instead of lookup_pre…
polaz 026a410
fix(c14n): correct xmlns="" suppression for document subsets
polaz 04fc3a2
fix(c14n): use Result in doctests, fix has_in_scope_default_namespace…
polaz 9965969
fix(c14n): treat default ns as visibly utilized for unprefixed elements
polaz File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,3 +5,5 @@ Cargo.lock | |
| .DS_Store | ||
| donors/ | ||
| .refs/ | ||
| arch/ | ||
| ROADMAP.md | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| //! Text and attribute value escaping for canonical XML. | ||
|
|
||
| /// Escape text node content for canonical XML. | ||
| /// | ||
| /// Replaces: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `\r` → `&#xD;` | |
| pub(crate) fn escape_text(s: &str, output: &mut Vec<u8>) { | ||
| for b in s.bytes() { | ||
| match b { | |
| b'&' => output.extend_from_slice(b"&amp;"), | |
| b'<' => output.extend_from_slice(b"&lt;"), | |
| b'>' => output.extend_from_slice(b"&gt;"), | |
| b'\r' => output.extend_from_slice(b"&#xD;"), | |
| _ => output.push(b), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Escape attribute value for canonical XML. | ||
| /// | ||
| /// Replaces: `&` → `&amp;`, `<` → `&lt;`, `"` → `&quot;`, | |
| /// `\t` → `&#x9;`, `\n` → `&#xA;`, `\r` → `&#xD;` | |
| pub(crate) fn escape_attr(s: &str, output: &mut Vec<u8>) { | ||
| for b in s.bytes() { | ||
| match b { | |
| b'&' => output.extend_from_slice(b"&amp;"), | |
| b'<' => output.extend_from_slice(b"&lt;"), | |
| b'"' => output.extend_from_slice(b"&quot;"), | |
| b'\t' => output.extend_from_slice(b"&#x9;"), | |
| b'\n' => output.extend_from_slice(b"&#xA;"), | |
| b'\r' => output.extend_from_slice(b"&#xD;"), | |
| _ => output.push(b), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Escape only carriage returns in comment/PI content for canonical XML. | ||
| /// | ||
| /// C14N spec section 2.3: `\r` in comments and PIs → `&#xD;` | |
| pub(crate) fn escape_cr(s: &str, output: &mut Vec<u8>) { | ||
| for b in s.bytes() { | ||
| match b { | |
| b'\r' => output.extend_from_slice(b"&#xD;"), | |
| _ => output.push(b), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| #[allow(clippy::unwrap_used)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn text_escaping() { | ||
| let mut out = Vec::new(); | ||
| escape_text("a < b & c > d\r\n", &mut out); | ||
| assert_eq!( | ||
| String::from_utf8(out).expect("valid utf8"), | |
| "a &lt; b &amp; c &gt; d&#xD;\n" | |
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn attr_escaping() { | ||
| let mut out = Vec::new(); | ||
| escape_attr("he said \"hi\" & \t\n\r", &mut out); | ||
| assert_eq!( | ||
| String::from_utf8(out).expect("valid utf8"), | |
| "he said &quot;hi&quot; &amp; &#x9;&#xA;&#xD;" | |
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn passthrough_plain_text() { | ||
| let mut out = Vec::new(); | ||
| escape_text("hello world", &mut out); | ||
| assert_eq!(String::from_utf8(out).expect("valid utf8"), "hello world"); | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,265 @@ | ||
| //! XML Canonicalization (C14N). | ||
| //! | ||
| //! Implements: | ||
| //! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) | ||
| //! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) | ||
| //! - [Exclusive XML Canonicalization](https://www.w3.org/TR/xml-exc-c14n/) | ||
| //! - [Canonical XML 1.0](https://www.w3.org/TR/xml-c14n/) (inclusive) | ||
| //! - [Canonical XML 1.1](https://www.w3.org/TR/xml-c14n11/) — URI parsing only; | ||
| //! canonicalization returns `UnsupportedAlgorithm` (1.1-specific rules not yet implemented) | ||
| //! - [Exclusive XML Canonicalization 1.0](https://www.w3.org/TR/xml-exc-c14n/) (exclusive) | ||
|
polaz marked this conversation as resolved.
|
||
| //! | ||
| //! # Example | ||
| //! | ||
| //! ``` | ||
| //! # fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
| //! use xml_sec::c14n::{C14nAlgorithm, C14nMode, canonicalize_xml}; | ||
| //! | ||
| //! let xml = b"<root b=\"2\" a=\"1\"><empty/></root>"; | ||
| //! let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); | ||
| //! let canonical = canonicalize_xml(xml, &algo)?; | ||
| //! assert_eq!( | ||
| //! String::from_utf8(canonical)?, | ||
| //! "<root a=\"1\" b=\"2\"><empty></empty></root>" | ||
| //! ); | ||
|
polaz marked this conversation as resolved.
|
||
| //! # Ok(()) | ||
| //! # } | ||
| //! ``` | ||
|
|
||
| mod escape; | ||
| pub(crate) mod ns_exclusive; | ||
| pub(crate) mod ns_inclusive; | ||
| mod prefix; | ||
| pub(crate) mod serialize; | ||
|
|
||
| /// Canonicalization algorithm selection. | ||
| use std::collections::HashSet; | ||
|
|
||
| use roxmltree::{Document, Node}; | ||
|
|
||
| use ns_exclusive::ExclusiveNsRenderer; | ||
| use ns_inclusive::InclusiveNsRenderer; | ||
| use serialize::serialize_canonical; | ||
|
|
||
| /// C14N algorithm mode (without the comments flag). | ||
| #[derive(Debug, Clone, Copy, PartialEq, Eq)] | ||
| pub enum C14nAlgorithm { | ||
| /// Canonical XML 1.0 (with comments). | ||
| Inclusive10WithComments, | ||
| /// Canonical XML 1.0 (without comments). | ||
| Inclusive10, | ||
| /// Canonical XML 1.1 (with comments). | ||
| Inclusive11WithComments, | ||
| /// Canonical XML 1.1 (without comments). | ||
| Inclusive11, | ||
| /// Exclusive Canonical XML (with comments). | ||
| Exclusive10WithComments, | ||
| /// Exclusive Canonical XML (without comments). | ||
| Exclusive10, | ||
| pub enum C14nMode { | ||
| /// Inclusive C14N 1.0 — all in-scope namespaces rendered. | ||
| Inclusive1_0, | ||
| /// Inclusive C14N 1.1 — like 1.0 with xml:id propagation and xml:base fixup. | ||
| Inclusive1_1, | ||
| /// Exclusive C14N 1.0 — only visibly-utilized namespaces rendered. | ||
| Exclusive1_0, | ||
| } | ||
|
|
||
| /// Full C14N algorithm identifier. | ||
| /// | ||
| /// Constructed from algorithm URIs found in `<CanonicalizationMethod>` or | ||
| /// `<Transform>` elements. | ||
| #[derive(Debug, Clone, PartialEq, Eq)] | ||
| pub struct C14nAlgorithm { | ||
| mode: C14nMode, | ||
| with_comments: bool, | ||
| /// For Exclusive C14N: prefixes forced via InclusiveNamespaces PrefixList. | ||
| /// `"#default"` is normalized to `""` (empty string) by `with_prefix_list()`. | ||
| inclusive_prefixes: HashSet<String>, | ||
| } | ||
|
|
||
| impl C14nAlgorithm { | ||
| /// The canonicalization mode. | ||
| pub fn mode(&self) -> C14nMode { | ||
| self.mode | ||
| } | ||
|
|
||
| /// Whether comment nodes are preserved. | ||
| pub fn with_comments(&self) -> bool { | ||
| self.with_comments | ||
| } | ||
|
|
||
| /// Prefixes forced via InclusiveNamespaces PrefixList (exclusive C14N). | ||
| pub fn inclusive_prefixes(&self) -> &HashSet<String> { | ||
| &self.inclusive_prefixes | ||
| } | ||
|
|
||
| /// Create a new algorithm with the given mode and comments flag. | ||
| pub fn new(mode: C14nMode, with_comments: bool) -> Self { | ||
| Self { | ||
| mode, | ||
| with_comments, | ||
| inclusive_prefixes: HashSet::new(), | ||
| } | ||
| } | ||
|
|
||
| /// Parse from an algorithm URI. Returns `None` for unrecognized URIs. | ||
| pub fn from_uri(uri: &str) -> Option<Self> { | ||
| let (mode, with_comments) = match uri { | ||
| "http://www.w3.org/TR/2001/REC-xml-c14n-20010315" => (C14nMode::Inclusive1_0, false), | ||
| "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments" => { | ||
| (C14nMode::Inclusive1_0, true) | ||
| } | ||
| "http://www.w3.org/2006/12/xml-c14n11" => (C14nMode::Inclusive1_1, false), | ||
| "http://www.w3.org/2006/12/xml-c14n11#WithComments" => (C14nMode::Inclusive1_1, true), | ||
| "http://www.w3.org/2001/10/xml-exc-c14n#" => (C14nMode::Exclusive1_0, false), | ||
| "http://www.w3.org/2001/10/xml-exc-c14n#WithComments" => (C14nMode::Exclusive1_0, true), | ||
| _ => return None, | ||
| }; | ||
| Some(Self { | ||
| mode, | ||
| with_comments, | ||
| inclusive_prefixes: HashSet::new(), | ||
| }) | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| /// Set the InclusiveNamespaces PrefixList (exclusive C14N only). | ||
| /// `"#default"` is normalized to empty string `""`. | ||
| /// | ||
| /// Only meaningful for [`C14nMode::Exclusive1_0`]. For inclusive modes, | ||
| /// the prefix list is ignored during canonicalization. | ||
| pub fn with_prefix_list(mut self, prefix_list: &str) -> Self { | ||
| self.inclusive_prefixes = prefix_list | ||
| .split_whitespace() | ||
| .map(|p| { | ||
| if p == "#default" { | ||
| String::new() | ||
| } else { | ||
| p.to_string() | ||
| } | ||
| }) | ||
| .collect(); | ||
| self | ||
| } | ||
|
|
||
| /// Get the algorithm URI for this configuration. | ||
| pub fn uri(&self) -> &'static str { | ||
| match (self.mode, self.with_comments) { | ||
| (C14nMode::Inclusive1_0, false) => "http://www.w3.org/TR/2001/REC-xml-c14n-20010315", | ||
| (C14nMode::Inclusive1_0, true) => { | ||
| "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments" | ||
| } | ||
| (C14nMode::Inclusive1_1, false) => "http://www.w3.org/2006/12/xml-c14n11", | ||
| (C14nMode::Inclusive1_1, true) => "http://www.w3.org/2006/12/xml-c14n11#WithComments", | ||
| (C14nMode::Exclusive1_0, false) => "http://www.w3.org/2001/10/xml-exc-c14n#", | ||
| (C14nMode::Exclusive1_0, true) => "http://www.w3.org/2001/10/xml-exc-c14n#WithComments", | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Error type for C14N operations. | ||
| #[derive(Debug, thiserror::Error)] | ||
| pub enum C14nError { | ||
| /// XML parsing error. | ||
| #[error("XML parse error: {0}")] | ||
| Parse(String), | ||
| /// Invalid node reference. | ||
| #[error("invalid node reference")] | ||
| InvalidNode, | ||
| /// Algorithm not yet implemented. | ||
| #[error("unsupported algorithm: {0}")] | ||
| UnsupportedAlgorithm(String), | ||
| /// I/O error. | ||
| #[error("I/O error: {0}")] | ||
| Io(#[from] std::io::Error), | ||
| } | ||
|
|
||
| /// Canonicalize an XML document or document subset. | ||
| /// | ||
| /// - `doc`: parsed roxmltree document (read-only DOM). | ||
| /// - `node_set`: optional predicate controlling which nodes appear in output. | ||
| /// `None` means the entire document. | ||
| /// - `algo`: algorithm parameters (mode, comments, prefix list). | ||
| /// - `output`: byte buffer receiving canonical XML. | ||
| pub fn canonicalize( | ||
| doc: &Document, | ||
| node_set: Option<&dyn Fn(Node) -> bool>, | ||
| algo: &C14nAlgorithm, | ||
| output: &mut Vec<u8>, | ||
| ) -> Result<(), C14nError> { | ||
| match algo.mode { | ||
| C14nMode::Inclusive1_0 => { | ||
| let renderer = InclusiveNsRenderer; | ||
| serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) | ||
| } | ||
|
polaz marked this conversation as resolved.
|
||
| // C14N 1.1 has observable differences (xml:id propagation, xml:base fixup) | ||
| // that are not yet implemented. Fail explicitly rather than silently | ||
| // producing 1.0 output. | ||
| C14nMode::Inclusive1_1 => Err(C14nError::UnsupportedAlgorithm( | ||
| "C14N 1.1 is not yet implemented".to_string(), | ||
| )), | ||
| C14nMode::Exclusive1_0 => { | ||
| let renderer = ExclusiveNsRenderer::new(&algo.inclusive_prefixes); | ||
| serialize_canonical(doc, node_set, algo.with_comments, &renderer, output) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Convenience: parse XML bytes and canonicalize the whole document. | ||
| /// | ||
| /// Input must be valid UTF-8 (XML 1.0 documents are UTF-8 or declare their | ||
| /// encoding; roxmltree only accepts UTF-8). Returns `C14nError::Parse` for | ||
| /// invalid UTF-8 or malformed XML. | ||
| pub fn canonicalize_xml(xml: &[u8], algo: &C14nAlgorithm) -> Result<Vec<u8>, C14nError> { | ||
| let xml_str = | ||
| std::str::from_utf8(xml).map_err(|e| C14nError::Parse(format!("invalid UTF-8: {e}")))?; | ||
| let doc = Document::parse(xml_str).map_err(|e| C14nError::Parse(e.to_string()))?; | ||
| let mut output = Vec::new(); | ||
|
polaz marked this conversation as resolved.
|
||
| canonicalize(&doc, None, algo, &mut output)?; | ||
| Ok(output) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| #[allow(clippy::unwrap_used)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn from_uri_roundtrip() { | ||
| let uris = [ | ||
| "http://www.w3.org/TR/2001/REC-xml-c14n-20010315", | ||
| "http://www.w3.org/TR/2001/REC-xml-c14n-20010315#WithComments", | ||
| "http://www.w3.org/2006/12/xml-c14n11", | ||
| "http://www.w3.org/2006/12/xml-c14n11#WithComments", | ||
| "http://www.w3.org/2001/10/xml-exc-c14n#", | ||
| "http://www.w3.org/2001/10/xml-exc-c14n#WithComments", | ||
| ]; | ||
| for uri in uris { | ||
| let algo = C14nAlgorithm::from_uri(uri).expect(uri); | ||
| assert_eq!(algo.uri(), uri); | ||
|
polaz marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn unknown_uri_returns_none() { | ||
| assert!(C14nAlgorithm::from_uri("http://example.com/unknown").is_none()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn prefix_list_parsing() { | ||
| let algo = C14nAlgorithm::new(C14nMode::Exclusive1_0, false) | ||
| .with_prefix_list("foo bar #default baz"); | ||
| assert!(algo.inclusive_prefixes.contains("foo")); | ||
| assert!(algo.inclusive_prefixes.contains("bar")); | ||
| assert!(algo.inclusive_prefixes.contains("baz")); | ||
| assert!(algo.inclusive_prefixes.contains("")); // #default → "" | ||
| assert_eq!(algo.inclusive_prefixes.len(), 4); | ||
| } | ||
|
|
||
| #[test] | ||
| fn canonicalize_xml_basic() { | ||
| let xml = b"<root b=\"2\" a=\"1\"><empty/></root>"; | ||
| let algo = C14nAlgorithm::new(C14nMode::Inclusive1_0, false); | ||
| let result = canonicalize_xml(xml, &algo).expect("c14n"); | ||
| assert_eq!( | ||
| String::from_utf8(result).expect("utf8"), | ||
| r#"<root a="1" b="2"><empty></empty></root>"# | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn c14n_1_1_returns_error() { | ||
| let xml = b"<root/>"; | ||
| let algo = C14nAlgorithm::new(C14nMode::Inclusive1_1, false); | ||
| let result = canonicalize_xml(xml, &algo); | ||
| assert!(result.is_err()); | ||
| let err = result.unwrap_err(); | ||
| assert!( | ||
| matches!(err, C14nError::UnsupportedAlgorithm(_)), | ||
| "expected UnsupportedAlgorithm, got: {err:?}" | ||
| ); | ||
| } | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.