From d87df0088de77790f9d209f03e18207922893b34 Mon Sep 17 00:00:00 2001 From: Fazal Majid Date: Thu, 19 Feb 2026 18:43:48 +0000 Subject: [PATCH 1/2] fix: handle GUIDs with XML entities in them like feedparser-py --- .../feedparser-rs-core/src/parser/common.rs | 121 +++++++++++++++++- .../tests/test_guid_entities.py | 73 +++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 crates/feedparser-rs-py/tests/test_guid_entities.py diff --git a/crates/feedparser-rs-core/src/parser/common.rs b/crates/feedparser-rs-core/src/parser/common.rs index 6f06ee1..4f9f975 100644 --- a/crates/feedparser-rs-core/src/parser/common.rs +++ b/crates/feedparser-rs-core/src/parser/common.rs @@ -8,7 +8,10 @@ use crate::{ error::{FeedError, Result}, types::{FeedVersion, ParsedFeed}, }; -use quick_xml::{Reader, events::Event}; +use quick_xml::{ + Reader, + events::{BytesRef, Event}, +}; pub use crate::types::{FromAttributes, LimitedCollectionExt}; pub use crate::util::text::bytes_to_string; @@ -370,6 +373,10 @@ pub fn read_text( Ok(Event::CData(e)) => { append_bytes(&mut text, e.as_ref(), limits.max_text_length)?; } + Ok(Event::GeneralRef(e)) => { + let resolved = resolve_entity(&e)?; + append_bytes(&mut text, resolved.as_bytes(), limits.max_text_length)?; + } Ok(Event::End(_) | Event::Eof) => break, Err(e) => return Err(e.into()), _ => {} @@ -380,6 +387,30 @@ pub fn read_text( Ok(text) } +/// Resolve a general entity reference (character or named) to a string. +fn resolve_entity(e: &BytesRef<'_>) -> Result { + // Try numeric character references first: & & etc. + if let Some(ch) = e + .resolve_char_ref() + .map_err(|err| FeedError::InvalidFormat(format!("Invalid character reference: {err}")))? + { + return Ok(ch.to_string()); + } + // Predefined XML named entity references + match e.as_ref() { + b"amp" => Ok("&".to_string()), + b"lt" => Ok("<".to_string()), + b"gt" => Ok(">".to_string()), + b"quot" => Ok("\"".to_string()), + b"apos" => Ok("'".to_string()), + other => { + // Unknown entity — preserve as-is + let name = std::str::from_utf8(other).unwrap_or("?"); + Ok(format!("&{name};")) + } + } +} + #[inline] fn append_bytes(text: &mut String, bytes: &[u8], max_len: usize) -> Result<()> { if text.len() + bytes.len() > max_len { @@ -510,6 +541,94 @@ mod tests { assert!(result.is_err()); } + #[test] + fn test_read_text_numeric_char_ref() { + let xml = b"https://example.com/?post_type=webcomic1&p=3172"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let limits = ParserLimits::default(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + + let text = read_text(&mut reader, &mut buf, &limits).unwrap(); + assert_eq!(text, "https://example.com/?post_type=webcomic1&p=3172"); + } + + #[test] + fn test_read_text_amp_entity() { + let xml = b"https://example.com/?a=1&b=2"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let limits = ParserLimits::default(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + + let text = read_text(&mut reader, &mut buf, &limits).unwrap(); + assert_eq!(text, "https://example.com/?a=1&b=2"); + } + + #[test] + fn test_read_text_hex_char_ref() { + let xml = b"https://example.com/?a=1&b=2"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let limits = ParserLimits::default(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + + let text = read_text(&mut reader, &mut buf, &limits).unwrap(); + assert_eq!(text, "https://example.com/?a=1&b=2"); + } + + #[test] + fn test_read_text_multiple_entities() { + let xml = b"https://example.com/?a=1&b=2&c=3"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let limits = ParserLimits::default(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + + let text = read_text(&mut reader, &mut buf, &limits).unwrap(); + assert_eq!(text, "https://example.com/?a=1&b=2&c=3"); + } + #[test] fn test_skip_element_basic() { let xml = b"content"; diff --git a/crates/feedparser-rs-py/tests/test_guid_entities.py b/crates/feedparser-rs-py/tests/test_guid_entities.py new file mode 100644 index 0000000..d14a02b --- /dev/null +++ b/crates/feedparser-rs-py/tests/test_guid_entities.py @@ -0,0 +1,73 @@ +"""Test XML entity decoding in guid elements. + +Regression test for a bug where quick-xml 0.39's Event::GeneralRef events +(emitted for XML entity references like &) were silently dropped, +causing decoded characters to vanish from entry.id / entry.guid. +""" + +import feedparser_rs + + +def test_guid_with_numeric_character_reference(): + """Test that & in guid decodes to & (matching Python feedparser).""" + xml = b""" + + + + https://sidequested.com/?post_type=webcomic1&p=3172 + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "https://sidequested.com/?post_type=webcomic1&p=3172" + assert d.entries[0].guid == d.entries[0].id + + +def test_guid_with_amp_entity(): + """Test that & in guid decodes to &.""" + xml = b""" + + + + https://example.com/?a=1&b=2 + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "https://example.com/?a=1&b=2" + + +def test_guid_with_hex_character_reference(): + """Test that & (hex for &) in guid decodes correctly.""" + xml = b""" + + + + https://example.com/?a=1&b=2 + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "https://example.com/?a=1&b=2" + + +def test_multiple_entities_in_guid(): + """Test guid with multiple entity references.""" + xml = b""" + + + + https://example.com/?a=1&b=2&c=3 + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "https://example.com/?a=1&b=2&c=3" From c5fcb542b8bdaa1769fbe597ed5776f6dfefc782 Mon Sep 17 00:00:00 2001 From: Fazal Majid Date: Fri, 20 Feb 2026 12:19:31 +0000 Subject: [PATCH 2/2] implemented Copilot recommendations --- .../feedparser-rs-core/src/parser/common.rs | 138 +++++++++++++++--- .../tests/test_guid_entities.py | 80 ++++++++++ 2 files changed, 201 insertions(+), 17 deletions(-) diff --git a/crates/feedparser-rs-core/src/parser/common.rs b/crates/feedparser-rs-core/src/parser/common.rs index 4f9f975..b422f88 100644 --- a/crates/feedparser-rs-core/src/parser/common.rs +++ b/crates/feedparser-rs-core/src/parser/common.rs @@ -374,7 +374,7 @@ pub fn read_text( append_bytes(&mut text, e.as_ref(), limits.max_text_length)?; } Ok(Event::GeneralRef(e)) => { - let resolved = resolve_entity(&e)?; + let resolved = resolve_entity(&e); append_bytes(&mut text, resolved.as_bytes(), limits.max_text_length)?; } Ok(Event::End(_) | Event::Eof) => break, @@ -387,26 +387,33 @@ pub fn read_text( Ok(text) } -/// Resolve a general entity reference (character or named) to a string. -fn resolve_entity(e: &BytesRef<'_>) -> Result { +/// Resolve a general entity reference (numeric or named) to a string. +/// Preserves invalid entities as-is but has no way to set the bozo flag +/// if that occurs +/// Also, feedparser-py returns all entities unresolved if a single one +/// fails, whereas this function cannot know if another call raised bozo +fn resolve_entity(e: &BytesRef<'_>) -> String { // Try numeric character references first: & & etc. - if let Some(ch) = e - .resolve_char_ref() - .map_err(|err| FeedError::InvalidFormat(format!("Invalid character reference: {err}")))? - { - return Ok(ch.to_string()); + match e.resolve_char_ref() { + Ok(Some(ch)) => return ch.to_string(), + Ok(None) => {} // Not a numeric reference; fall through to named entities. + Err(_) => { + // Invalid character reference — preserve as-is (bozo tolerance). + let name = String::from_utf8_lossy(e.as_ref()); + return format!("&{name};"); + } } - // Predefined XML named entity references + // These are the only 5 allowed XML named entities match e.as_ref() { - b"amp" => Ok("&".to_string()), - b"lt" => Ok("<".to_string()), - b"gt" => Ok(">".to_string()), - b"quot" => Ok("\"".to_string()), - b"apos" => Ok("'".to_string()), + b"amp" => "&".to_string(), + b"lt" => "<".to_string(), + b"gt" => ">".to_string(), + b"quot" => "\"".to_string(), + b"apos" => "'".to_string(), other => { - // Unknown entity — preserve as-is - let name = std::str::from_utf8(other).unwrap_or("?"); - Ok(format!("&{name};")) + // Unknown entity — preserve as-is (bozo tolerance). + let name = String::from_utf8_lossy(other).into_owned(); + format!("&{name};") } } } @@ -629,6 +636,103 @@ mod tests { assert_eq!(text, "https://example.com/?a=1&b=2&c=3"); } + #[test] + fn test_read_text_unknown_entity_preserved() { + // Unknown entities should be kept verbatim, not cause errors (bozo pattern). + let xml = b"https://example.com/?a=1&customEntity;b=2"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let limits = ParserLimits::default(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + + let text = read_text(&mut reader, &mut buf, &limits).unwrap(); + assert_eq!(text, "https://example.com/?a=1&customEntity;b=2"); + } + + #[test] + fn test_read_text_mixed_valid_and_unknown_entities() { + // Mix of standard and unknown entities — all should resolve without error. + // trim_text(true) strips leading/trailing whitespace from each text chunk, + // so spaces adjacent to entity refs are dropped; test reflects that reality. + let xml = b"AT&T&unknown;rocks"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + let limits = ParserLimits::default(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + + let text = read_text(&mut reader, &mut buf, &limits).unwrap(); + assert_eq!(text, "AT&T&unknown;rocks"); + } + + /// Advance `reader` past the first Start event and return a fresh reader ready for `read_text`. + fn advance_past_start(reader: &mut Reader<&[u8]>, buf: &mut Vec) { + loop { + match reader.read_event_into(buf) { + Ok(Event::Start(_)) => break, + Ok(Event::Eof) => panic!("Unexpected EOF"), + _ => {} + } + buf.clear(); + } + buf.clear(); + } + + #[test] + fn test_read_text_malformed_hex_char_ref() { + // &#x; (no hex digits after x) must be preserved verbatim, not cause an error. + let xml = b"pre&#x;suf"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + advance_past_start(&mut reader, &mut buf); + let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap(); + assert_eq!(text, "pre&#x;suf"); + } + + #[test] + fn test_read_text_malformed_decimal_char_ref() { + // &#; (no digits at all) must be preserved verbatim, not cause an error. + let xml = b"pre&#;suf"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + advance_past_start(&mut reader, &mut buf); + let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap(); + assert_eq!(text, "pre&#;suf"); + } + + #[test] + fn test_read_text_empty_entity_name() { + // &; (empty entity name) must be preserved verbatim, not cause an error. + let xml = b"pre&;suf"; + let mut reader = Reader::from_reader(&xml[..]); + reader.config_mut().trim_text(true); + let mut buf = Vec::new(); + advance_past_start(&mut reader, &mut buf); + let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap(); + assert_eq!(text, "pre&;suf"); + } + #[test] fn test_skip_element_basic() { let xml = b"content"; diff --git a/crates/feedparser-rs-py/tests/test_guid_entities.py b/crates/feedparser-rs-py/tests/test_guid_entities.py index d14a02b..ed49324 100644 --- a/crates/feedparser-rs-py/tests/test_guid_entities.py +++ b/crates/feedparser-rs-py/tests/test_guid_entities.py @@ -71,3 +71,83 @@ def test_multiple_entities_in_guid(): d = feedparser_rs.parse(xml) assert d.entries[0].id == "https://example.com/?a=1&b=2&c=3" + + +def test_guid_with_unknown_entity_preserved(): + """Unknown entities are preserved verbatim rather than causing a parse error (bozo pattern).""" + xml = b""" + + + + https://example.com/?a=1&customEntity;b=2 + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "https://example.com/?a=1&customEntity;b=2" + + +def test_guid_with_mixed_valid_and_unknown_entities(): + """Mix of standard and unknown entities — parsing succeeds and both are handled.""" + xml = b""" + + + + AT&T&unknown; + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "AT&T&unknown;" + + +def test_guid_with_malformed_hex_char_ref(): + """&#x; (hex reference with no digits) is preserved verbatim (bozo pattern).""" + xml = b""" + + + + pre&#x;suf + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "pre&#x;suf" + + +def test_guid_with_malformed_decimal_char_ref(): + """&#; (decimal reference with no digits) is preserved verbatim (bozo pattern).""" + xml = b""" + + + + pre&#;suf + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "pre&#;suf" + + +def test_guid_with_empty_entity_name(): + """&; (entity with empty name) is preserved verbatim (bozo pattern).""" + xml = b""" + + + + pre&;suf + + + """ + + d = feedparser_rs.parse(xml) + + assert d.entries[0].id == "pre&;suf"