diff --git a/crates/feedparser-rs-core/src/parser/common.rs b/crates/feedparser-rs-core/src/parser/common.rs
index 6f06ee1..b422f88 100644
--- a/crates/feedparser-rs-core/src/parser/common.rs
+++ b/crates/feedparser-rs-core/src/parser/common.rs
@@ -8,7 +8,10 @@ use crate::{
error::{FeedError, Result},
types::{FeedVersion, ParsedFeed},
};
-use quick_xml::{Reader, events::Event};
+use quick_xml::{
+ Reader,
+ events::{BytesRef, Event},
+};
pub use crate::types::{FromAttributes, LimitedCollectionExt};
pub use crate::util::text::bytes_to_string;
@@ -370,6 +373,10 @@ pub fn read_text(
Ok(Event::CData(e)) => {
append_bytes(&mut text, e.as_ref(), limits.max_text_length)?;
}
+ Ok(Event::GeneralRef(e)) => {
+ let resolved = resolve_entity(&e);
+ append_bytes(&mut text, resolved.as_bytes(), limits.max_text_length)?;
+ }
Ok(Event::End(_) | Event::Eof) => break,
Err(e) => return Err(e.into()),
_ => {}
@@ -380,6 +387,37 @@ pub fn read_text(
Ok(text)
}
+/// Resolve a general entity reference (numeric or named) to a string.
+/// Preserves invalid entities as-is but has no way to set the bozo flag
+/// if that occurs
+/// Also, feedparser-py returns all entities unresolved if a single one
+/// fails, whereas this function cannot know if another call raised bozo
+fn resolve_entity(e: &BytesRef<'_>) -> String {
+ // Try numeric character references first: & & etc.
+ match e.resolve_char_ref() {
+ Ok(Some(ch)) => return ch.to_string(),
+ Ok(None) => {} // Not a numeric reference; fall through to named entities.
+ Err(_) => {
+ // Invalid character reference — preserve as-is (bozo tolerance).
+ let name = String::from_utf8_lossy(e.as_ref());
+ return format!("&{name};");
+ }
+ }
+ // These are the only 5 allowed XML named entities
+ match e.as_ref() {
+ b"amp" => "&".to_string(),
+ b"lt" => "<".to_string(),
+ b"gt" => ">".to_string(),
+ b"quot" => "\"".to_string(),
+ b"apos" => "'".to_string(),
+ other => {
+ // Unknown entity — preserve as-is (bozo tolerance).
+ let name = String::from_utf8_lossy(other).into_owned();
+ format!("&{name};")
+ }
+ }
+}
+
#[inline]
fn append_bytes(text: &mut String, bytes: &[u8], max_len: usize) -> Result<()> {
if text.len() + bytes.len() > max_len {
@@ -510,6 +548,191 @@ mod tests {
assert!(result.is_err());
}
+ #[test]
+ fn test_read_text_numeric_char_ref() {
+ let xml = b"https://example.com/?post_type=webcomic1&p=3172";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ let limits = ParserLimits::default();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+
+ let text = read_text(&mut reader, &mut buf, &limits).unwrap();
+ assert_eq!(text, "https://example.com/?post_type=webcomic1&p=3172");
+ }
+
+ #[test]
+ fn test_read_text_amp_entity() {
+ let xml = b"https://example.com/?a=1&b=2";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ let limits = ParserLimits::default();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+
+ let text = read_text(&mut reader, &mut buf, &limits).unwrap();
+ assert_eq!(text, "https://example.com/?a=1&b=2");
+ }
+
+ #[test]
+ fn test_read_text_hex_char_ref() {
+ let xml = b"https://example.com/?a=1&b=2";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ let limits = ParserLimits::default();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+
+ let text = read_text(&mut reader, &mut buf, &limits).unwrap();
+ assert_eq!(text, "https://example.com/?a=1&b=2");
+ }
+
+ #[test]
+ fn test_read_text_multiple_entities() {
+ let xml = b"https://example.com/?a=1&b=2&c=3";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ let limits = ParserLimits::default();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+
+ let text = read_text(&mut reader, &mut buf, &limits).unwrap();
+ assert_eq!(text, "https://example.com/?a=1&b=2&c=3");
+ }
+
+ #[test]
+ fn test_read_text_unknown_entity_preserved() {
+ // Unknown entities should be kept verbatim, not cause errors (bozo pattern).
+ let xml = b"https://example.com/?a=1&customEntity;b=2";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ let limits = ParserLimits::default();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+
+ let text = read_text(&mut reader, &mut buf, &limits).unwrap();
+ assert_eq!(text, "https://example.com/?a=1&customEntity;b=2");
+ }
+
+ #[test]
+ fn test_read_text_mixed_valid_and_unknown_entities() {
+ // Mix of standard and unknown entities — all should resolve without error.
+ // trim_text(true) strips leading/trailing whitespace from each text chunk,
+ // so spaces adjacent to entity refs are dropped; test reflects that reality.
+ let xml = b"
AT&T&unknown;rocks";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ let limits = ParserLimits::default();
+
+ loop {
+ match reader.read_event_into(&mut buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+
+ let text = read_text(&mut reader, &mut buf, &limits).unwrap();
+ assert_eq!(text, "AT&T&unknown;rocks");
+ }
+
+ /// Advance `reader` past the first Start event and return a fresh reader ready for `read_text`.
+ fn advance_past_start(reader: &mut Reader<&[u8]>, buf: &mut Vec) {
+ loop {
+ match reader.read_event_into(buf) {
+ Ok(Event::Start(_)) => break,
+ Ok(Event::Eof) => panic!("Unexpected EOF"),
+ _ => {}
+ }
+ buf.clear();
+ }
+ buf.clear();
+ }
+
+ #[test]
+ fn test_read_text_malformed_hex_char_ref() {
+ // (no hex digits after x) must be preserved verbatim, not cause an error.
+ let xml = b"presuf";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ advance_past_start(&mut reader, &mut buf);
+ let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap();
+ assert_eq!(text, "presuf");
+ }
+
+ #[test]
+ fn test_read_text_malformed_decimal_char_ref() {
+ // (no digits at all) must be preserved verbatim, not cause an error.
+ let xml = b"presuf";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ advance_past_start(&mut reader, &mut buf);
+ let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap();
+ assert_eq!(text, "presuf");
+ }
+
+ #[test]
+ fn test_read_text_empty_entity_name() {
+ // &; (empty entity name) must be preserved verbatim, not cause an error.
+ let xml = b"pre&;suf";
+ let mut reader = Reader::from_reader(&xml[..]);
+ reader.config_mut().trim_text(true);
+ let mut buf = Vec::new();
+ advance_past_start(&mut reader, &mut buf);
+ let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap();
+ assert_eq!(text, "pre&;suf");
+ }
+
#[test]
fn test_skip_element_basic() {
let xml = b"content";
diff --git a/crates/feedparser-rs-py/tests/test_guid_entities.py b/crates/feedparser-rs-py/tests/test_guid_entities.py
new file mode 100644
index 0000000..ed49324
--- /dev/null
+++ b/crates/feedparser-rs-py/tests/test_guid_entities.py
@@ -0,0 +1,153 @@
+"""Test XML entity decoding in guid elements.
+
+Regression test for a bug where quick-xml 0.39's Event::GeneralRef events
+(emitted for XML entity references like &) were silently dropped,
+causing decoded characters to vanish from entry.id / entry.guid.
+"""
+
+import feedparser_rs
+
+
+def test_guid_with_numeric_character_reference():
+ """Test that & in guid decodes to & (matching Python feedparser)."""
+ xml = b"""
+
+
+ -
+ https://sidequested.com/?post_type=webcomic1&p=3172
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "https://sidequested.com/?post_type=webcomic1&p=3172"
+ assert d.entries[0].guid == d.entries[0].id
+
+
+def test_guid_with_amp_entity():
+ """Test that & in guid decodes to &."""
+ xml = b"""
+
+
+ -
+ https://example.com/?a=1&b=2
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "https://example.com/?a=1&b=2"
+
+
+def test_guid_with_hex_character_reference():
+ """Test that & (hex for &) in guid decodes correctly."""
+ xml = b"""
+
+
+ -
+ https://example.com/?a=1&b=2
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "https://example.com/?a=1&b=2"
+
+
+def test_multiple_entities_in_guid():
+ """Test guid with multiple entity references."""
+ xml = b"""
+
+
+ -
+ https://example.com/?a=1&b=2&c=3
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "https://example.com/?a=1&b=2&c=3"
+
+
+def test_guid_with_unknown_entity_preserved():
+ """Unknown entities are preserved verbatim rather than causing a parse error (bozo pattern)."""
+ xml = b"""
+
+
+ -
+ https://example.com/?a=1&customEntity;b=2
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "https://example.com/?a=1&customEntity;b=2"
+
+
+def test_guid_with_mixed_valid_and_unknown_entities():
+ """Mix of standard and unknown entities — parsing succeeds and both are handled."""
+ xml = b"""
+
+
+ -
+ AT&T&unknown;
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "AT&T&unknown;"
+
+
+def test_guid_with_malformed_hex_char_ref():
+ """ (hex reference with no digits) is preserved verbatim (bozo pattern)."""
+ xml = b"""
+
+
+ -
+ presuf
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "presuf"
+
+
+def test_guid_with_malformed_decimal_char_ref():
+ """ (decimal reference with no digits) is preserved verbatim (bozo pattern)."""
+ xml = b"""
+
+
+ -
+ presuf
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "presuf"
+
+
+def test_guid_with_empty_entity_name():
+ """&; (entity with empty name) is preserved verbatim (bozo pattern)."""
+ xml = b"""
+
+
+ -
+ pre&;suf
+
+
+ """
+
+ d = feedparser_rs.parse(xml)
+
+ assert d.entries[0].id == "pre&;suf"