Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 224 additions & 1 deletion crates/feedparser-rs-core/src/parser/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ use crate::{
error::{FeedError, Result},
types::{FeedVersion, ParsedFeed},
};
use quick_xml::{Reader, events::Event};
use quick_xml::{
Reader,
events::{BytesRef, Event},
};

pub use crate::types::{FromAttributes, LimitedCollectionExt};
pub use crate::util::text::bytes_to_string;
Expand Down Expand Up @@ -370,6 +373,10 @@ pub fn read_text(
Ok(Event::CData(e)) => {
append_bytes(&mut text, e.as_ref(), limits.max_text_length)?;
}
Ok(Event::GeneralRef(e)) => {
let resolved = resolve_entity(&e);
append_bytes(&mut text, resolved.as_bytes(), limits.max_text_length)?;
}
Comment on lines +376 to +379
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation violates the bozo pattern that is fundamental to this project. According to the coding guidelines (CodingGuidelineID: 1000002), the parser must NEVER panic or return errors for malformed feeds. Instead, it should set the bozo flag and continue parsing.

When resolve_entity returns an error (e.g., for an invalid character reference), this will propagate up as a Result error, causing the entire parsing operation to fail or set bozo at a higher level. However, the bozo pattern requires continuing to parse and extract as much data as possible even when encountering entity resolution errors.

The fix should:

  1. Change resolve_entity to return String instead of Result<String>
  2. When an invalid character reference is encountered, preserve it as-is (e.g., "&#xFFFF;") rather than returning an error
  3. Optionally, the caller (read_text) could detect when entity resolution fails and set bozo on the feed object if that context is available

This is particularly important because real-world feeds may contain invalid character references, and Python feedparser would handle these gracefully. For API compatibility (the #1 priority), we must match that behavior.

Copilot generated this review using guidance from repository custom instructions.
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree

Ok(Event::End(_) | Event::Eof) => break,
Err(e) => return Err(e.into()),
_ => {}
Expand All @@ -380,6 +387,37 @@ pub fn read_text(
Ok(text)
}

/// Resolve a general entity reference (numeric or named) to a string.
/// Preserves invalid entities as-is but has no way to set the bozo flag
/// if that occurs
/// Also, feedparser-py returns all entities unresolved if a single one
/// fails, whereas this function cannot know if another call raised bozo
fn resolve_entity(e: &BytesRef<'_>) -> String {
// Try numeric character references first: &#038; &#x26; etc.
match e.resolve_char_ref() {
Ok(Some(ch)) => return ch.to_string(),
Ok(None) => {} // Not a numeric reference; fall through to named entities.
Err(_) => {
// Invalid character reference — preserve as-is (bozo tolerance).
let name = String::from_utf8_lossy(e.as_ref());
return format!("&{name};");
}
}
// These are the only 5 allowed XML named entities
match e.as_ref() {
b"amp" => "&".to_string(),
b"lt" => "<".to_string(),
b"gt" => ">".to_string(),
b"quot" => "\"".to_string(),
b"apos" => "'".to_string(),
other => {
// Unknown entity — preserve as-is (bozo tolerance).
let name = String::from_utf8_lossy(other).into_owned();
format!("&{name};")
}
}
}

#[inline]
fn append_bytes(text: &mut String, bytes: &[u8], max_len: usize) -> Result<()> {
if text.len() + bytes.len() > max_len {
Expand Down Expand Up @@ -510,6 +548,191 @@ mod tests {
assert!(result.is_err());
}

#[test]
fn test_read_text_numeric_char_ref() {
let xml = b"<guid>https://example.com/?post_type=webcomic1&#038;p=3172</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();

let text = read_text(&mut reader, &mut buf, &limits).unwrap();
assert_eq!(text, "https://example.com/?post_type=webcomic1&p=3172");
}

#[test]
fn test_read_text_amp_entity() {
let xml = b"<guid>https://example.com/?a=1&amp;b=2</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();

let text = read_text(&mut reader, &mut buf, &limits).unwrap();
assert_eq!(text, "https://example.com/?a=1&b=2");
}

#[test]
fn test_read_text_hex_char_ref() {
let xml = b"<guid>https://example.com/?a=1&#x26;b=2</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();

let text = read_text(&mut reader, &mut buf, &limits).unwrap();
assert_eq!(text, "https://example.com/?a=1&b=2");
}

#[test]
fn test_read_text_multiple_entities() {
let xml = b"<guid>https://example.com/?a=1&amp;b=2&amp;c=3</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();

let text = read_text(&mut reader, &mut buf, &limits).unwrap();
assert_eq!(text, "https://example.com/?a=1&b=2&c=3");
}

#[test]
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing test coverage for edge cases in entity handling. According to the testing guidelines (CodingGuidelineID: 1000004), tests should cover both success and failure cases, including edge cases.

Recommended additional tests:

  1. Invalid character reference (e.g., &amp;#xFFFF; or &amp;#999999;) - should set bozo but continue parsing
  2. Unknown/custom entity references (e.g., &amp;customEntity;) - should preserve as-is
  3. Malformed entity syntax (e.g., &amp;#x; or &amp;) - should handle gracefully
  4. Mix of valid and invalid entities in same text

These tests are particularly important for verifying the bozo pattern works correctly when entity resolution encounters errors.

Suggested change
#[test]
#[test]
fn test_read_text_invalid_numeric_char_ref() {
let xml = b"<guid>invalid &#999999; ref</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();
let result = read_text(&mut reader, &mut buf, &limits);
assert!(result.is_err(), "Invalid numeric character reference should result in an error");
}
#[test]
fn test_read_text_unknown_named_entity() {
let xml = b"<guid>value &customEntity; here</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();
let result = read_text(&mut reader, &mut buf, &limits);
assert!(result.is_err(), "Unknown/custom entity should be surfaced as an error at this level");
}
#[test]
fn test_read_text_malformed_entity_syntax() {
// Malformed entity: missing code after &#x and bare '&'
let xml = b"<guid>value &#x; and &</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();
let result = read_text(&mut reader, &mut buf, &limits);
assert!(result.is_err(), "Malformed entity syntax should not cause a panic and must be reported as an error");
}
#[test]
fn test_read_text_mixed_valid_and_invalid_entities() {
let xml = b"<guid>ok &amp; bad &#999999; mix</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();
let result = read_text(&mut reader, &mut buf, &limits);
assert!(result.is_err(), "Mixed valid and invalid entities should be handled without panicking");
}
#[test]

Copilot uses AI. Check for mistakes.
fn test_read_text_unknown_entity_preserved() {
// Unknown entities should be kept verbatim, not cause errors (bozo pattern).
let xml = b"<guid>https://example.com/?a=1&customEntity;b=2</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();

let text = read_text(&mut reader, &mut buf, &limits).unwrap();
assert_eq!(text, "https://example.com/?a=1&customEntity;b=2");
}

#[test]
fn test_read_text_mixed_valid_and_unknown_entities() {
// Mix of standard and unknown entities — all should resolve without error.
// trim_text(true) strips leading/trailing whitespace from each text chunk,
// so spaces adjacent to entity refs are dropped; test reflects that reality.
let xml = b"<title>AT&amp;T&unknown;rocks</title>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let limits = ParserLimits::default();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();

let text = read_text(&mut reader, &mut buf, &limits).unwrap();
assert_eq!(text, "AT&T&unknown;rocks");
}

/// Advance `reader` past the first Start event and return a fresh reader ready for `read_text`.
fn advance_past_start(reader: &mut Reader<&[u8]>, buf: &mut Vec<u8>) {
loop {
match reader.read_event_into(buf) {
Ok(Event::Start(_)) => break,
Ok(Event::Eof) => panic!("Unexpected EOF"),
_ => {}
}
buf.clear();
}
buf.clear();
}

#[test]
fn test_read_text_malformed_hex_char_ref() {
// &#x; (no hex digits after x) must be preserved verbatim, not cause an error.
let xml = b"<guid>pre&#x;suf</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
advance_past_start(&mut reader, &mut buf);
let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap();
assert_eq!(text, "pre&#x;suf");
}

#[test]
fn test_read_text_malformed_decimal_char_ref() {
// &#; (no digits at all) must be preserved verbatim, not cause an error.
let xml = b"<guid>pre&#;suf</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
advance_past_start(&mut reader, &mut buf);
let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap();
assert_eq!(text, "pre&#;suf");
}

#[test]
fn test_read_text_empty_entity_name() {
// &; (empty entity name) must be preserved verbatim, not cause an error.
let xml = b"<guid>pre&;suf</guid>";
let mut reader = Reader::from_reader(&xml[..]);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
advance_past_start(&mut reader, &mut buf);
let text = read_text(&mut reader, &mut buf, &ParserLimits::default()).unwrap();
assert_eq!(text, "pre&;suf");
}

#[test]
fn test_skip_element_basic() {
let xml = b"<parent><child>content</child></parent>";
Expand Down
Loading
Loading