google · kojiishi · Nov 11, 2023 · Nov 10, 2023 · Nov 10, 2023 · Nov 11, 2023
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
@@ -29,6 +29,19 @@
   SKIP_NODES: typing.Set[str] = set(json.load(f))
 
 
+class ElementState(object):
+  """Parser state saved when an element is opened, for use at its end tag.
+
+  Attributes:
+    tag (str): The tag name as passed to `handle_starttag`.
+    to_skip (bool): The parser's `to_skip` value before the element opened.
+  """
+
+  def __init__(self, tag: str, to_skip: bool) -> None:
+    self.tag = tag
+    self.to_skip = to_skip
+
+
 class TextContentExtractor(HTMLParser):
   """An HTML parser to extract text content.
 
@@ -61,7 +74,7 @@ def __init__(self, chunks: typing.List[str], separator: str):
     self.separator = separator
     self.to_skip = False
     self.scan_index = 0
-    self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
+    self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue()
 
   def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
     attr_pairs = []
@@ -71,7 +84,7 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
       else:
         attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
     encoded_attrs = ''.join(attr_pairs)
-    self.element_stack.put(self.to_skip)
+    self.element_stack.put(ElementState(tag, self.to_skip))
     if tag.upper() in SKIP_NODES:
       if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
         self.scan_index += 1
@@ -81,7 +94,31 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
 
   def handle_endtag(self, tag: str) -> None:
+    """Emits the end tag and restores the skip state saved at its start tag.
+
+    Open elements are popped until one whose tag equals `tag` is found, so an
+    end tag also closes any still-open elements nested inside it, e.g.
+    `<span>abc<img>def</span>` or `<p>abc<span>def</p>`. An end tag that
+    matches no open element is ignored and the stack is left unchanged.
+
+    Args:
+      tag (str): The tag name of the end tag.
+    """
     self.output += '</%s>' % (tag)
-    self.to_skip = self.element_stack.get_nowait()
+    unmatched: typing.List[ElementState] = []
+    while not self.element_stack.empty():
+      state = self.element_stack.get_nowait()
+      if state.tag == tag:
+        self.to_skip = state.to_skip
+        return
+      unmatched.append(state)
+    # No open element matches this end tag (e.g. the second `</p>` in
+    # `<p>abc</p></p>` or a stray `</b>` inside `<nobr>`). Put the popped
+    # states back in their original order; otherwise the enclosing elements
+    # could no longer restore `to_skip` at their own end tags.
+    # Note the HTML "adoption agency algorithm" isn't fully supported.
+    # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
+    for state in reversed(unmatched):
+      self.element_stack.put(state)
 
   def handle_data(self, data: str) -> None:
     for char in data:

diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
@@ -46,6 +46,38 @@ def test_output(self) -> None:
     self.assertEqual(resolver.output, expected,
                      'WBR tags should be inserted as specified by chunks.')
 
+  def test_unpaired(self) -> None:
+    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '<wbr>')
+    # The trailing `</p>` has no start tag of its own.
+    resolver.feed('<p>abcdef</p></p>')
+    expected = '<p>abc<wbr>def</p></p>'
+    self.assertEqual(resolver.output, expected,
+                     'Unpaired close tag should not cause errors.')
+
+  def test_nobr(self) -> None:
+    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '<wbr>')
+    # The 'abc'/'def' chunk boundary lies inside the `<nobr>` element.
+    resolver.feed('<p>ab<nobr>cde</nobr>f</p>')
+    expected = '<p>ab<nobr>cde</nobr>f</p>'
+    self.assertEqual(resolver.output, expected,
+                     'WBR tags should not be inserted if in NOBR.')
+
+  def test_after_nobr(self) -> None:
+    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '<wbr>')
+    # The chunk boundary lies in the text that follows `</nobr>`.
+    resolver.feed('<p>ab<nobr>xy</nobr>abcdef</p>')
+    expected = '<p>ab<nobr>xy</nobr>abc<wbr>def</p>'
+    self.assertEqual(resolver.output, expected,
+                     'WBR tags should be inserted if after NOBR.')
+
+  def test_img_in_nobr(self) -> None:
+    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '<wbr>')
+    # `<img>` has no end tag here; `</nobr>` closes past it.
+    resolver.feed('<p>ab<nobr>x<img>y</nobr>abcdef</p>')
+    expected = '<p>ab<nobr>x<img>y</nobr>abc<wbr>def</p>'
+    self.assertEqual(resolver.output, expected,
+                     'IMG should not affect surrounding NOBR.')
+
 
 class TestResolve(unittest.TestCase):