From 0df2fb26fb6a9d4555582abfde556226d5a3292c Mon Sep 17 00:00:00 2001
From: Koji Ishii
Date: Fri, 10 Nov 2023 20:43:41 +0900
Subject: [PATCH 1/3] Fix unpaired close tags and self-closing tags
https://github.com/google/budoux/pull/251 assumed that all tags are closed properly.
This assumption doesn't hold in cases like:
1. Self-closing tags such as `<br>` don't have corresponding close tags.
2. Unpaired close tags are still valid HTML.
This patch supports these cases by assuming that all open tags that don't
nest correctly or that don't close are automatically closed.
This isn't the full HTML "adoption agency algorithm", but it should be
good enough for the needs of BudouX.
Fixes #355
---
budoux/html_processor.py | 28 +++++++++++++++++++++++++---
tests/test_html_processor.py | 32 ++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
index cd3bebf6..3c7b7f70 100644
--- a/budoux/html_processor.py
+++ b/budoux/html_processor.py
@@ -29,6 +29,18 @@
SKIP_NODES: typing.Set[str] = set(json.load(f))
+class ElementState(object):
+ """Represents the state for an element.
+
+ Attributes:
+ tag (str): The tag name.
+ to_skip (bool): Whether the content should be skipped or not.
+ """
+ def __init__(self, tag: str, to_skip: bool) -> None:
+ self.tag = tag
+ self.to_skip = to_skip
+
+
class TextContentExtractor(HTMLParser):
"""An HTML parser to extract text content.
@@ -61,7 +73,7 @@ def __init__(self, chunks: typing.List[str], separator: str):
self.separator = separator
self.to_skip = False
self.scan_index = 0
- self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
+ self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue()
def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
attr_pairs = []
@@ -71,7 +83,7 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
else:
attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
encoded_attrs = ''.join(attr_pairs)
- self.element_stack.put(self.to_skip)
+ self.element_stack.put(ElementState(tag, self.to_skip))
if tag.upper() in SKIP_NODES:
if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
self.scan_index += 1
@@ -81,7 +93,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
def handle_endtag(self, tag: str) -> None:
self.output += '%s>' % (tag)
- self.to_skip = self.element_stack.get_nowait()
+ while not self.element_stack.empty():
+ state = self.element_stack.get_nowait()
+ if state.tag == tag:
+ self.to_skip = state.to_skip
+ break
+ # If the close tag doesn't match the open tag, remove it and keep looking.
+ # This means that close tags close their corresponding open tags.
+ # e.g., `<span>abc<br>def</span>` or `<p>abc<i>def</p>` are both valid
+ # HTML as per the HTML spec.
+ # Note the HTML "adoption agency algorithm" isn't fully supported.
+ # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
def handle_data(self, data: str) -> None:
for char in data:
diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
index cb7c6134..e52219ab 100644
--- a/tests/test_html_processor.py
+++ b/tests/test_html_processor.py
@@ -46,6 +46,38 @@ def test_output(self) -> None:
self.assertEqual(resolver.output, expected,
'WBR tags should be inserted as specified by chunks.')
+ def test_unpaired(self) -> None:
+ input = 'abcdef
'
+ expected = 'abcdef
'
+ resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '')
+ resolver.feed(input)
+ self.assertEqual(resolver.output, expected,
+ 'Unpaired close tag should not cause errors.')
+
+ def test_nobr(self) -> None:
+ input = 'abcdef
'
+ expected = 'abcdef
'
+ resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '')
+ resolver.feed(input)
+ self.assertEqual(resolver.output, expected,
+ 'WBR tags should not be inserted if in NOBR.')
+
+ def test_after_nobr(self) -> None:
+ input = 'abxyabcdef
'
+ expected = 'abxyabcdef
'
+ resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '')
+ resolver.feed(input)
+ self.assertEqual(resolver.output, expected,
+ 'WBR tags should be inserted if after NOBR.')
+
+ def test_img_in_nobr(self) -> None:
+ input = 'abx
yabcdef
'
+ expected = 'abx
yabcdef
'
+ resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '')
+ resolver.feed(input)
+ self.assertEqual(resolver.output, expected,
+ 'WBR tags should not be inserted if NOBR.')
+
class TestResolve(unittest.TestCase):
From 448a2679acd03840bf8d1ce23c1852cea097de82 Mon Sep 17 00:00:00 2001
From: Koji Ishii
Date: Fri, 10 Nov 2023 20:56:29 +0900
Subject: [PATCH 2/3] yapf
---
budoux/html_processor.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
index 3c7b7f70..589459a8 100644
--- a/budoux/html_processor.py
+++ b/budoux/html_processor.py
@@ -36,6 +36,7 @@ class ElementState(object):
tag (str): The tag name.
to_skip (bool): Whether the content should be skipped or not.
"""
+
def __init__(self, tag: str, to_skip: bool) -> None:
self.tag = tag
self.to_skip = to_skip
From a563f8556c10724cc761c1a0e312bbdf833e33c5 Mon Sep 17 00:00:00 2001
From: Koji Ishii
Date: Sat, 11 Nov 2023 14:28:32 +0900
Subject: [PATCH 3/3] Update message in `test_img_in_nobr`
---
tests/test_html_processor.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
index e52219ab..6334ae9f 100644
--- a/tests/test_html_processor.py
+++ b/tests/test_html_processor.py
@@ -76,7 +76,7 @@ def test_img_in_nobr(self) -> None:
resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '')
resolver.feed(input)
self.assertEqual(resolver.output, expected,
- 'WBR tags should not be inserted if NOBR.')
+ 'IMG should not affect surrounding NOBR.')
class TestResolve(unittest.TestCase):