From 3b75c49a084b10f2f4f7fea920b757e62e2be19d Mon Sep 17 00:00:00 2001 From: Afzal Shahul Hameed Date: Thu, 16 Sep 2021 12:04:37 +0530 Subject: [PATCH] Don't ignore images Originally solved by @andreip in https://github.com/closeio/quotequail/pull/26 In his words: "Couldn't think of a different approach, since an img isn't really a block, so it'll never have a text within it, so no point in generating a different html in get_line_info functions. Instead, what was missing was it being treated as a special case: don't want to slice a line from the HTML by just looking at the plain text lines, since that could slice an img, need to also look at the start/end refs for replaced tags. See more about a replaced element (https://developer.mozilla.org/en-US/docs/Web/CSS/Replaced_element). I think it might be worth adding a few more things to the list? e.g. video, embed, etc.; not sure about iframe and how that'd be treated in lxml parsing though, but I suppose you could have an iframe with just an image in it, in which case you'd still want to keep it? Full list would be a total of 9 replaced elements (or 10 if we also count input; although I'm not sure of all examples where that'd generate something even if it apparently has no text in it)."
--- quotequail/__init__.py | 6 +++--- quotequail/_html.py | 26 ++++++++++++++++++++------ tests/test_quotequail.py | 13 +++++++++++++ 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/quotequail/__init__.py b/quotequail/__init__.py index 21e0ab8..a316961 100644 --- a/quotequail/__init__.py +++ b/quotequail/__init__.py @@ -146,9 +146,9 @@ def unwrap_html(html): "type": typ, } - top_range = _html.trim_slice(lines, top_range) - main_range = _html.trim_slice(lines, main_range) - bottom_range = _html.trim_slice(lines, bottom_range) + top_range = _html.trim_slice(lines, top_range, start_refs, end_refs) + main_range = _html.trim_slice(lines, main_range, start_refs, end_refs) + bottom_range = _html.trim_slice(lines, bottom_range, start_refs, end_refs) if top_range: top_tree = _html.slice_tree( diff --git a/quotequail/_html.py b/quotequail/_html.py index 69ffca5..5e95b5a 100644 --- a/quotequail/_html.py +++ b/quotequail/_html.py @@ -22,7 +22,9 @@ "td", "th", ] - +# replaced by binary data, so should be preserved in HTML no matter the text +# around it. 
+REPLACED_TAGS = ["img"] BEGIN = "begin" END = "end" @@ -71,7 +73,11 @@ def trim_tree_before(element, include_element=True, keep_head=True): el = parent_el -def trim_slice(lines, slice_tuple): +def is_replaced(el): + return isinstance(el.tag, string_class) and el.tag.lower() in REPLACED_TAGS + + +def trim_slice(lines, slice_tuple, start_refs, end_refs): """ Trim a slice tuple (begin, end) so it starts at the first non-empty line (obtained via indented_tree_line_generator / get_line_info) and ends at the @@ -92,11 +98,19 @@ def _empty(line): slice_end = len(lines) # Trim from beginning - while slice_start < slice_end and _empty(lines[slice_start]): + while ( + slice_start < slice_end + and _empty(lines[slice_start]) + and not is_replaced(start_refs[slice_start][0]) + ): slice_start += 1 # Trim from end - while slice_end > slice_start and _empty(lines[slice_end - 1]): + while ( + slice_end > slice_start + and _empty(lines[slice_end - 1]) + and not is_replaced(end_refs[slice_end - 1][0]) + ): slice_end -= 1 return (slice_start, slice_end) @@ -173,9 +187,9 @@ def slice_tree(tree, start_refs, end_refs, slice_tuple, html_copy=None): new_tree = tree if start_ref: - include_start = start_ref[1] == BEGIN + include_start = start_ref[1] == BEGIN or is_replaced(start_ref[0]) if end_ref: - include_end = end_ref[1] == END + include_end = end_ref[1] == END or is_replaced(end_ref[0]) # If start_ref is the same as end_ref, and we don't include the element, # we are removing the entire tree. We need to handle this separately, diff --git a/tests/test_quotequail.py b/tests/test_quotequail.py index d2de31a..8dcd513 100644 --- a/tests/test_quotequail.py +++ b/tests/test_quotequail.py @@ -949,6 +949,19 @@ def test_gmail_reply(self): }, ) + def test_reply_with_image(self): + html = 'Test 2.

On Jun 05, 2018, at 09:56 AM, John Doe <john@example.com> wrote:

Some text 1.

Bart
' + self.assertEqual( + unwrap_html(html), + { + "date": "Jun 05, 2018, at 09:56 AM", + "from": "John Doe ", + "html": u'

Some text 1.

Bart
', + "html_top": u"Test 2.", + "type": "reply", + }, + ) + def test_outlook_forward(self): data = self.read_file("outlook_forward.html") result = unwrap_html(data)