From 5d5d14b56eab4cc7755623579e6d96e002ffbecf Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 16:20:21 -0400 Subject: [PATCH 1/8] refs #18: flake8 --- docx2html/core.py | 10 +++------- docx2html/tests/test_docx.py | 12 ++++-------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/docx2html/core.py b/docx2html/core.py index 72e1efb..43d161b 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -165,8 +165,7 @@ def is_natural_header(el, styles_dict): if ( style_id in styles_dict and 'header' in styles_dict[style_id] and - styles_dict[style_id]['header'] - ): + styles_dict[style_id]['header']): return styles_dict[style_id]['header'] @@ -309,7 +308,6 @@ def get_li_nodes(li, meta_data): # Not a subsequent list. yield el break - yield el @@ -1018,8 +1016,7 @@ def get_tr_data(tr, meta_data, row_spans): # ignored. if ( v_merge is not None and - v_merge.get('%sval' % w_namespace) != 'restart' - ): + v_merge.get('%sval' % w_namespace) != 'restart'): continue # Loop through each and build a list of all the content. @@ -1072,8 +1069,7 @@ def get_tr_data(tr, meta_data, row_spans): # here. if ( v_merge is not None and - v_merge.get('%sval' % w_namespace) == 'restart' - ): + v_merge.get('%sval' % w_namespace) == 'restart'): rowspan = next(row_spans) td_el.set('rowspan', '%d' % rowspan) diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py index bb2a566..5863353 100644 --- a/docx2html/tests/test_docx.py +++ b/docx2html/tests/test_docx.py @@ -33,8 +33,7 @@ def test_extract_html(): 'simple.docx', ) actual_html = convert(file_path) - assert_html_equal(actual_html, - ''' + assert_html_equal(actual_html, '''

Simple text @@ -66,8 +65,7 @@ def test_nested_list(): 'nested_lists.docx', ) actual_html = convert(file_path) - assert_html_equal(actual_html, - ''' + assert_html_equal(actual_html, '''

  1. one
  2. @@ -111,8 +109,7 @@ def test_simple_list(): 'simple_lists.docx', ) actual_html = convert(file_path) - assert_html_equal(actual_html, - ''' + assert_html_equal(actual_html, '''
    1. One
    2. @@ -132,8 +129,7 @@ def test_inline_tags(): 'inline_tags.docx', ) actual_html = convert(file_path) - assert_html_equal(actual_html, - ''' + assert_html_equal(actual_html, '''

      This sentence has some bold, some italics and some underline, as well as a hyperlink.

      ''') # noqa From eaf075a1834c22955e5ddba7457ba6082f43da79 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 16:20:59 -0400 Subject: [PATCH 2/8] refs #18: added a test showing the incorrect behaviour --- docx2html/tests/test_xml.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py index 586b42e..89b945c 100644 --- a/docx2html/tests/test_xml.py +++ b/docx2html/tests/test_xml.py @@ -593,3 +593,33 @@ def test_get_headings(self): styles_xml = etree.fromstring(xml) styles_dict = get_style_dict(styles_xml) self.assertEqual(styles_dict['heading 1']['header'], 'h2') + + +class MangledIlvlTestCase(_TranslationTestCase): + expected_output = ''' + +
        +
      1. AAA
      2. +
      +
        +
      1. BBB +
          +
        1. CCC
        2. +
        +
      2. +
      + + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 2), + ('BBB', 1, 1), + ('CCC', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return etree.fromstring(xml) From f04bfcdf8ff52cbbe3f140b492e234f906161038 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 16:23:53 -0400 Subject: [PATCH 3/8] refs #18: dealt with mangled ilvl in lists --- docx2html/core.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docx2html/core.py b/docx2html/core.py index 43d161b..d239109 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -912,6 +912,15 @@ def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol): del ol_dict[key] return current_ol + def _current_ol_in_root_ol(): + # Checking to see if current_ol in root_ol does not work. Loop + # through each of the elements in root_ol and return true if one + # of them is current ol. + for el in root_ol: + if current_ol is root_ol: + return True + return False + for li_node in li_nodes: w_namespace = get_namespace(li_node, 'w') if not is_li(li_node, meta_data): @@ -992,6 +1001,9 @@ def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol): current_ol=current_ol, ) + if not _current_ol_in_root_ol(): + root_ol[-1].append(current_ol) + return root_ol, visited_nodes From d7f5f59c43396407fca9dec1b5e41519ba5f3322 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 16:27:11 -0400 Subject: [PATCH 4/8] refs #18: update note --- CHANGELOG | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 854cca9..9474bd4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,12 @@ Changelog ========= +* 0.1.7 + * If the indentation level of a set of lists (with the same list id) were + mangled (Starting off with a higher indentation level followed by a + lower) then the entire sub list (the list with the lower indentation + level) would not be added to the root list, thusly removing it from the + final output. This issue has been addressed. * 0.1.6 * Header detection was relying on case. However it is possible for a lower case version of headers to show up. Those are now handled correctly. From 12fc8dc64132fa61c9f3a0ed30121b55ba67eaf6 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 16:43:38 -0400 Subject: [PATCH 5/8] refs #18: refactor --- docx2html/core.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/docx2html/core.py b/docx2html/core.py index d239109..5217877 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -912,15 +912,6 @@ def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol): del ol_dict[key] return current_ol - def _current_ol_in_root_ol(): - # Checking to see if current_ol in root_ol does not work. Loop - # through each of the elements in root_ol and return true if one - # of them is current ol. - for el in root_ol: - if current_ol is root_ol: - return True - return False - for li_node in li_nodes: w_namespace = get_namespace(li_node, 'w') if not is_li(li_node, meta_data): @@ -1001,7 +992,7 @@ def _current_ol_in_root_ol(): current_ol=current_ol, ) - if not _current_ol_in_root_ol(): + if current_ol is not root_ol: root_ol[-1].append(current_ol) return root_ol, visited_nodes From f48b6cb47d4f825b5466e8a2869d0c58e98678ec Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 17:06:25 -0400 Subject: [PATCH 6/8] refs #18: updated the test for no longer doing stupid nesting --- docx2html/tests/test_xml.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py index 89b945c..52bca28 100644 --- a/docx2html/tests/test_xml.py +++ b/docx2html/tests/test_xml.py @@ -602,11 +602,10 @@ class MangledIlvlTestCase(_TranslationTestCase):
    3. AAA
      -
    1. BBB -
        -
      1. CCC
      2. -
      -
    2. +
    3. BBB
    4. +
    +
      +
    1. CCC
    ''' From 19c5f903c4ac9396c2b813c063484b46e45a5997 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 17:06:49 -0400 Subject: [PATCH 7/8] refs #18: break ``get_li_nodes`` early if it detects a mangled list --- docx2html/core.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docx2html/core.py b/docx2html/core.py index 5217877..b020cc5 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -286,6 +286,7 @@ def get_li_nodes(li, meta_data): yield li w_namespace = get_namespace(li, 'w') current_numId = get_numId(li, w_namespace) + starting_ilvl = get_ilvl(li, w_namespace) el = li while True: el = el.getnext() @@ -300,6 +301,11 @@ def get_li_nodes(li, meta_data): if _is_top_level_upper_roman(el, meta_data): break + if ( + is_li(el, meta_data) and + (starting_ilvl > get_ilvl(el, w_namespace))): + break + # If the list id of the next tag is different that the previous that # means a new list being made (not nested) if is_last_li(el, meta_data, current_numId): @@ -992,9 +998,6 @@ def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol): current_ol=current_ol, ) - if current_ol is not root_ol: - root_ol[-1].append(current_ol) - return root_ol, visited_nodes From 01c588121b6f23430f3eb2bd2ccbf604a08b436f Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 19 Mar 2013 17:08:24 -0400 Subject: [PATCH 8/8] refs #18: updated update note --- CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9474bd4..7f6a437 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,8 +6,8 @@ Changelog * If the indentation level of a set of lists (with the same list id) were mangled (Starting off with a higher indentation level followed by a lower) then the entire sub list (the list with the lower indentation - level) would not be added to the root list, thusly removing it from the - final output. This issue has been addressed. + level) would not be added to the root list. This would result in removing + the mangled list from the final output. This issue has been addressed. * 0.1.6 * Header detection was relying on case. However it is possible for a lower case version of headers to show up. Those are now handled correctly.