diff --git a/html2docx/builder.py b/html2docx/builder.py index 9b60392..a6fed8e 100644 --- a/html2docx/builder.py +++ b/html2docx/builder.py @@ -7,11 +7,8 @@ def xml(self): return cElementTree.tostring(self.tree) -class ParagraphParser(object): - html_to_ooxml_tag_conversions = { - 'strong': 'bold', - 'em': 'italics', - } +class BaseParser(object): + abstract = True def __init__(self, element): self.element = element @@ -39,6 +36,13 @@ def _parse(self, element, styles): if element.tail: yield element.tail, styles[-1] + +class ParagraphParser(BaseParser): + html_to_ooxml_tag_conversions = { + 'strong': 'bold', + 'em': 'italics', + } + def build_runs(self): for text, styles in self.parse(self.element): run = Run(text) @@ -46,10 +50,6 @@ def build_runs(self): ooxml_style = self.html_to_ooxml_tag_conversions.get(style) if ooxml_style: setattr(run.properties, ooxml_style, True) - if 'strong' in styles: - run.properties.bold = True - if 'em' in styles: - run.properties.italics = True yield run @property @@ -130,3 +130,75 @@ def italics(self, value): self._italics = True else: self._italics = False + + +class TableParser(BaseParser): + @property + def tag(self): + table_rows = [] + for table_row in self.element.findall('tr'): + table_rows.append(TableRowParser(table_row)) + return Table(table_rows) + + +class Table(BaseTag): + tag_name = 'w:tbl' + + def __init__(self, table_rows=None): + self.table_rows = table_rows + + @property + def tree(self): + element = cElementTree.Element(self.tag_name) + if self.table_rows is None: + return element + for table_row in self.table_rows: + element.append(table_row.tag.tree) + return element + + +class TableRowParser(BaseParser): + @property + def tag(self): + table_cells = [] + for table_cell in self.element.findall('td'): + table_cells.append(TableCellParser(table_cell)) + return TableRow(table_cells) + + +class TableRow(BaseTag): + tag_name = 'w:tr' + + def __init__(self, table_cells=None): + self.table_cells = table_cells + + @property + def tree(self): + element = cElementTree.Element(self.tag_name) + if self.table_cells is None: + return element + for table_cell in self.table_cells: + element.append(table_cell.tag.tree) + return element + + +class TableCellParser(BaseParser): + @property + def tag(self): + paragraph = ParagraphParser(self.element) + return TableCell(paragraph) + + +class TableCell(BaseTag): + tag_name = 'w:tc' + + def __init__(self, element=None): + self.element = element + + @property + def tree(self): + element = cElementTree.Element(self.tag_name) + if self.element is None: + return element + element.append(self.element.tag.tree) + return element diff --git a/html2docx/core.py b/html2docx/core.py index 2d8fd47..7fb7e0b 100644 --- a/html2docx/core.py +++ b/html2docx/core.py @@ -3,7 +3,13 @@ from jinja2 import Environment, PackageLoader from html2docx.utils import ZipFile -from html2docx.builder import ParagraphParser +from html2docx.builder import ParagraphParser, TableParser + + +tag_to_parser_conversions = { + 'p': ParagraphParser, + 'table': TableParser +} class HTML2Docx(object): @@ -47,8 +53,9 @@ def _convert(self): if el in self.visited: continue self.visited.update([el]) - if el.tag == 'p': - parser = ParagraphParser(el) + Parser = tag_to_parser_conversions.get(el.tag) + if Parser: + parser = Parser(el) self.document_state.append(parser.tag) self.visited.update(el.getiterator()) diff --git a/html2docx/tests/__init__.py b/html2docx/tests/__init__.py index e6b05fb..4bf9fda 100644 --- a/html2docx/tests/__init__.py +++ b/html2docx/tests/__init__.py @@ -56,6 +56,9 @@ class TestDocx2Html(Docx2Html): def style(*args, **kwargs): return '' + def table(self, text): + return '%s
' % text + def build_run(test_name, html): boiler_plate = '%s' diff --git a/html2docx/tests/test_builder.py b/html2docx/tests/test_builder.py index ceb3c49..fc8324a 100644 --- a/html2docx/tests/test_builder.py +++ b/html2docx/tests/test_builder.py @@ -1,7 +1,17 @@ from xml.etree import cElementTree from unittest import TestCase -from html2docx.builder import RunProperties, ParagraphParser, Paragraph +from html2docx.builder import ( + Paragraph, + ParagraphParser, + RunProperties, + Table, + TableCell, + TableCellParser, + TableParser, + TableRow, + TableRowParser, +) class RunPropertiesTestCase(TestCase): @@ -75,3 +85,100 @@ def test_empty(self): xml = paragraph.xml self.assertEqual(xml, expected_xml) + + +class TableCellParserTestCase(TestCase): + def test_simple(self): + element = cElementTree.fromstring('AAA') + parser = TableCellParser(element) + xml = parser.tag.xml + expected_xml = 'AAA' # noqa + + self.assertEqual(xml, expected_xml) + + def test_with_style(self): + element = cElementTree.fromstring('AAA') + parser = TableCellParser(element) + xml = parser.tag.xml + expected_xml = 'AAA' # noqa + + self.assertEqual(xml, expected_xml) + + +class TableCellTestCase(TestCase): + def test_empty(self): + table_cell = TableCell() + expected_xml = '' + + xml = table_cell.xml + self.assertEqual(xml, expected_xml) + + +class TableRowParserTestCase(TestCase): + def test_simple(self): + element = cElementTree.fromstring('AAA') + parser = TableRowParser(element) + xml = parser.tag.xml + expected_xml = 'AAA' # noqa + + self.assertEqual(xml, expected_xml) + + def test_with_style(self): + element = cElementTree.fromstring('AAA') # noqa + parser = TableRowParser(element) + xml = parser.tag.xml + expected_xml = 'AAA' # noqa + + self.assertEqual(xml, expected_xml) + + def test_multiple_cells(self): + element = cElementTree.fromstring('AAABBB') + parser = TableRowParser(element) + xml = parser.tag.xml + expected_xml = 'AAABBB' # noqa + + self.assertEqual(xml, expected_xml) + + +class TableRowTestCase(TestCase): + def test_empty(self): + table_row = TableRow() + expected_xml = '' + + xml = table_row.xml + self.assertEqual(xml, expected_xml) + + +class TableParserTestCase(TestCase): + def test_simple(self): + element = cElementTree.fromstring('
AAA
') # noqa + parser = TableParser(element) + xml = parser.tag.xml + expected_xml = 'AAA' # noqa + + self.assertEqual(xml, expected_xml) + + def test_with_style(self): + element = cElementTree.fromstring('
AAA
') # noqa + parser = TableParser(element) + xml = parser.tag.xml + expected_xml = 'AAA' # noqa + + self.assertEqual(xml, expected_xml) + + def test_multiple_cells(self): + element = cElementTree.fromstring('
AAABBB
CCCDDD
') # noqa + parser = TableParser(element) + xml = parser.tag.xml + expected_xml = 'AAABBBCCCDDD' # noqa + + self.assertEqual(xml, expected_xml) + + +class TableTestCase(TestCase): + def test_empty(self): + table_row = Table() + expected_xml = '' + + xml = table_row.xml + self.assertEqual(xml, expected_xml) diff --git a/html2docx/tests/test_complex.py b/html2docx/tests/test_complex.py new file mode 100644 index 0000000..3e76b82 --- /dev/null +++ b/html2docx/tests/test_complex.py @@ -0,0 +1,24 @@ +from html2docx.tests import build_run + + +test_cases = [ + ( + 'Test paragraph, table, paragraph.', + '

AAA

BBB

CCC

', + ), + ( + 'Test table, table, paragraph', + '
AAA
BBB

CCC

', # noqa + ), + # Nesting doesn't really work yet. + # ( + # 'Test Nested Table', + # '
AAA
BBB
', # noqa + # ), +] + + +def test(): + for test_name, html in test_cases: + run = build_run(test_name, html) + yield run diff --git a/html2docx/tests/test_tables.py b/html2docx/tests/test_tables.py new file mode 100644 index 0000000..c7658da --- /dev/null +++ b/html2docx/tests/test_tables.py @@ -0,0 +1,27 @@ +from html2docx.tests import build_run + + +test_cases = [ + ( + 'Test simple table.', + '
AAA
', + ), + ( + 'Test multiple rows.', + '
AAA
BBB
', + ), + ( + 'Test multiple cells.', + '
AAABBB
', + ), + ( + 'Test multiple rows and cells.', + '
AAABBB
CCCDDD
', # noqa + ), +] + + +def test(): + for test_name, html in test_cases: + run = build_run(test_name, html) + yield run diff --git a/run_tests.sh b/run_tests.sh index 167ea3f..5b68bf0 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,5 +1,3 @@ #! /bin/sh -RUN_TESTS='nosetests -v -v --with-coverage --cover-erase --cover-package=. html2docx' -echo $RUN_TESTS -$RUN_TESTS +nosetests -v -v --with-coverage --cover-erase --cover-package=html2docx html2docx && find -name '*.py' | xargs flake8