diff --git a/CHANGELOG b/CHANGELOG index d8aa3f16..94035b11 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,9 @@ Changelog ========= +* 0.3.5 + * Not all docx files contain a `styles.xml` file. We are no longer assuming + they do. * 0.3.4 * It is possible for `w:t` tags to have `text` set to `None`. This no longer causes an error when escaping that text. * 0.3.3 diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index 82b48cc3..fd16308f 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -46,23 +46,24 @@ class DocxParser: __metaclass__ = ABCMeta pre_processor_class = PydocxPrePorcessor + def _extract_xml(self, f, xml_path): + try: + return f.read(xml_path) + except KeyError: + return None + def _build_data(self, path, *args, **kwargs): with ZipFile(path) as f: + # These must be in the ZIP in order for the docx to be valid. self.document_text = f.read('word/document.xml') - self.styles_text = f.read('word/styles.xml') - try: - self.fonts = f.read('/word/fontTable.xml') - except KeyError: - self.fonts = None - try: # Only present if there are lists - self.numbering_text = f.read('word/numbering.xml') - except KeyError: - self.numbering_text = None - try: # Only present if there are comments - self.comment_text = f.read('word/comments.xml') - except KeyError: - self.comment_text = None self.relationship_text = f.read('word/_rels/document.xml.rels') + + # These are all optional. 
+ self.styles_text = self._extract_xml(f, 'word/styles.xml') + self.fonts = self._extract_xml(f, 'word/fontTable.xml') + self.numbering_text = self._extract_xml(f, 'word/numbering.xml') + self.comment_text = self._extract_xml(f, 'word/comments.xml') + zipped_image_files = [ e for e in f.infolist() if e.filename.startswith('word/media/') @@ -79,6 +80,8 @@ def _build_data(self, path, *args, **kwargs): self.comment_root = parse_xml_from_string(self.comment_text) def _parse_styles(self): + if self.styles_text is None: + return {} tree = parse_xml_from_string(self.styles_text) result = {} for style in find_all(tree, 'style'): diff --git a/pydocx/__init__.py b/pydocx/__init__.py index b4a82a07..ca95ca6b 100644 --- a/pydocx/__init__.py +++ b/pydocx/__init__.py @@ -8,4 +8,4 @@ def docx2html(path): def docx2markdown(path): return Docx2Markdown(path).parsed -VERSION = '0.3.3' +VERSION = '0.3.5' diff --git a/pydocx/fixtures/missing_style.docx b/pydocx/fixtures/missing_style.docx new file mode 100644 index 00000000..3ded985c Binary files /dev/null and b/pydocx/fixtures/missing_style.docx differ diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py index d7b49b9c..a0aa1909 100644 --- a/pydocx/tests/test_docx.py +++ b/pydocx/tests/test_docx.py @@ -722,6 +722,19 @@ def test_justification(): ''') +def test_missing_style(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'missing_style.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

AAA

+ ''') + + def _converter(*args, **kwargs): # Having a converter that does nothing is the same as if abiword fails to # convert. diff --git a/setup.py b/setup.py index d0285271..82c0bc82 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def get_description(): setup( name="PyDocX", # Edit here and pydocx.__init__ - version="0.3.3", + version="0.3.5", description="docx (OOXML) to html converter", author="Jason Ward, Sam Portnow", author_email="jason.louard.ward@gmail.com, samson91787@gmail.com",