Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@

Changelog
=========
* 0.3.5
* Not all docx files contain a `styles.xml` file. We are no longer assuming
they do.
* 0.3.4
* It is possible for `w:t` tags to have `text` set to `None`. This no longer causes an error when escaping that text.
* 0.3.3
Expand Down
29 changes: 16 additions & 13 deletions pydocx/DocxParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,23 +46,24 @@ class DocxParser:
__metaclass__ = ABCMeta
pre_processor_class = PydocxPrePorcessor

def _extract_xml(self, f, xml_path):
    """Read one member of an open archive, tolerating its absence.

    :param f: archive object whose ``read(name)`` raises ``KeyError`` when
        the member is missing (the caller passes an open ``ZipFile``).
    :param xml_path: name of the member inside the archive, e.g.
        ``'word/styles.xml'``.
    :returns: whatever ``f.read(xml_path)`` returns, or ``None`` when the
        archive has no member with that name.
    """
    try:
        return f.read(xml_path)
    except KeyError:
        # A missing member is reported as None instead of propagating the
        # lookup error, so callers can treat the part as optional.
        return None

def _build_data(self, path, *args, **kwargs):
with ZipFile(path) as f:
# These must be in the ZIP in order for the docx to be valid.
self.document_text = f.read('word/document.xml')
self.styles_text = f.read('word/styles.xml')
try:
self.fonts = f.read('/word/fontTable.xml')
except KeyError:
self.fonts = None
try: # Only present if there are lists
self.numbering_text = f.read('word/numbering.xml')
except KeyError:
self.numbering_text = None
try: # Only present if there are comments
self.comment_text = f.read('word/comments.xml')
except KeyError:
self.comment_text = None
self.relationship_text = f.read('word/_rels/document.xml.rels')

# These are all optional.
self.styles_text = self._extract_xml(f, 'word/styles.xml')
self.fonts = self._extract_xml(f, 'word/fontTable.xml')
self.numbering_text = self._extract_xml(f, 'word/numbering.xml')
self.comment_text = self._extract_xml(f, 'word/comments.xml')

zipped_image_files = [
e for e in f.infolist()
if e.filename.startswith('word/media/')
Expand All @@ -79,6 +80,8 @@ def _build_data(self, path, *args, **kwargs):
self.comment_root = parse_xml_from_string(self.comment_text)

def _parse_styles(self):
if self.styles_text is None:
return {}
tree = parse_xml_from_string(self.styles_text)
result = {}
for style in find_all(tree, 'style'):
Expand Down
2 changes: 1 addition & 1 deletion pydocx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ def docx2html(path):
def docx2markdown(path):
    """Return the ``parsed`` attribute of a ``Docx2Markdown`` built from ``path``.

    :param path: passed unchanged to the ``Docx2Markdown`` constructor.
    """
    return Docx2Markdown(path).parsed

VERSION = '0.3.3'
VERSION = '0.3.4'
Binary file added pydocx/fixtures/missing_style.docx
Binary file not shown.
13 changes: 13 additions & 0 deletions pydocx/tests/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,19 @@ def test_justification():
''')


def test_missing_style():
    """Convert the ``missing_style.docx`` fixture and compare the HTML.

    NOTE(review): the fixture is presumably a document that lacks
    ``word/styles.xml`` -- confirm against the fixture itself.
    """
    # The fixture lives in the sibling ``fixtures`` directory, one level up
    # from the directory containing this test module.
    file_path = path.join(
        path.abspath(path.dirname(__file__)),
        '..',
        'fixtures',
        'missing_style.docx',
    )
    actual_html = convert(file_path)
    assert_html_equal(actual_html, BASE_HTML % '''
<p>AAA</p>
''')


def _converter(*args, **kwargs):
# Having a converter that does nothing is the same as if abiword fails to
# convert.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def get_description():
setup(
name="PyDocX",
# Edit here and pydocx.__init__
version="0.3.3",
version="0.3.4",
description="docx (OOXML) to html converter",
author="Jason Ward, Sam Portnow",
author_email="jason.louard.ward@gmail.com, samson91787@gmail.com",
Expand Down