From 2bb32d6e2f0df1620664653ef68cf8bfa63f2c9a Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 5 Jul 2013 13:53:04 -0400 Subject: [PATCH 1/4] bumped to version 0.3.4 --- pydocx/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydocx/__init__.py b/pydocx/__init__.py index b4a82a07..ca95ca6b 100644 --- a/pydocx/__init__.py +++ b/pydocx/__init__.py @@ -8,4 +8,4 @@ def docx2html(path): def docx2markdown(path): return Docx2Markdown(path).parsed -VERSION = '0.3.3' +VERSION = '0.3.4' diff --git a/setup.py b/setup.py index d0285271..82c0bc82 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def get_description(): setup( name="PyDocX", # Edit here and pydocx.__init__ - version="0.3.3", + version="0.3.4", description="docx (OOXML) to html converter", author="Jason Ward, Sam Portnow", author_email="jason.louard.ward@gmail.com, samson91787@gmail.com", From b4073fba9c3e82e09ba3018db03d2c75f3f8db74 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 8 Jul 2013 13:49:24 -0400 Subject: [PATCH 2/4] refs #51: Added a document fixture that is missing the styles.xml file and a test showing its expected output. 
--- pydocx/fixtures/missing_style.docx | Bin 0 -> 2644 bytes pydocx/tests/test_docx.py | 13 +++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 pydocx/fixtures/missing_style.docx diff --git a/pydocx/fixtures/missing_style.docx b/pydocx/fixtures/missing_style.docx new file mode 100644 index 0000000000000000000000000000000000000000..3ded985c991da35c63fce75f7421a63ff835ff30 GIT binary patch literal 2644 zcmaJ@2{=^UAD&?*WH%JqvJ{bJY$4;H(U^>in6J-H6VqTAMluvqQ^^u)2$7{E6J<*p zia!RCB@vPuB1_inpY6N!d=KgS|L=LuInRCW^WJmc_x#TLUJEmJh%gAm$qBNG8bS`W zfaSVCAP|BL1mXr}C$cNiS50+&%)NcWjNORm`oxNe@punWIjpW}P8La$o#mh=j*ho9 zacH{bu6;Jz$*>4dt1Szdo|I_yY+6k_nH&*eR7H0)@=<@sPWhHG>mD( zB-$&{KT|W#2$@EQF^`XCYr%aU-;=?cTh(x*zZO$o^TOuByn%0GD^p4+64%7TvYtcqk5pm5b@RJWZ2UTIgn@rvXzo zy4J-n;uJIOO#Ii}v`Cj4%4&)49Qz^~Gpf48XHqSGQ9yfqz<3T|P5~4U0%m^_*+mUc z@;Z+_gCn}CQap*dvE^P>dj)Fd4WT$vni#!q=Pky0x`x&;d-9Z?NnbQm#;QE%CQL0i zt62Qge9!#hTPwZqZFF$*u(CdL@i$_DlD1#f!b6gsr2Ugmv3zp=O9l?X!dV|Sdl65?#W|?>In~*G7XVhE!Jh2z8y#o|Ib>*wj}b$(r{jl%f@%LQWZ{p;hY_3lUE27w_% z>>$v_5Y>#}t5oNKvJ&sh{s51(hJv-nS~HbLpvOmwL>i-SUuMm@Ah9^-nZj9{XT$gL zFn&HCztffZIcXjd-zvp3ldD~4PxZsZ_Z>X2sENaJi-%`U6Q@l|ofQY*Z--2K2>6^Q zBh=pR%f-5*Som?FO7-@=HFpO?&S%ZiOZ-Q?Iv+`!TO7V;!thEywNq1pEA{c)J(IJhOB)NN#0{xsd*uAqwRl{A3 zKxUz2!%TmI8Jynn-EZaq$}0G%!LfS1T5x#20jC3;+o?!IQaxB`=yN%H>kU`{g~5;3 zs6na5H%z9$9Iq+k?2Z!Hib_hOZB%597TzmhUSvhQze;Gjbv<5HK8vw>*G$(%!R*yo zqw4dINiiLlu_i;*8dopLRxPx0QM0jysM`a3(L)l=RF4-+k?Q3Kub6ia+*N)?3I=WN z;<~>LS0zS20P2<-*hv9k1~$-wO!D?sJLB!WUa%?6aPF(>0#Mega%6!A@rI@Rh^4-N zUXL-^z2cK8S!8AY#DTU~U%&ph_9ES3g{C5qA(z9BHQ(d0EKlZc>e$V!mO0s-|P<;7?@cCCl6vg5yX@JZc&;_>= zI!hvNxRqyY!^tXj0o42@RDTezv{%r^lmYS!J%W^d9G2qBpN(3M@l2YVFm2TlZcC}1 zUm(<2sroa_idk0+9wD{l6v1*i*hjf7c7+&j8nrH}KSkRj9RVrId4}mt$dhs`^J6`@ z62l`4OYj!?SUn_KV?;bM9m7i%t2FV>Mxf~dm~l%>Ev<7Mv>stG8R>niS)wU7 z3TJ2K_&>;dj1SZ`c6_xpAhwq@CeaqklIi7#pM-2H5Bei_XDL9*Zs3r^05$E50pka} z%n2Ld?dt2WULmjUG_LDOod^oA`;(^YeS~x0F!-k2j6hRe=e=s#)z}W&+?dTF0huGh z7$)=fK;j}ER%+|@HX&Cg?Lsj=3-v}Lul0cEQc}qI+P$_aeZINF%L}Pcm 
zxpuX?9DAF@HGx(^Jq*6&+0J3RiorCc?BE!iqCz4=P6S>sn!3fAOA$>!R__FM=y2nz zf1_X=FUwqH$?t7d&^!xAPpH5f4HRORzkCbfM~p8_-xT~fJ{Ib8m`$pjNn4V|$L${#+F^BE&?e)J2U)%Aor@_Ju3OoRT OIDj?@=zEfL{qApI(K^8Z literal 0 HcmV?d00001 diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py index d7b49b9c..a0aa1909 100644 --- a/pydocx/tests/test_docx.py +++ b/pydocx/tests/test_docx.py @@ -722,6 +722,19 @@ def test_justification(): ''') +def test_missing_style(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'missing_style.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

+ <p>AAA</p>

+ ''') + + def _converter(*args, **kwargs): # Having a converter that does nothing is the same as if abiword fails to # convert. From ef6135b0bf317aa6b301052850e82f26cccfed55 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 8 Jul 2013 13:50:08 -0400 Subject: [PATCH 3/4] refs #51: No longer assuming that all docx files must have styles.xml --- pydocx/DocxParser.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index 82b48cc3..fd16308f 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -46,23 +46,24 @@ class DocxParser: __metaclass__ = ABCMeta pre_processor_class = PydocxPrePorcessor + def _extract_xml(self, f, xml_path): + try: + return f.read(xml_path) + except KeyError: + return None + def _build_data(self, path, *args, **kwargs): with ZipFile(path) as f: + # These must be in the ZIP in order for the docx to be valid. self.document_text = f.read('word/document.xml') - self.styles_text = f.read('word/styles.xml') - try: - self.fonts = f.read('/word/fontTable.xml') - except KeyError: - self.fonts = None - try: # Only present if there are lists - self.numbering_text = f.read('word/numbering.xml') - except KeyError: - self.numbering_text = None - try: # Only present if there are comments - self.comment_text = f.read('word/comments.xml') - except KeyError: - self.comment_text = None self.relationship_text = f.read('word/_rels/document.xml.rels') + + # These are all optional. 
+ self.styles_text = self._extract_xml(f, 'word/styles.xml') + self.fonts = self._extract_xml(f, 'word/fontTable.xml') + self.numbering_text = self._extract_xml(f, 'word/numbering.xml') + self.comment_text = self._extract_xml(f, 'word/comments.xml') + zipped_image_files = [ e for e in f.infolist() if e.filename.startswith('word/media/') @@ -79,6 +80,8 @@ def _build_data(self, path, *args, **kwargs): self.comment_root = parse_xml_from_string(self.comment_text) def _parse_styles(self): + if self.styles_text is None: + return {} tree = parse_xml_from_string(self.styles_text) result = {} for style in find_all(tree, 'style'): From b54e80b1ebc67d0a7d6db9734d6863fed50f1583 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 8 Jul 2013 13:51:25 -0400 Subject: [PATCH 4/4] refs #51: update note --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index d8aa3f16..94035b11 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,9 @@ Changelog ========= +* 0.3.5 + * Not all docx files contain a `styles.xml` file. We are no longer assuming + they do. * 0.3.4 * It is possible for `w:t` tags to have `text` set to `None`. This no longer causes an error when escaping that text. * 0.3.3