From 40cd2b8e4230d2146f4a1d1436ffce81fedbbc60 Mon Sep 17 00:00:00 2001 From: winhamwr Date: Fri, 11 Jan 2013 18:29:22 -0500 Subject: [PATCH 1/6] Added documentation on using an image handler. --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 3a4d871..ca1d1f0 100644 --- a/README.md +++ b/README.md @@ -46,3 +46,26 @@ is a list of what currently works: * Simple headings * Root level lists that are upper case roman numerals get converted to h2 tags + +### Handling embedded images + +docx2html allows you to specify how you would like to handle image uploading. +For example, you might be uploading your images to Amazon S3 eg: +Note: This documentation sucks, so you might need to read the source. + + import os.path + from shutil import copyfile + + from docx2html import convert + + def handle_image(image_id, relationship_dict): + image_path = relationship_dict[image_id] + # Now do something to the image. Let's copy it somewhere. + _, filename = os.path.split(image_path) + destination_path = os.path.join('/tmp', filename) + copyfile(image_path, destination_path) + + # Return the `src` attribute to be used in the img tag + return 'file://%s' % destination_path + + html = convert('path/to/docx/file', image_handler=handle_image) From 0405dc0a5215c84431851e424bf9f67ea4d238e8 Mon Sep 17 00:00:00 2001 From: winhamwr Date: Fri, 11 Jan 2013 18:30:26 -0500 Subject: [PATCH 2/6] Added detailed example for running tests in a development environment. 
--- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ca1d1f0..d01cf24 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,16 @@ Usage >>> html = convert('path/to/docx/file') -Running Tests -============= +## Running Tests for Development + + $ virtualenv path/to/new/virtualenv + $ source path/to/new/virtualenv/bin/activate + $ cd path/to/workspace + $ git clone git://github.com/PolicyStat/docx2html.git + $ cd docx2html + $ pip install . + $ pip install -r test_requirements.txt $ ./run_tests.sh From 1c5e0820ed992b402b896a8819093a98e5ee825a Mon Sep 17 00:00:00 2001 From: winhamwr Date: Fri, 11 Jan 2013 18:30:55 -0500 Subject: [PATCH 3/6] Added more detailed overview and used hashes for headers. --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d01cf24..e468d54 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ -docx2html -========= +# docx2html -Convert a docx (OOXML) file to html +Convert a docx (OOXML) file to semantic HTML. +All of Word's formatting nonsense is stripped away and +you're left with a cleanly-formatted version of the content. -Usage -===== + +## Usage >>> from docx2html import convert >>> html = convert('path/to/docx/file') @@ -23,8 +24,7 @@ Usage $ ./run_tests.sh -Description -=========== +## Description docx2html is designed to take a docx file and extract the content out and convert that content to html. It does not care about styles or fonts or From 7788c699d7438240c468099c7a822d2c18cfe56a Mon Sep 17 00:00:00 2001 From: winhamwr Date: Fri, 11 Jan 2013 18:32:03 -0500 Subject: [PATCH 4/6] Now using test_requirements.txt instead of tests_require tests_require is supported by basically nobody who matters. 
--- setup.py | 1 - test_requirements.txt | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 test_requirements.txt diff --git a/setup.py b/setup.py index 135d7c1..c340146 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ def get_readme(): scripts=[], zip_safe=False, install_requires=['lxml==2.2.4', 'pillow==1.7.7'], - tests_require=['nose'], cmdclass={}, classifiers=[ "Development Status :: 3 - Alpha", diff --git a/test_requirements.txt b/test_requirements.txt new file mode 100644 index 0000000..a678696 --- /dev/null +++ b/test_requirements.txt @@ -0,0 +1,2 @@ +nose +mock From 2fca19eceda94ddd7f312f2f2427f21d829509b1 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 14 Jan 2013 11:28:01 -0500 Subject: [PATCH 5/6] refs #2: created a test for handling images with no image id --- docx2html/tests/__init__.py | 21 +++++++++++++++++++++ docx2html/tests/test_xml.py | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/docx2html/tests/__init__.py b/docx2html/tests/__init__.py index a164a51..6d75948 100644 --- a/docx2html/tests/__init__.py +++ b/docx2html/tests/__init__.py @@ -187,6 +187,27 @@ """.strip() +DOCUMENT_PICT_NO_IMAGEID_TEMPLATE = """ + + + + + + + + + + + + + + + + + +""".strip() + + def assert_html_equal(actual_html, expected_html): assert collapse_html( actual_html, diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py index b6143bf..a535aea 100644 --- a/docx2html/tests/test_xml.py +++ b/docx2html/tests/test_xml.py @@ -16,6 +16,7 @@ DOCUMENT_DRAWING_TEMPLATE, DOCUMENT_LI_TEMPLATE, DOCUMENT_PICT_TEMPLATE, + DOCUMENT_PICT_NO_IMAGEID_TEMPLATE, DOCUMENT_P_TEMPLATE, DOCUMENT_TBL_TEMPLATE, DOCUMENT_TC_TEMPLATE, @@ -402,3 +403,23 @@ def test_image_id_for_pict(self): pict_tag = pict_tags[0] image_id = get_image_id(pict_tag) self.assertEqual(image_id, 'rId0') + + +class PictImageTestCase(_TranslationTestCase): + expected_output = ''' + + ''' + + def get_xml(self): + pict = 
DOCUMENT_PICT_NO_IMAGEID_TEMPLATE + tags = [ + pict, + ] + body = '' + for el in tags: + body += el + + xml = DOCUMENT_XML_TEMPLATE % { + 'body': body, + } + return etree.fromstring(xml) From 502768b5d3c7dc8af9d75142c0156f08f34b9dae Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 14 Jan 2013 11:28:26 -0500 Subject: [PATCH 6/6] refs #2: correctly handle images with no image id --- docx2html/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docx2html/core.py b/docx2html/core.py index e4752b2..85e20ac 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1154,6 +1154,9 @@ def get_p_data(p, meta_data, is_td=False): p_text += '
' else: # We have an image image_id = get_image_id(child) + if image_id not in meta_data.relationship_dict: + # This image does not have an image_id + continue src = meta_data.image_handler( image_id, meta_data.relationship_dict,