diff --git a/README.md b/README.md index 3a4d871..e468d54 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,30 @@ -docx2html -========= +# docx2html -Convert a docx (OOXML) file to html +Convert a docx (OOXML) file to semantic HTML. +All of Word's formatting nonsense is stripped away and +you're left with a cleanly-formatted version of the content. -Usage -===== + +## Usage >>> from docx2html import convert >>> html = convert('path/to/docx/file') -Running Tests -============= +## Running Tests for Development + + $ virtualenv path/to/new/virtualenv + $ source path/to/new/virtualenv/bin/activate + $ cd path/to/workspace + $ git clone git://github.com/PolicyStat/docx2html.git + $ cd docx2html + $ pip install . + $ pip install -r test_requirements.txt $ ./run_tests.sh -Description -=========== +## Description docx2html is designed to take a docx file and extract the content out and convert that content to html. It does not care about styles or fonts or @@ -46,3 +53,26 @@ is a list of what currently works: * Simple headings * Root level lists that are upper case roman numerals get converted to h2 tags + +### Handling embedded images + +docx2html allows you to specify how you would like to handle image uploading. +For example, you might upload your images to Amazon S3 or, as shown below, copy them to a local directory. +Note: This documentation is incomplete, so you might need to read the source. + + import os.path + from shutil import copyfile + + from docx2html import convert + + def handle_image(image_id, relationship_dict): + image_path = relationship_dict[image_id] + # Now do something to the image. Let's copy it somewhere. 
+ _, filename = os.path.split(image_path) + destination_path = os.path.join('/tmp', filename) + copyfile(image_path, destination_path) + + # Return the `src` attribute to be used in the img tag + return 'file://%s' % destination_path + + html = convert('path/to/docx/file', image_handler=handle_image) diff --git a/docx2html/core.py b/docx2html/core.py index a09803a..29774bd 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1144,6 +1144,9 @@ def get_p_data(p, meta_data, is_td=False): p_text += '
' else: # We have an image image_id = get_image_id(child) + if image_id not in meta_data.relationship_dict: + # This image does not have an image_id + continue src = meta_data.image_handler( image_id, meta_data.relationship_dict, diff --git a/docx2html/tests/__init__.py b/docx2html/tests/__init__.py index a164a51..6d75948 100644 --- a/docx2html/tests/__init__.py +++ b/docx2html/tests/__init__.py @@ -187,6 +187,27 @@ """.strip() +DOCUMENT_PICT_NO_IMAGEID_TEMPLATE = """ + + + + + + + + + + + + + + + + + +""".strip() + + def assert_html_equal(actual_html, expected_html): assert collapse_html( actual_html, diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py index b6143bf..a535aea 100644 --- a/docx2html/tests/test_xml.py +++ b/docx2html/tests/test_xml.py @@ -16,6 +16,7 @@ DOCUMENT_DRAWING_TEMPLATE, DOCUMENT_LI_TEMPLATE, DOCUMENT_PICT_TEMPLATE, + DOCUMENT_PICT_NO_IMAGEID_TEMPLATE, DOCUMENT_P_TEMPLATE, DOCUMENT_TBL_TEMPLATE, DOCUMENT_TC_TEMPLATE, @@ -402,3 +403,23 @@ def test_image_id_for_pict(self): pict_tag = pict_tags[0] image_id = get_image_id(pict_tag) self.assertEqual(image_id, 'rId0') + + +class PictImageTestCase(_TranslationTestCase): + expected_output = ''' + + ''' + + def get_xml(self): + pict = DOCUMENT_PICT_NO_IMAGEID_TEMPLATE + tags = [ + pict, + ] + body = '' + for el in tags: + body += el + + xml = DOCUMENT_XML_TEMPLATE % { + 'body': body, + } + return etree.fromstring(xml) diff --git a/setup.py b/setup.py index 135d7c1..c340146 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ def get_readme(): scripts=[], zip_safe=False, install_requires=['lxml==2.2.4', 'pillow==1.7.7'], - tests_require=['nose'], cmdclass={}, classifiers=[ "Development Status :: 3 - Alpha", diff --git a/test_requirements.txt b/test_requirements.txt new file mode 100644 index 0000000..a678696 --- /dev/null +++ b/test_requirements.txt @@ -0,0 +1,2 @@ +nose +mock