Skip to content
This repository was archived by the owner on Oct 17, 2018. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 39 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
docx2html
=========
# docx2html

Convert a docx (OOXML) file to html
Convert a docx (OOXML) file to semantic HTML.
All of Word's formatting nonsense is stripped away and
you're left with a cleanly-formatted version of the content.

Usage
=====

## Usage

>>> from docx2html import convert
>>> html = convert('path/to/docx/file')


Running Tests
=============
## Running Tests for Development


$ virtualenv path/to/new/virtualenv
$ source path/to/new/virtualenv/bin/activate
$ cd path/to/workspace
$ git clone git://github.com/PolicyStat/docx2html.git
$ cd docx2html
$ pip install .
$ pip install -r test_requirements.txt
$ ./run_tests.sh


Description
===========
## Description

docx2html is designed to take a docx file and extract the content out and
convert that content to html. It does not care about styles or fonts or
Expand Down Expand Up @@ -46,3 +53,26 @@ is a list of what currently works:
* Simple headings
* Root level lists that are upper case roman numerals get converted to h2
tags

### Handling embedded images

docx2html allows you to specify how you would like to handle image uploading.
For example, you might upload your images to Amazon S3, e.g.:
Note: This documentation sucks, so you might need to read the source.

import os.path
from shutil import copyfile

from docx2html import convert

def handle_image(image_id, relationship_dict):
    """Copy an embedded image into ``/tmp`` and return its ``src`` value.

    ``relationship_dict[image_id]`` is the path of the image file. The file
    is copied to ``/tmp`` under its original file name, and the returned
    string is what ends up in the ``src`` attribute of the ``img`` tag.

    Raises ``KeyError`` if ``image_id`` is not in ``relationship_dict``.
    """
    image_path = relationship_dict[image_id]
    # Now do something to the image. Let's copy it somewhere.
    _, filename = os.path.split(image_path)
    destination_path = os.path.join('/tmp', filename)
    copyfile(image_path, destination_path)

    # Return the `src` attribute to be used in the img tag
    return 'file://%s' % destination_path

html = convert('path/to/docx/file', image_handler=handle_image)
3 changes: 3 additions & 0 deletions docx2html/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,9 @@ def get_p_data(p, meta_data, is_td=False):
p_text += '<br />'
else: # We have an image
image_id = get_image_id(child)
if image_id not in meta_data.relationship_dict:
# This image does not have an image_id
continue
src = meta_data.image_handler(
image_id,
meta_data.relationship_dict,
Expand Down
21 changes: 21 additions & 0 deletions docx2html/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,27 @@
""".strip()


# Paragraph fixture whose run holds a <w:pict> with an empty <v:shape>:
# the shape has no child elements, so there is no image data (and hence no
# image id) for the converter to look up.
DOCUMENT_PICT_NO_IMAGEID_TEMPLATE = """
<w:p w:rsidR="00E94BDC" w:rsidRPr="003638EA" w:rsidRDefault="00E94BDC" w:rsidP="00E94BDC">
<w:pPr>
<w:rPr>
<w:color w:val="000000"/>
</w:rPr>
</w:pPr>
<w:r w:rsidR="00360165">
<w:rPr>
<w:b/>
<w:color w:val="000000"/>
</w:rPr>
<w:pict>
<v:shape id="_x0000_i1027" type="#_x0000_t75" style="width:99.75pt;height:116.25pt">
</v:shape>
</w:pict>
</w:r>
</w:p>
""".strip()


def assert_html_equal(actual_html, expected_html):
assert collapse_html(
actual_html,
Expand Down
21 changes: 21 additions & 0 deletions docx2html/tests/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
DOCUMENT_DRAWING_TEMPLATE,
DOCUMENT_LI_TEMPLATE,
DOCUMENT_PICT_TEMPLATE,
DOCUMENT_PICT_NO_IMAGEID_TEMPLATE,
DOCUMENT_P_TEMPLATE,
DOCUMENT_TBL_TEMPLATE,
DOCUMENT_TC_TEMPLATE,
Expand Down Expand Up @@ -402,3 +403,23 @@ def test_image_id_for_pict(self):
pict_tag = pict_tags[0]
image_id = get_image_id(pict_tag)
self.assertEqual(image_id, 'rId0')


class PictImageTestCase(_TranslationTestCase):
    """Translate a document whose only content is a pict without an image id.

    The expected result is an empty ``<html/>`` element.
    """

    expected_output = '\n<html/>\n'

    def get_xml(self):
        """Return the parsed document XML wrapping the image-id-less pict."""
        fragments = (DOCUMENT_PICT_NO_IMAGEID_TEMPLATE,)
        document = DOCUMENT_XML_TEMPLATE % {
            'body': ''.join(fragments),
        }
        return etree.fromstring(document)
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ def get_readme():
scripts=[],
zip_safe=False,
install_requires=['lxml==2.2.4', 'pillow==1.7.7'],
tests_require=['nose'],
cmdclass={},
classifiers=[
"Development Status :: 3 - Alpha",
Expand Down
2 changes: 2 additions & 0 deletions test_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
nose
mock