PolicyStat · jlward · Jan 14, 2013 · Jan 11, 2013 · Jan 11, 2013 · Jan 11, 2013
diff --git a/README.md b/README.md
@@ -1,23 +1,30 @@
-docx2html
-=========
+# docx2html
 
-Convert a docx (OOXML) file to html
+Convert a docx (OOXML) file to semantic HTML.
+All of Word's formatting nonsense is stripped away and
+you're left with a cleanly-formatted version of the content.
 
-Usage
-=====
+
+## Usage
 
     >>> from docx2html import convert
     >>> html = convert('path/to/docx/file')
 
 
-Running Tests
-=============
+## Running Tests for Development
+
 
+	$ virtualenv path/to/new/virtualenv
+	$ source path/to/new/virtualenv/bin/activate
+	$ cd path/to/workspace
+	$ git clone git://github.com/PolicyStat/docx2html.git
+	$ cd docx2html
+	$ pip install .
+	$ pip install -r test_requirements.txt
     $ ./run_tests.sh
 
 
-Description
-===========
+## Description
 
 docx2html is designed to take a docx file and extract the content out and
 convert that content to html. It does not care about styles or fonts or
@@ -46,3 +53,26 @@ is a list of what currently works:
     * Simple headings
     * Root level lists that are upper case roman numerals get converted to h2
       tags
+
+### Handling embedded images
+
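+docx2html allows you to specify how embedded images are handled by passing an `image_handler` callable to `convert`.
+For example, you might upload your images to Amazon S3. The example below simply copies each image to `/tmp` and returns the `src` to use for it.
+Note: This documentation is sparse, so you might need to read the source.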
+
+	import os.path
+	from shutil import copyfile
+
+    from docx2html import convert
+
+	def handle_image(image_id, relationship_dict):
+		image_path = relationship_dict[image_id]
+		# Now do something with the image. Let's copy it somewhere.
+		_, filename = os.path.split(image_path)
+		destination_path = os.path.join('/tmp', filename)
+		copyfile(image_path, destination_path)
+
+		# Return the `src` attribute to be used in the img tag
+		return 'file://%s' % destination_path
+
+    html = convert('path/to/docx/file', image_handler=handle_image)
diff --git a/docx2html/core.py b/docx2html/core.py
@@ -1144,6 +1144,9 @@ def get_p_data(p, meta_data, is_td=False):
                 p_text += '<br />'
             else:  # We have an image
                 image_id = get_image_id(child)
+                if image_id not in meta_data.relationship_dict:
+                    # No relationship entry (e.g. missing image_id); skip it.
+                    continue
                 src = meta_data.image_handler(
                     image_id,
                     meta_data.relationship_dict,

diff --git a/docx2html/tests/__init__.py b/docx2html/tests/__init__.py
@@ -187,6 +187,27 @@
 """.strip()
 
 
+DOCUMENT_PICT_NO_IMAGEID_TEMPLATE = """
+    <w:p w:rsidR="00E94BDC" w:rsidRPr="003638EA" w:rsidRDefault="00E94BDC" w:rsidP="00E94BDC">
+        <w:pPr>
+            <w:rPr>
+                <w:color w:val="000000"/>
+            </w:rPr>
+        </w:pPr>
+        <w:r w:rsidR="00360165">
+            <w:rPr>
+                <w:b/>
+                <w:color w:val="000000"/>
+            </w:rPr>
+            <w:pict>
+                <v:shape id="_x0000_i1027" type="#_x0000_t75" style="width:99.75pt;height:116.25pt">
+                </v:shape>
+            </w:pict>
+        </w:r>
+    </w:p>
+""".strip()
+
+
 def assert_html_equal(actual_html, expected_html):
     assert collapse_html(
         actual_html,

diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py
@@ -16,6 +16,7 @@
     DOCUMENT_DRAWING_TEMPLATE,
     DOCUMENT_LI_TEMPLATE,
     DOCUMENT_PICT_TEMPLATE,
+    DOCUMENT_PICT_NO_IMAGEID_TEMPLATE,
     DOCUMENT_P_TEMPLATE,
     DOCUMENT_TBL_TEMPLATE,
     DOCUMENT_TC_TEMPLATE,
@@ -402,3 +403,23 @@ def test_image_id_for_pict(self):
         pict_tag = pict_tags[0]
         image_id = get_image_id(pict_tag)
         self.assertEqual(image_id, 'rId0')
+
+
+class PictImageTestCase(_TranslationTestCase):
+    expected_output = '''
+        <html/>
+    '''
+
+    def get_xml(self):
+        pict = DOCUMENT_PICT_NO_IMAGEID_TEMPLATE
+        tags = [
+            pict,
+        ]
+        body = ''
+        for el in tags:
+            body += el
+
+        xml = DOCUMENT_XML_TEMPLATE % {
+            'body': body,
+        }
+        return etree.fromstring(xml)
diff --git a/setup.py b/setup.py
@@ -30,7 +30,6 @@ def get_readme():
     scripts=[],
     zip_safe=False,
     install_requires=['lxml==2.2.4', 'pillow==1.7.7'],
-    tests_require=['nose'],
     cmdclass={},
     classifiers=[
         "Development Status :: 3 - Alpha",

diff --git a/test_requirements.txt b/test_requirements.txt
@@ -0,0 +1,2 @@
+nose
+mock