diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..8aaedb5
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,12 @@
+language: python
+python:
+  - "2.6"
+  - "2.7"
+  - "3.3"
+script: ./run_tests.sh
+install:
+  - python setup.py -q install
+  - pip install -r test_requirements.txt
+notifications:
+  email:
+    - jason.louard.ward@gmail.com
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..1f78556
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,3 @@
+Jason Ward <jason.louard.ward@gmail.com>
+Wes Winham <winhamwr@gmail.com>
+Kyle Gibson <kyle.gibson@policystat.com>
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..7f6a437
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,31 @@
+
+Changelog
+=========
+
+* 0.1.7
+    * If the indentation levels of a set of lists (with the same list id) were
+      mangled (starting off with a higher indentation level followed by a
+      lower) then the entire sub list (the list with the lower indentation
+      level) would not be added to the root list. This would result in removing
+      the mangled list from the final output. This issue has been addressed.
+* 0.1.6
+    * Header detection was relying on case. However it is possible for a lower
+      case version of headers to show up. Those are now handled correctly.
+* 0.1.4
+    * Added a function to remove tags, in addition stripped 'sectPr' tags since
+      they have to do with headers and footers.
+* 0.1.3
+    * Hyperlinks with no text no longer throw an error
+    * Fixed a bug with determining the font size with an incomplete styles dict
+* 0.1.2
+    * Fixed a bug with determining the font size of a paragraph tag
+* 0.1.1
+    * Added a changelog
+    * Styles are now stripped from hyperlinks
+    * jinja2 is now used to render test xml
+* 0.1.0
+    * Correctly handle tables and paragraphs in lists. Before, if there was a
+      table in a list it would break the list into two halves, the half before
+      the table and the half after the table (with the table in between them). Now
+      if there is a table or paragraph in a list those elements get rolled into
+      the list.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..493f32d
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,6 @@
+include AUTHORS
+include CHANGELOG
+include LICENSE
+include MANIFEST.in
+include README.md
+include docx2html/fixtures/*
diff --git a/README.md b/README.md
index 3a4d871..1c4fc7f 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,12 @@
+=========
 docx2html
 =========
 
-Convert a docx (OOXML) file to html
+
+Convert a docx (OOXML) file to semantic HTML.
+All of Word's formatting nonsense is stripped away and
+you're left with a cleanly-formatted version of the content.
+
 
 Usage
 =====
@@ -10,11 +15,19 @@ Usage
     >>> html = convert('path/to/docx/file')
 
 
-Running Tests
-=============
+Running Tests for Development
+=============================
 
-    $ ./run_tests.sh
+::
 
+     $ virtualenv path/to/new/virtualenv
+     $ source path/to/new/virtualenv/bin/activate
+     $ cd path/to/workspace
+     $ git clone git://github.com/PolicyStat/docx2html.git
+     $ cd docx2html
+     $ pip install .
+     $ pip install -r test_requirements.txt
+     $ ./run_tests.sh
 
 Description
 ===========
@@ -32,6 +45,8 @@ is a list of what currently works:
 * Lists
     * Nested lists
     * List styles (letters, roman numerals, etc.)
+    * Tables
+    * Paragraphs
 * Tables
     * Rowspans
     * Colspans
@@ -46,3 +61,29 @@ is a list of what currently works:
     * Simple headings
     * Root level lists that are upper case roman numerals get converted to h2
       tags
+
+Handling embedded images
+------------------------
+
+docx2html allows you to specify how you would like to handle image uploading.
+For example, you might upload your images to Amazon S3, or copy them as below.
+Note: This documentation is incomplete, so you might need to read the source.
+
+::
+
+    import os.path
+    from shutil import copyfile
+
+    from docx2html import convert
+
+    def handle_image(image_id, relationship_dict):
+        image_path = relationship_dict[image_id]
+        # Now do something to the image. Let's move it somewhere.
+        _, filename = os.path.split(image_path)
+        destination_path = os.path.join('/tmp', filename)
+        copyfile(image_path, destination_path)
+
+        # Return the `src` attribute to be used in the img tag
+        return 'file://%s' % destination_path
+
+    html = convert('path/to/docx/file', image_handler=handle_image)
diff --git a/docx2html/__init__.py b/docx2html/__init__.py
index c6328aa..625c109 100644
--- a/docx2html/__init__.py
+++ b/docx2html/__init__.py
@@ -1,5 +1,5 @@
 from docx2html.core import convert
 
 __all__ = [
-    convert.func_name,
+    convert.__name__,
 ]
diff --git a/docx2html/converters.py b/docx2html/converters.py
new file mode 100644
index 0000000..36bc283
--- /dev/null
+++ b/docx2html/converters.py
@@ -0,0 +1,17 @@
+import subprocess
+
+
+def convert_with_abiword(docx_path, file_path):
+    """
+    This will convert ``file_path`` to docx and place the converted file at
+    ``docx_path``
+    """
+    subprocess.call(
+        [
+            'abiword',
+            '--to=docx',
+            '--to-name',
+            docx_path,
+            file_path,
+        ],
+    )
diff --git a/docx2html/core.py b/docx2html/core.py
index 72f0370..3aafcd4 100644
--- a/docx2html/core.py
+++ b/docx2html/core.py
@@ -1,6 +1,8 @@
 import cgi
 import os
-import subprocess
+import os.path
+import re
+import sys
 from PIL import Image
 from lxml import etree
 from lxml.etree import XMLSyntaxError
@@ -8,29 +10,42 @@
 from collections import namedtuple, defaultdict
 from zipfile import ZipFile, BadZipfile
 
+from docx2html.exceptions import (
+    ConversionFailed,
+    FileNotDocx,
+    MalformedDocx,
+    UnintendedTag,
+    SyntaxNotSupported,
+)
+
+
+PYTHON_VERSION = sys.version[0]
+
+
+def _get_etree_tostring_kwargs():
+    tostring_kwargs = {
+        'method': 'text',
+        'encoding': string_function,
+    }
+    return tostring_kwargs
+
+
+def _string_function():
+    if PYTHON_VERSION == '2':
+        return unicode  # noqa
+    elif PYTHON_VERSION == '3':
+        return str
+    raise NotImplementedError('Your version of python is not supported')
+string_function = _string_function()
+
 DETECT_FONT_SIZE = False
 EMUS_PER_PIXEL = 9525
-# Abiword supported formats
-VALID_EXTRACT_EXTENSIONS = [
-    '.doc', '.docx', '.dotx', '.docm', '.dotm', '.wri', '.rtf', '.txt',
-    '.text', '.wpd', '.wp', '.odt', '.ott', '.abw', '.atw', '.pdf', '.html',
-    '.dot',
-]
 
 ###
 # Help functions
 ###
 
 
-def is_extractable(path):
-    """
-    Determine if a file is something that we can extract.
-    """
-    _, extension = os.path.splitext(path)
-    extension = extension.lower()
-    return (extension in VALID_EXTRACT_EXTENSIONS)
-
-
 def replace_ext(file_path, new_ext):
     """
     >>> replace_ext('one/two/three.four.doc', '.html')
@@ -129,17 +144,29 @@ def get_font_size(p, styles_dict):
             return None
         pStyle = pStyle.get('%sval' % w_namespace)
         font_size = None
-        if 'font_size' in styles_dict[pStyle]:
+        style_value = styles_dict.get(pStyle, None)
+        if style_value is None:
+            return None
+        if 'font_size' in style_value:
             font_size = styles_dict[pStyle]['font_size']
         while font_size is None:
             old_pStyle = pStyle
+            # If pStyle is not in the styles_dict then we have to break.
             if pStyle not in styles_dict:
                 break
+            # If based on is not in the styles_dict for pStyle then we have to
+            # break.
             if 'based_on' not in styles_dict[pStyle]:
                 break
+            # Try to derive what the font size is based on what the current
+            # style is based on.
             pStyle = styles_dict[pStyle]['based_on']
             if old_pStyle == pStyle:
                 break
+            # If pStyle is not in styles_dict then break.
+            if pStyle not in styles_dict:
+                break
+            # We have found a new font size
             font_size = styles_dict[pStyle]['font_size']
         return font_size
 
@@ -159,13 +186,14 @@ def is_natural_header(el, styles_dict):
     if (
             style_id in styles_dict and
             'header' in styles_dict[style_id] and
-            styles_dict[style_id]['header']
-        ):
+            styles_dict[style_id]['header']):
         return styles_dict[style_id]['header']
 
 
 @ensure_tag(['p'])
 def is_header(el, meta_data):
+    if _is_top_level_upper_roman(el, meta_data):
+        return 'h2'
     el_is_natural_header = is_natural_header(el, meta_data.styles_dict)
     if el_is_natural_header:
         return el_is_natural_header
@@ -187,13 +215,10 @@ def is_header(el, meta_data):
 
     # If a paragraph is longer than eight words it is likely not supposed to be
     # an h tag.
-    num_words = len(
-        etree.tostring(
-            el,
-            encoding=unicode,
-            method='text',
-        ).split(' ')
-    )
+    tostring_kwargs = _get_etree_tostring_kwargs()
+    text = etree.tostring(el, **tostring_kwargs)
+    text = string_function(text)
+    num_words = len(text.split(' '))
     if num_words > 8:
         return False
 
@@ -201,6 +226,7 @@ def is_header(el, meta_data):
     whole_line_bold, whole_line_italics = whole_line_styled(el)
     if whole_line_bold or whole_line_italics:
         return 'h2'
+
     return False
 
 
@@ -241,7 +267,34 @@ def has_text(p):
     this is the case we do not want that tag interfering with things like
     lists. Detect if this tag has any content.
     """
-    return etree.tostring(p, encoding=unicode, method='text') != ''
+    tostring_kwargs = _get_etree_tostring_kwargs()
+    return '' != etree.tostring(p, **tostring_kwargs).strip()
+
+
+def is_last_li(li, meta_data, current_numId):
+    """
+    Determine if ``li`` is the last list item for a given list
+    """
+    if not is_li(li, meta_data):
+        return False
+    w_namespace = get_namespace(li, 'w')
+    next_el = li
+    while True:
+        # If we run out of element this must be the last list item
+        if next_el is None:
+            return True
+
+        next_el = next_el.getnext()
+        # Ignore elements that are not a list item
+        if not is_li(next_el, meta_data):
+            continue
+
+        new_numId = get_numId(next_el, w_namespace)
+        if current_numId != new_numId:
+            return True
+        # If we have gotten here then we have found another list item in the
+        # current list, so ``li`` is not the last li in the list.
+        return False
 
 
 @ensure_tag(['p'])
@@ -249,9 +302,10 @@ def get_li_nodes(li, meta_data):
     """
     Find consecutive li tags that have content that have the same list id.
     """
-    w_namespace = get_namespace(li, 'w')
     yield li
+    w_namespace = get_namespace(li, 'w')
     current_numId = get_numId(li, w_namespace)
+    starting_ilvl = get_ilvl(li, w_namespace)
     el = li
     while True:
         el = el.getnext()
@@ -260,20 +314,24 @@ def get_li_nodes(li, meta_data):
         # If the tag has no content ignore it.
         if not has_text(el):
             continue
-        # If the next tag is not an li tag then we have found the end of the
-        # list.
-        if not is_li(el, meta_data):
-            break
 
         # Stop the lists if you come across a list item that should be a
         # heading.
         if _is_top_level_upper_roman(el, meta_data):
             break
 
+        if (
+                is_li(el, meta_data) and
+                (starting_ilvl > get_ilvl(el, w_namespace))):
+            break
+
         # If the list id of the next tag is different that the previous that
         # means a new list being made (not nested)
-        numId = get_numId(el, w_namespace)
-        if current_numId != numId:
+        if is_last_li(el, meta_data, current_numId):
+            new_numId = get_numId(el, w_namespace)
+            if current_numId == new_numId:
+                # Not a subsequent list.
+                yield el
             break
         yield el
 
@@ -624,21 +682,21 @@ def get_style_dict(tree):
     # This is a partial document and actual h1 is the document title, which
     # will be displayed elsewhere.
     headers = {
-        'Heading 1': 'h2',
-        'Heading 2': 'h3',
-        'Heading 3': 'h4',
-        'Heading 4': 'h5',
-        'Heading 5': 'h6',
-        'Heading 6': 'h6',
-        'Heading 7': 'h6',
-        'Heading 8': 'h6',
-        'Heading 9': 'h6',
-        'Heading 10': 'h6',
+        'heading 1': 'h2',
+        'heading 2': 'h3',
+        'heading 3': 'h4',
+        'heading 4': 'h5',
+        'heading 5': 'h6',
+        'heading 6': 'h6',
+        'heading 7': 'h6',
+        'heading 8': 'h6',
+        'heading 9': 'h6',
+        'heading 10': 'h6',
     }
     if tree is None:
         return {}
     w_namespace = get_namespace(tree, 'w')
-    result = defaultdict(dict)
+    result = {}
     for el in tree:
         style_id = el.get('%sstyleId' % w_namespace)
         el_result = {
@@ -650,7 +708,7 @@ def get_style_dict(tree):
         name = el.find('%sname' % w_namespace)
         if name is None:
             continue
-        value = name.get('%sval' % w_namespace)
+        value = name.get('%sval' % w_namespace).lower()
         if value in headers:
             el_result['header'] = headers[value]
 
@@ -690,7 +748,7 @@ def get_image_sizes(tree):
                 ext = el.find('%sext' % a_namespace)
                 cx = int(ext.get('cx')) / EMUS_PER_PIXEL
                 cy = int(ext.get('cy')) / EMUS_PER_PIXEL
-                result[get_image_id(d)] = (cx, cy)
+                result[get_image_id(d)] = (int(cx), int(cy))
     return result
 
 
@@ -845,13 +903,67 @@ def get_list_data(li_nodes, meta_data):
     # Store the first list created (the root list) for the return value.
     root_ol = None
     visited_nodes = []
+    list_contents = []
+
+    def _build_li(list_contents):
+        data = '<br />'.join(
+            string_function(t) for t in list_contents if
+            t is not None
+        )
+        return etree.XML('<li>%s</li>' % data)
+
+    def _build_non_li_content(el, meta_data):
+        w_namespace = get_namespace(el, 'w')
+        if el.tag == '%stbl' % w_namespace:
+            new_el, visited_nodes = get_table_data(el, meta_data)
+            li_content_text = etree.tostring(
+                new_el,
+                encoding=string_function,
+            )
+            return li_content_text, visited_nodes
+        elif el.tag == '%sp' % w_namespace:
+            return get_p_data(el, meta_data), [el]
+        if has_text(el):
+            raise UnintendedTag('Did not expect %s' % el.tag)
+
+    def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol):
+        for i in reversed(range(ilvl, current_ilvl)):
+            # Any list that is more indented that ilvl needs to
+            # be merged to the list before it.
+            if i not in ol_dict:
+                continue
+            if ol_dict[i] is not current_ol:
+                if ol_dict[i] is current_ol:
+                    continue
+                ol_dict[i][-1].append(current_ol)
+                current_ol = ol_dict[i]
+
+        # Clean up finished nested lists.
+        for key in list(ol_dict):
+            if key > ilvl:
+                del ol_dict[key]
+        return current_ol
+
     for li_node in li_nodes:
         w_namespace = get_namespace(li_node, 'w')
+        if not is_li(li_node, meta_data):
+            # Get the content and visited nodes
+            new_el, el_visited_nodes = _build_non_li_content(
+                li_node,
+                meta_data,
+            )
+            list_contents.append(new_el)
+            visited_nodes.extend(el_visited_nodes)
+            continue
+        if list_contents:
+            li_el = _build_li(list_contents)
+            list_contents = []
+            current_ol.append(li_el)
         # Get the data needed to build the current list item
-        text = get_p_data(
+        list_contents.append(get_p_data(
             li_node,
             meta_data,
-        )
+        ))
         ilvl = get_ilvl(li_node, w_namespace)
         numId = get_numId(li_node, w_namespace)
         list_type = meta_data.numbering_dict[numId].get(ilvl, 'decimal')
@@ -870,21 +982,12 @@ def get_list_data(li_nodes, meta_data):
         # than ilvl and then remove them from the ol_dict
         else:
             # Merge any nested lists that need to be merged.
-            for i in reversed(range(ilvl, current_ilvl)):
-                # Any list that is more indented that ilvl needs to
-                # be merged to the list before it.
-                if i not in ol_dict:
-                    continue
-                if ol_dict[i] is not current_ol:
-                    if ol_dict[i] is current_ol:
-                        continue
-                    ol_dict[i][-1].append(current_ol)
-                    current_ol = ol_dict[i]
-
-            # Clean up finished nested lists.
-            for key in list(ol_dict):
-                if key > ilvl:
-                    del ol_dict[key]
+            current_ol = _merge_lists(
+                ilvl=ilvl,
+                current_ilvl=current_ilvl,
+                ol_dict=ol_dict,
+                current_ol=current_ol,
+            )
 
         # Set the root list after the first list is created.
         if root_ol is None:
@@ -903,21 +1006,23 @@ def get_list_data(li_nodes, meta_data):
             current_ol = ol_dict[ilvl]
 
         # Create the li element.
-        li_el = etree.XML('<li>%s</li>' % text)
-        current_ol.append(li_el)
         visited_nodes.extend(list(li_node.iter()))
 
+    # If a list item is the last thing in a document, then you will need to add
+    # it here. Should probably figure out how to get the above logic to deal
+    # with it.
+    if list_contents:
+        li_el = _build_li(list_contents)
+        list_contents = []
+        current_ol.append(li_el)
+
     # Merge up any nested lists that have not been merged.
-    for i in reversed(range(0, current_ilvl)):
-        if i not in ol_dict:
-            continue
-        # If we do not do this check it is possible to create an infinite loop
-        # in etree.
-        if ol_dict[i] is current_ol:
-            continue
-        # append the current ol to the end of the last li tag.
-        ol_dict[i][-1].append(current_ol)
-        current_ol = ol_dict[i]
+    current_ol = _merge_lists(
+        ilvl=0,
+        current_ilvl=current_ilvl,
+        ol_dict=ol_dict,
+        current_ol=current_ol,
+    )
 
     return root_ol, visited_nodes
 
@@ -943,13 +1048,9 @@ def get_tr_data(tr, meta_data, row_spans):
             # ignored.
             if (
                     v_merge is not None and
-                    v_merge.get('%sval' % w_namespace) != 'restart'
-                ):
+                    v_merge.get('%sval' % w_namespace) != 'restart'):
                 continue
 
-            # Create the td element with all the text break-joined.
-            td_el = etree.XML('<td></td>')
-
             # Loop through each and build a list of all the content.
             texts = []
             for td_content in el:
@@ -968,14 +1069,22 @@ def get_tr_data(tr, meta_data, row_spans):
                         meta_data,
                     )
                     visited_nodes.extend(list_visited_nodes)
-                    texts.append(etree.tostring(list_el))
+                    list_el_text = etree.tostring(
+                        list_el,
+                        encoding=string_function,
+                    )
+                    texts.append(list_el_text)
                 elif td_content.tag == '%stbl' % w_namespace:
                     table_el, table_visited_nodes = get_table_data(
                         td_content,
                         meta_data,
                     )
                     visited_nodes.extend(table_visited_nodes)
-                    texts.append(etree.tostring(table_el))
+                    table_el_text = etree.tostring(
+                        table_el,
+                        encoding=string_function,
+                    )
+                    texts.append(table_el_text)
                 elif td_content.tag == '%stcPr' % w_namespace:
                     # Do nothing
                     visited_nodes.append(td_content)
@@ -988,7 +1097,7 @@ def get_tr_data(tr, meta_data, row_spans):
                     )
                     texts.append(text)
 
-            data = '<br/>'.join(texts)
+            data = '<br />'.join(t for t in texts if t is not None)
             td_el = etree.XML('<td>%s</td>' % data)
             # if there is a colspan then set it here.
             colspan = get_grid_span(el)
@@ -1000,8 +1109,7 @@ def get_tr_data(tr, meta_data, row_spans):
             # here.
             if (
                     v_merge is not None and
-                    v_merge.get('%sval' % w_namespace) == 'restart'
-                ):
+                    v_merge.get('%sval' % w_namespace) == 'restart'):
                 rowspan = next(row_spans)
                 td_el.set('rowspan', '%d' % rowspan)
 
@@ -1053,7 +1161,8 @@ def handle_t_tag(
         # The relationship_id is the href
         if hyperlink_id in meta_data.relationship_dict:
             href = meta_data.relationship_dict[hyperlink_id]
-            text = '<a href="%s">%s</a>' % (href, text)
+            # Do not do any styling on hyperlinks
+            return '<a href="%s">%s</a>' % (href, text)
     # Wrap the text with any modifiers it might have (bold, italics or
     # underline)
     el_is_bold = not remove_bold and (
@@ -1084,14 +1193,14 @@ def get_p_data(p, meta_data, is_td=False):
     """
     remove_italics = False
     remove_bold = False
-    if not is_td and not is_li(p, meta_data):
-        # Check to see if the whole line is bold or italics.
-        whole_line_bold, whole_line_italics = whole_line_styled(p)
-        p_is_header = bool(is_header(p, meta_data) and not is_li(p, meta_data))
 
-        # Only remove bold or italics if this tag is an h tag.
-        remove_bold = p_is_header and whole_line_bold
-        remove_italics = p_is_header and whole_line_italics
+    # Only remove bold or italics if this tag is an h tag.
+    # Td elements have the same look and feel as p/h elements. Right now we are
+    # never putting h tags in td elements, as such if we are in a td we will
+    # never be stripping bold/italics since that is only done on h tags
+    if not is_td and is_header(p, meta_data):
+        # Check to see if the whole line is bold or italics.
+        remove_bold, remove_italics = whole_line_styled(p)
 
     p_text = ''
     w_namespace = get_namespace(p, 'w')
@@ -1136,7 +1245,20 @@ def get_p_data(p, meta_data, is_td=False):
 
             # Once we have the hyperlink_id then we need to replace the
             # hyperlink tag with its child run tag.
-            el = el.find('%sr' % w_namespace)
+            child_run_tag = el.find('%sr' % w_namespace)
+            if child_run_tag is None:
+                if has_text(el):
+                    # If there is text in this hyperlink we need to raise an
+                    # exception so that we don't lose content.
+                    raise SyntaxNotSupported(
+                        'Hyperlink with text outside run tags not supported.',
+                    )
+                # It is very likely that this was a hyperlink tag that had its
+                # content removed, office does not do a very good job at
+                # cleaning up old tags, as such this tag has no content and
+                # should be ignored.
+                continue
+            el = child_run_tag
 
         # t tags hold all the text content.
         for child in get_raw_data(el):
@@ -1153,6 +1275,9 @@ def get_p_data(p, meta_data, is_td=False):
                 p_text += '<br />'
             else:  # We have an image
                 image_id = get_image_id(child)
+                if image_id not in meta_data.relationship_dict:
+                    # This image does not have an image_id
+                    continue
                 src = meta_data.image_handler(
                     image_id,
                     meta_data.relationship_dict,
@@ -1173,23 +1298,43 @@ def get_p_data(p, meta_data, is_td=False):
     return p_text
 
 
+def _strip_tag(tree, tag):
+    """
+    Remove every element named ``tag`` from ``tree`` (in place).
+    """
+    for el in list(tree.iter(tag)):  # snapshot: removing mid-walk can skip
+        if el.getparent() is not None:  # a matching root cannot be removed
+            el.getparent().remove(el)
+
+
 def get_zip_file_handler(file_path):
     return ZipFile(file_path)
 
 
-def convert(file_path, image_handler=None, fall_back=None):
-    file_base, extension = os.path.splitext(os.path.basename(file_path))
+def read_html_file(file_path):
+    with open(file_path) as f:
+        html = f.read()
+    return html
 
-    if not is_extractable(file_path):
-        #XXX create better exception, used to be InvalidFileExtension
-        raise Exception(
-            'The file type "%s" is not supported' % extension
-        )
 
-    if extension == '.html':
-        with open(file_path) as f:
-            html = f.read()
-        return html
+def convert(file_path, image_handler=None, fall_back=None, converter=None):
+    """
+    ``file_path`` is a path to the file on the file system that you want to be
+        converted to html.
+    ``image_handler`` is a function that takes an image_id and a
+        relationship_dict to generate the src attribute for images. (see readme
+        for more details)
+    ``fall_back`` is a function that takes a ``file_path``. This function will
+        only be called if for whatever reason the conversion fails.
+    ``converter`` is a function to convert a document that is not docx to docx
+        (examples in docx2html.converters)
+
+    Returns html extracted from ``file_path``
+    """
+    file_base, extension = os.path.splitext(os.path.basename(file_path))
+
+    if extension == '.html' or extension == '.htm':
+        return read_html_file(file_path)
 
     # Create the converted file as a file in the same dir with the
     # same name only with a .docx extension
@@ -1198,25 +1343,20 @@ def convert(file_path, image_handler=None, fall_back=None):
         # If the file is already html, just leave it in place.
         docx_path = file_path
     else:
-        # Convert the file to docx
-        # TODO make this configurable.
-        subprocess.call(
-            ['abiword', '--to=docx', '--to-name', docx_path, file_path],
-        )
+        if converter is None:
+            raise FileNotDocx('The file passed in is not a docx.')
+        converter(docx_path, file_path)
+        if not os.path.isfile(docx_path):
+            if fall_back is None:
+                raise ConversionFailed('Conversion to docx failed.')
+            else:
+                return fall_back(file_path)
+
     try:
         # Docx files are actually just zip files.
         zf = get_zip_file_handler(docx_path)
     except BadZipfile:
-        # If its a malformed zip file raise InvalidFileExtension
-        # XXX
-        raise Exception('This file is not a docx')
-    except IOError:
-        # This means that the conversion from abiword failed.
-        if fall_back is not None:
-            return fall_back(file_path)
-        else:
-            # XXX
-            raise Exception('Conversion to docx failed.')
+        raise MalformedDocx('This file is not a docx')
 
     # Need to populate the xml based on word/document.xml
     tree, meta_data = _get_document_data(zf, image_handler)
@@ -1230,33 +1370,49 @@ def create_html(tree, meta_data):
 
     w_namespace = get_namespace(tree, 'w')
     visited_nodes = []
+
+    _strip_tag(tree, '%ssectPr' % w_namespace)
     for el in tree.iter():
         # The way lists are handled could double visit certain elements; keep
         # track of which elements have been visited and skip any that have been
         # visited already.
         if el in visited_nodes:
             continue
-        if el.tag == '%sp' % w_namespace:
-            # If this is true we have a bullet in some list
+        header_value = is_header(el, meta_data)
+        if is_header(el, meta_data):
+            p_text = get_p_data(el, meta_data)
+            if p_text == '':
+                continue
+            new_html.append(
+                etree.XML('<%s>%s</%s>' % (
+                    header_value,
+                    p_text,
+                    header_value,
+                ))
+            )
+        elif el.tag == '%sp' % w_namespace:
+            # Strip out titles.
+            if is_title(el):
+                continue
             if is_li(el, meta_data):
-                # This should be a header instead.
-                if _is_top_level_upper_roman(el, meta_data):
-                    p_text = get_p_data(el, meta_data)
-                    new_html.append(
-                        etree.XML('<h2>%s</h2>' % p_text)
-                    )
-                    continue
                 # Parse out the needed info from the node.
                 li_nodes = get_li_nodes(el, meta_data)
-                list_el, list_visited_nodes = get_list_data(
+                new_el, list_visited_nodes = get_list_data(
                     li_nodes,
                     meta_data,
                 )
                 visited_nodes.extend(list_visited_nodes)
-                new_html.append(list_el)
-                continue
+            # Handle generic p tag here.
+            else:
+                p_text = get_p_data(el, meta_data)
+                # If there is not text do not add an empty tag.
+                if p_text == '':
+                    continue
 
-        if el.tag == '%stbl' % w_namespace:
+                new_el = etree.XML('<p>%s</p>' % p_text)
+            new_html.append(new_el)
+
+        elif el.tag == '%stbl' % w_namespace:
             table_el, table_visited_nodes = get_table_data(
                 el,
                 meta_data,
@@ -1265,33 +1421,29 @@ def create_html(tree, meta_data):
             new_html.append(table_el)
             continue
 
-        # Handle generic p tag here.
-        if el.tag == '%sp' % w_namespace:
-            # Strip out titles.
-            if is_title(el):
-                continue
-
-            # If there is not text do not add an empty tag.
-            p_text = get_p_data(el, meta_data)
-            if p_text == '':
-                continue
-
-            # Check to see if its a header
-            header_value = is_header(el, meta_data)
-            if header_value:
-                # Make a header based of the header_value
-                new_html.append(
-                    etree.XML('<%s>%s</%s>' % (
-                        header_value,
-                        p_text,
-                        header_value,
-                    ))
-                )
-            else:
-                # Make a paragraph
-                new_html.append(etree.XML('<p>%s</p>' % p_text))
-            continue
-
         # Keep track of visited_nodes
         visited_nodes.append(el)
-    return etree.tostring(new_html)
+    result = etree.tostring(
+        new_html,
+        method='html',
+        encoding=string_function,
+        with_tail=True,
+    )
+    return _make_void_elements_self_close(result)
+
+
+def _make_void_elements_self_close(html):
+    """
+    Rewrite ``<br ...>`` and ``<img ...>`` tags in ``html`` as ``<tag ... />``.
+    XXX Hack: not sure how to get etree to do this by default.
+    """
+    html = string_function(html)
+    for tag in ('br', 'img'):
+        # ``\b`` keeps a longer tag name from being mistaken for ``tag``.
+        for match in set(re.findall(r'<%s\b[^>]*>' % tag, html)):
+            if match.endswith('/>'):
+                # Already self-closing; leave it alone.
+                continue
+            # Literal replace: ``match`` may contain regex metacharacters.
+            html = html.replace(match, '<%s />' % match.strip('<>'))
+    return html
diff --git a/docx2html/exceptions.py b/docx2html/exceptions.py
new file mode 100644
index 0000000..d50f1a0
--- /dev/null
+++ b/docx2html/exceptions.py
@@ -0,0 +1,22 @@
+class Docx2HtmlException(Exception):
+    pass
+
+
+class ConversionFailed(Docx2HtmlException):
+    pass
+
+
+class FileNotDocx(Docx2HtmlException):
+    pass
+
+
+class MalformedDocx(Docx2HtmlException):
+    pass
+
+
+class UnintendedTag(Docx2HtmlException):
+    pass
+
+
+class SyntaxNotSupported(Docx2HtmlException):
+    pass
diff --git a/docx2html/fixtures/attachment_is_tiff.docx b/docx2html/fixtures/attachment_is_tiff.docx
new file mode 100644
index 0000000..774362c
Binary files /dev/null and b/docx2html/fixtures/attachment_is_tiff.docx differ
diff --git a/docx2html/fixtures/bigger_font_size_to_header.docx b/docx2html/fixtures/bigger_font_size_to_header.docx
new file mode 100644
index 0000000..c722888
Binary files /dev/null and b/docx2html/fixtures/bigger_font_size_to_header.docx differ
diff --git a/docx2html/fixtures/convert_p_to_h.docx b/docx2html/fixtures/convert_p_to_h.docx
new file mode 100644
index 0000000..53769e1
Binary files /dev/null and b/docx2html/fixtures/convert_p_to_h.docx differ
diff --git a/docx2html/fixtures/fake_headings_by_length.docx b/docx2html/fixtures/fake_headings_by_length.docx
new file mode 100644
index 0000000..a130f5b
Binary files /dev/null and b/docx2html/fixtures/fake_headings_by_length.docx differ
diff --git a/docx2html/fixtures/greek_alphabet.docx b/docx2html/fixtures/greek_alphabet.docx
new file mode 100644
index 0000000..46ab542
Binary files /dev/null and b/docx2html/fixtures/greek_alphabet.docx differ
diff --git a/docx2html/fixtures/has_image.docx b/docx2html/fixtures/has_image.docx
new file mode 100644
index 0000000..2ebd0bd
Binary files /dev/null and b/docx2html/fixtures/has_image.docx differ
diff --git a/docx2html/fixtures/has_missing_image.docx b/docx2html/fixtures/has_missing_image.docx
new file mode 100644
index 0000000..996e667
Binary files /dev/null and b/docx2html/fixtures/has_missing_image.docx differ
diff --git a/docx2html/fixtures/has_title.docx b/docx2html/fixtures/has_title.docx
new file mode 100644
index 0000000..a87d88e
Binary files /dev/null and b/docx2html/fixtures/has_title.docx differ
diff --git a/docx2html/fixtures/header_footer_problem.docx b/docx2html/fixtures/header_footer_problem.docx
new file mode 100644
index 0000000..6bc49a7
Binary files /dev/null and b/docx2html/fixtures/header_footer_problem.docx differ
diff --git a/docx2html/fixtures/headers.docx b/docx2html/fixtures/headers.docx
new file mode 100644
index 0000000..890104c
Binary files /dev/null and b/docx2html/fixtures/headers.docx differ
diff --git a/docx2html/fixtures/headers_with_full_line_styles.docx b/docx2html/fixtures/headers_with_full_line_styles.docx
new file mode 100644
index 0000000..38d6f6a
Binary files /dev/null and b/docx2html/fixtures/headers_with_full_line_styles.docx differ
diff --git a/docx2html/fixtures/inline_tags.docx b/docx2html/fixtures/inline_tags.docx
new file mode 100644
index 0000000..4aba234
Binary files /dev/null and b/docx2html/fixtures/inline_tags.docx differ
diff --git a/docx2html/fixtures/list_in_table.docx b/docx2html/fixtures/list_in_table.docx
new file mode 100644
index 0000000..d1a8738
Binary files /dev/null and b/docx2html/fixtures/list_in_table.docx differ
diff --git a/docx2html/fixtures/list_to_header.docx b/docx2html/fixtures/list_to_header.docx
new file mode 100644
index 0000000..f9b3946
Binary files /dev/null and b/docx2html/fixtures/list_to_header.docx differ
diff --git a/docx2html/fixtures/lists_with_styles.docx b/docx2html/fixtures/lists_with_styles.docx
new file mode 100644
index 0000000..c1c7ecf
Binary files /dev/null and b/docx2html/fixtures/lists_with_styles.docx differ
diff --git a/docx2html/fixtures/missing_content.docx b/docx2html/fixtures/missing_content.docx
new file mode 100644
index 0000000..21bed96
Binary files /dev/null and b/docx2html/fixtures/missing_content.docx differ
diff --git a/docx2html/fixtures/nested_lists.docx b/docx2html/fixtures/nested_lists.docx
new file mode 100644
index 0000000..f4000df
Binary files /dev/null and b/docx2html/fixtures/nested_lists.docx differ
diff --git a/docx2html/fixtures/nested_table_rowspan.docx b/docx2html/fixtures/nested_table_rowspan.docx
new file mode 100644
index 0000000..b43b8a0
Binary files /dev/null and b/docx2html/fixtures/nested_table_rowspan.docx differ
diff --git a/docx2html/fixtures/nested_tables.docx b/docx2html/fixtures/nested_tables.docx
new file mode 100644
index 0000000..af704d4
Binary files /dev/null and b/docx2html/fixtures/nested_tables.docx differ
diff --git a/docx2html/fixtures/resized_image.docx b/docx2html/fixtures/resized_image.docx
new file mode 100644
index 0000000..913099c
Binary files /dev/null and b/docx2html/fixtures/resized_image.docx differ
diff --git a/docx2html/fixtures/shift_enter.docx b/docx2html/fixtures/shift_enter.docx
new file mode 100644
index 0000000..4128c0a
Binary files /dev/null and b/docx2html/fixtures/shift_enter.docx differ
diff --git a/docx2html/fixtures/simple.docx b/docx2html/fixtures/simple.docx
new file mode 100644
index 0000000..f75e5eb
Binary files /dev/null and b/docx2html/fixtures/simple.docx differ
diff --git a/docx2html/fixtures/simple_lists.docx b/docx2html/fixtures/simple_lists.docx
new file mode 100644
index 0000000..c09ad74
Binary files /dev/null and b/docx2html/fixtures/simple_lists.docx differ
diff --git a/docx2html/fixtures/special_chars.docx b/docx2html/fixtures/special_chars.docx
new file mode 100644
index 0000000..b4b9287
Binary files /dev/null and b/docx2html/fixtures/special_chars.docx differ
diff --git a/docx2html/fixtures/split_header.docx b/docx2html/fixtures/split_header.docx
new file mode 100644
index 0000000..cc4bd5c
Binary files /dev/null and b/docx2html/fixtures/split_header.docx differ
diff --git a/docx2html/fixtures/table_col_row_span.docx b/docx2html/fixtures/table_col_row_span.docx
new file mode 100644
index 0000000..856abfd
Binary files /dev/null and b/docx2html/fixtures/table_col_row_span.docx differ
diff --git a/docx2html/fixtures/tables_in_lists.docx b/docx2html/fixtures/tables_in_lists.docx
new file mode 100644
index 0000000..1185954
Binary files /dev/null and b/docx2html/fixtures/tables_in_lists.docx differ
diff --git a/docx2html/fixtures/track_changes_on.docx b/docx2html/fixtures/track_changes_on.docx
new file mode 100644
index 0000000..dcb7ba1
Binary files /dev/null and b/docx2html/fixtures/track_changes_on.docx differ
diff --git a/docx2html/fixtures/upper_alpha_all_bold.docx b/docx2html/fixtures/upper_alpha_all_bold.docx
new file mode 100644
index 0000000..d518b2c
Binary files /dev/null and b/docx2html/fixtures/upper_alpha_all_bold.docx differ
diff --git a/docx2html/tests.py b/docx2html/tests.py
deleted file mode 100644
index e69de29..0000000
diff --git a/docx2html/tests/__init__.py b/docx2html/tests/__init__.py
new file mode 100644
index 0000000..87359c8
--- /dev/null
+++ b/docx2html/tests/__init__.py
@@ -0,0 +1,126 @@
+from __future__ import print_function
+
+from unittest import TestCase
+import re
+
+from docx2html.core import (
+    MetaData,
+    create_html,
+    string_function,
+)
+
+
+def assert_html_equal(actual_html, expected_html):
+    """Assert both documents are equal once whitespace is collapsed."""
+    actual, expected = map(collapse_html, (actual_html, expected_html))
+    # Print both collapsed documents before comparing them.
+    print(actual, expected, sep='\n')
+    assert actual == expected
+
+
+def collapse_html(html):
+    """
+    Remove insignificant whitespace from the html.
+
+    >>> print(collapse_html('''\\
+    ...     <h1>
+    ...         Heading
+    ...     </h1>
+    ... '''))
+    <h1>Heading</h1>
+    >>> print(collapse_html('''\\
+    ...     <p>
+    ...         Paragraph with
+    ...         multiple lines.
+    ...     </p>
+    ... '''))
+    <p>Paragraph with multiple lines.</p>
+    """
+    def smart_space(match):
+        # Put a space in between lines, unless either side of the line
+        # break butts up against a tag.
+        before = match.group(1)
+        after = match.group(2)
+        space = ' '
+        if before == '>' or after == '<':
+            space = ''
+        return before + space + after
+    # Replace newlines and their surrounding whitespace with a single space (or
+    # empty string)
+    html = re.sub(
+        r'(>?)\s*\n\s*(<?)',
+        smart_space,
+        string_function(html),
+    )
+    return html.strip()
+
+
+DEFAULT_NUMBERING_DICT = {
+    '1': {
+        0: 'decimal',
+        1: 'decimal',
+    },
+    '2': {
+        0: 'none',
+        1: 'none',
+    },
+}
+DEFAULT_RELATIONSHIP_DICT = {
+    'rId3': 'fontTable.xml',
+    'rId2': 'numbering.xml',
+    'rId1': 'styles.xml',
+}
+DEFAULT_STYLES_DICT = {
+    'style0': {
+        'header': False,
+        'font_size': '24',
+        'based_on': None,
+    },
+}
+DEFAULT_FONT_SIZES_DICT = {
+    '24': None,
+}
+
+
+def image_handler(*args, **kwargs):
+    return 'test'
+DEFAULT_IMAGE_HANDLER = image_handler
+DEFAULT_IMAGE_SIZES = {}
+
+
+# This is a base test case defining methods to generate the xml and the meta
+# data for each test case.
+class _TranslationTestCase(TestCase):
+    """Base case: render ``get_xml()`` and compare to ``expected_output``."""
+    expected_output = None
+    numbering_dict = DEFAULT_NUMBERING_DICT
+    relationship_dict = DEFAULT_RELATIONSHIP_DICT
+    styles_dict = DEFAULT_STYLES_DICT
+    font_sizes_dict = DEFAULT_FONT_SIZES_DICT
+    # staticmethod keeps the handler from being bound to the test instance.
+    image_handler = staticmethod(DEFAULT_IMAGE_HANDLER)
+    image_sizes = DEFAULT_IMAGE_SIZES
+
+    def get_xml(self):
+        """Return the document xml to convert; subclasses must override."""
+        raise NotImplementedError()
+
+    def get_meta_data(self):
+        """Build the MetaData for create_html from the class attributes."""
+        return MetaData(
+            numbering_dict=self.numbering_dict,
+            relationship_dict=self.relationship_dict,
+            styles_dict=self.styles_dict,
+            font_sizes_dict=self.font_sizes_dict,
+            image_handler=self.image_handler,
+            image_sizes=self.image_sizes,
+        )
+
+    def test_expected_output(self):
+        """Convert the xml and compare it with ``expected_output``."""
+        if self.expected_output is None:
+            raise AssertionError('expected_output is not defined')
+
+        # Create the xml and meta data, then verify the final output.
+        html = create_html(self.get_xml(), self.get_meta_data())
+        assert_html_equal(html, self.expected_output)
diff --git a/docx2html/tests/document_builder.py b/docx2html/tests/document_builder.py
new file mode 100644
index 0000000..b585d09
--- /dev/null
+++ b/docx2html/tests/document_builder.py
@@ -0,0 +1,142 @@
+from jinja2 import Environment, PackageLoader
+
+templates = {
+    'drawing': 'drawing.xml',
+    'hyperlink': 'hyperlink.xml',
+    'main': 'base.xml',
+    'p': 'p.xml',
+    'pict': 'pict.xml',
+    'r': 'r.xml',
+    'sectPr': 'sectPr.xml',
+    'table': 'table.xml',
+    'tc': 'tc.xml',
+    'tr': 'tr.xml',
+    'styles': 'styles.xml',
+    'style': 'style.xml',
+}
+
+env = Environment(
+    loader=PackageLoader(
+        'docx2html.tests',
+        'templates',
+    ),
+)
+
+
+class DocxBuilder(object):
+    @classmethod
+    def xml(self, body):
+        """Render the base document template around ``body``."""
+        return env.get_template(templates['main']).render(body=body)
+
+    @classmethod
+    def p_tag(self, text, bold=False):
+        """Render a paragraph from ``text``.
+
+        A string becomes a single run (bold when ``bold`` is set); a list is
+        passed through to the template as the run tags.
+        """
+        if isinstance(text, list):
+            run_tags = text
+        elif isinstance(text, str):
+            # Create a single r tag based on the text and the bold flag.
+            run_tags = [DocxBuilder.r_tag(text, bold)]
+        else:
+            raise AssertionError('text must be a string or a list')
+        paragraph = env.get_template(templates['p'])
+        return paragraph.render(run_tags=run_tags)
+
+    @classmethod
+    def r_tag(self, text, is_bold=False):
+        """Render a run holding ``text``.
+
+        ``is_bold`` makes the run template emit its ``<w:b/>`` property.
+        """
+        run = env.get_template(templates['r'])
+        return run.render(text=text, is_bold=is_bold)
+
+    @classmethod
+    def hyperlink_tag(self, r_id, run_tags):
+        """Render a hyperlink wrapping ``run_tags``.
+
+        ``r_id`` is written to the hyperlink's ``r:id`` attribute.
+        """
+        link = env.get_template(templates['hyperlink'])
+        return link.render(r_id=r_id, run_tags=run_tags)
+
+    @classmethod
+    def li(self, text, ilvl, numId, bold=False):
+        """Render a list-item paragraph at level ``ilvl`` of list ``numId``.
+
+        ``text`` is a string (one run, styled by ``bold``) or a list of
+        ``(run_text, run_bold)`` pairs, rendered as one run per pair.
+        """
+        if isinstance(text, str):
+            run_tags = [DocxBuilder.r_tag(text, bold)]
+        elif isinstance(text, list):
+            # One run per (run_text, run_bold) pair; r_tag must receive the
+            # pair's text, not the list of runs built so far.
+            run_tags = [DocxBuilder.r_tag(t, b) for t, b in text]
+        else:
+            raise AssertionError('text must be a string or a list')
+        return env.get_template(templates['p']).render(
+            run_tags=run_tags,
+            is_list=True,
+            ilvl=ilvl,
+            numId=numId,
+        )
+
+    @classmethod
+    def table(self, num_rows, num_columns, text):
+        """Render a table of ``num_rows`` rows by ``num_columns`` cells.
+
+        ``text`` is an iterator of rendered cell contents, consumed row by
+        row; each row takes ``num_columns`` items (not ``num_rows``).
+        """
+        def _tc(cell_value):
+            return env.get_template(templates['tc']).render(p_tag=cell_value)
+
+        def _tr(columns):
+            tcs = [_tc(next(text)) for _ in range(columns)]
+            return env.get_template(templates['tr']).render(table_cells=tcs)
+        trs = [_tr(num_columns) for _ in range(num_rows)]
+        return env.get_template(templates['table']).render(table_rows=trs)
+
+    @classmethod
+    def drawing(self, r_id):
+        """Render a drawing paragraph whose blip embeds ``r_id``."""
+        return env.get_template(templates['drawing']).render(r_id=r_id)
+
+    @classmethod
+    def pict(self, r_id=None):
+        """Render a pict paragraph (no imagedata when ``r_id`` is falsy)."""
+        return env.get_template(templates['pict']).render(r_id=r_id)
+
+    @classmethod
+    def sectPr_tag(self, p_tag):
+        """Render a ``w:sectPr`` element.
+
+        ``p_tag`` is placed inside the element as its content.
+        """
+        section = env.get_template(templates['sectPr'])
+        return section.render(p_tag=p_tag)
+
+    @classmethod
+    def styles_xml(self, style_tags):
+        """Render the ``w:styles`` document.
+
+        ``style_tags`` is an iterable of style tags, emitted in order.
+        """
+        styles = env.get_template(templates['styles'])
+        return styles.render(style_tags=style_tags)
+
+    @classmethod
+    def style(self, style_id, value):
+        """Render a single ``w:style`` element.
+
+        ``style_id`` fills ``w:styleId``; ``value`` fills the ``w:val`` of
+        the nested ``w:name`` element.
+        """
+        style_template = env.get_template(templates['style'])
+        rendered = style_template.render(style_id=style_id, value=value)
+        return rendered
diff --git a/docx2html/tests/templates/base.xml b/docx2html/tests/templates/base.xml
new file mode 100644
index 0000000..a9c8915
--- /dev/null
+++ b/docx2html/tests/templates/base.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<w:document xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing">
+	{{ body }}
+</w:document>
diff --git a/docx2html/tests/templates/drawing.xml b/docx2html/tests/templates/drawing.xml
new file mode 100644
index 0000000..9b5e5cd
--- /dev/null
+++ b/docx2html/tests/templates/drawing.xml
@@ -0,0 +1,61 @@
+<w:p>
+	<w:pPr>
+		<w:pStyle w:val="style0"/>
+	</w:pPr>
+	<w:r>
+		<w:rPr/>
+		<w:drawing>
+			<wp:anchor allowOverlap="1" behindDoc="0" distB="0" distL="0" distR="0" distT="0" layoutInCell="1" locked="0" relativeHeight="0" simplePos="0">
+				<wp:simplePos x="0" y="0"/>
+				<wp:positionH relativeFrom="column">
+					<wp:posOffset>2397125</wp:posOffset>
+				</wp:positionH>
+				<wp:positionV relativeFrom="paragraph">
+					<wp:posOffset>0</wp:posOffset>
+				</wp:positionV>
+				<wp:extent cx="1537970" cy="354965"/>
+				<wp:effectExtent b="0" l="0" r="0" t="0"/>
+				<wp:wrapSquare wrapText="largest"/>
+				<wp:docPr descr="A description..." id="1" name="Picture"/>
+				<wp:cNvGraphicFramePr>
+					<a:graphicFrameLocks xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" noChangeAspect="1"/>
+				</wp:cNvGraphicFramePr>
+				<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+					<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
+						<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
+							<pic:nvPicPr>
+								<pic:cNvPr descr="A description..." id="0" name="Picture"/>
+								<pic:cNvPicPr>
+									<a:picLocks noChangeArrowheads="1" noChangeAspect="1"/>
+								</pic:cNvPicPr>
+							</pic:nvPicPr>
+							<pic:blipFill>
+								<a:blip r:embed="{{ r_id }}"/>
+								<a:srcRect/>
+								<a:stretch>
+									<a:fillRect/>
+								</a:stretch>
+							</pic:blipFill>
+							<pic:spPr bwMode="auto">
+								<a:xfrm>
+									<a:off x="0" y="0"/>
+									<a:ext cx="1537970" cy="354965"/>
+								</a:xfrm>
+								<a:prstGeom prst="rect">
+									<a:avLst/>
+								</a:prstGeom>
+								<a:noFill/>
+								<a:ln w="9525">
+									<a:noFill/>
+									<a:miter lim="800000"/>
+									<a:headEnd/>
+									<a:tailEnd/>
+								</a:ln>
+							</pic:spPr>
+						</pic:pic>
+					</a:graphicData>
+				</a:graphic>
+			</wp:anchor>
+		</w:drawing>
+	</w:r>
+</w:p>
diff --git a/docx2html/tests/templates/hyperlink.xml b/docx2html/tests/templates/hyperlink.xml
new file mode 100644
index 0000000..8364594
--- /dev/null
+++ b/docx2html/tests/templates/hyperlink.xml
@@ -0,0 +1,5 @@
+<w:hyperlink r:id="{{ r_id }}">
+	{% for run_tag in run_tags %}
+		{{ run_tag }}
+	{% endfor %}
+</w:hyperlink>
diff --git a/docx2html/tests/templates/p.xml b/docx2html/tests/templates/p.xml
new file mode 100644
index 0000000..ab376ca
--- /dev/null
+++ b/docx2html/tests/templates/p.xml
@@ -0,0 +1,14 @@
+<w:p>
+	<w:pPr>
+		<w:pStyle w:val="style0"/>
+		{% if is_list %}
+		<w:numPr>
+			<w:ilvl w:val="{{ ilvl }}"/>
+			<w:numId w:val="{{ numId }}"/>
+		</w:numPr>
+		{% endif %}
+	</w:pPr>
+	{% for run_tag in run_tags %}
+		{{ run_tag }}
+	{% endfor %}
+</w:p>
diff --git a/docx2html/tests/templates/pict.xml b/docx2html/tests/templates/pict.xml
new file mode 100644
index 0000000..5dfa377
--- /dev/null
+++ b/docx2html/tests/templates/pict.xml
@@ -0,0 +1,17 @@
+<w:p w:rsidR="00E94BDC" w:rsidRPr="003638EA" w:rsidRDefault="00E94BDC" w:rsidP="00E94BDC">
+	<w:pPr>
+		<w:rPr>
+			<w:color w:val="000000"/>
+		</w:rPr>
+	</w:pPr>
+	<w:r w:rsidR="00360165">
+		<w:rPr>
+			<w:color w:val="000000"/>
+		</w:rPr>
+		<w:pict>
+			<v:shape id="_x0000_i1027" type="#_x0000_t75" style="width:99.75pt;height:116.25pt">
+				{% if r_id %}<v:imagedata r:id="{{ r_id }}" o:title="New Picture"/>{% endif %}
+			</v:shape>
+		</w:pict>
+	</w:r>
+</w:p>
diff --git a/docx2html/tests/templates/r.xml b/docx2html/tests/templates/r.xml
new file mode 100644
index 0000000..660c33c
--- /dev/null
+++ b/docx2html/tests/templates/r.xml
@@ -0,0 +1,8 @@
+<w:r>
+	<w:rPr>
+		{% if is_bold %}
+		<w:b/>
+		{% endif %}
+	</w:rPr>
+	{% include 't.xml' %}
+</w:r>
diff --git a/docx2html/tests/templates/sectPr.xml b/docx2html/tests/templates/sectPr.xml
new file mode 100644
index 0000000..16a1205
--- /dev/null
+++ b/docx2html/tests/templates/sectPr.xml
@@ -0,0 +1,3 @@
+<w:sectPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+	{{ p_tag }}
+</w:sectPr>
diff --git a/docx2html/tests/templates/style.xml b/docx2html/tests/templates/style.xml
new file mode 100644
index 0000000..5fa9f00
--- /dev/null
+++ b/docx2html/tests/templates/style.xml
@@ -0,0 +1,15 @@
+<w:style w:styleId="{{ style_id }}">
+	<w:name w:val="{{ value }}"/>
+	<w:basedOn w:val="Normal"/>
+	<w:next w:val="Normal"/>
+	<w:pPr>
+		<w:ind w:hanging="461"/>
+		<w:ind w:left="485"/>
+		<w:spacing w:after="60"/>
+		<w:spacing w:before="61"/>
+	</w:pPr>
+	<w:rPr>
+		<w:sz w:val="24"/>
+		<w:rFonts w:ascii="Times New Roman" w:cs="Times New Roman" w:hAnsi="Times New Roman"/>
+	</w:rPr>
+</w:style>
diff --git a/docx2html/tests/templates/styles.xml b/docx2html/tests/templates/styles.xml
new file mode 100644
index 0000000..a30e752
--- /dev/null
+++ b/docx2html/tests/templates/styles.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<w:styles xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+	{% for style in style_tags %}
+		{{ style }}
+	{% endfor %}
+</w:styles>
diff --git a/docx2html/tests/templates/t.xml b/docx2html/tests/templates/t.xml
new file mode 100644
index 0000000..92412f7
--- /dev/null
+++ b/docx2html/tests/templates/t.xml
@@ -0,0 +1 @@
+<w:t>{{ text }}</w:t>
diff --git a/docx2html/tests/templates/table.xml b/docx2html/tests/templates/table.xml
new file mode 100644
index 0000000..e47783b
--- /dev/null
+++ b/docx2html/tests/templates/table.xml
@@ -0,0 +1,18 @@
+<w:tbl>
+	<w:tblPr>
+		<w:tblW w:type="dxa" w:w="9972"/>
+		<w:jc w:val="left"/>
+		<w:tblBorders>
+			<w:top w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:left w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:bottom w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+		</w:tblBorders>
+	</w:tblPr>
+	<w:tblGrid>
+		<w:gridCol w:w="4986"/>
+		<w:gridCol w:w="4986"/>
+	</w:tblGrid>
+	{% for table_row in table_rows %}
+		{{ table_row }}
+	{% endfor %}
+</w:tbl>
diff --git a/docx2html/tests/templates/tc.xml b/docx2html/tests/templates/tc.xml
new file mode 100644
index 0000000..b9e38ae
--- /dev/null
+++ b/docx2html/tests/templates/tc.xml
@@ -0,0 +1,18 @@
+<w:tc>
+	<w:tcPr>
+		<w:tcW w:type="dxa" w:w="4986"/>
+		<w:tcBorders>
+			<w:top w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:left w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:bottom w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+		</w:tcBorders>
+		<w:shd w:fill="auto" w:val="clear"/>
+		<w:tcMar>
+			<w:top w:type="dxa" w:w="55"/>
+			<w:left w:type="dxa" w:w="55"/>
+			<w:bottom w:type="dxa" w:w="55"/>
+			<w:right w:type="dxa" w:w="55"/>
+		</w:tcMar>
+	</w:tcPr>
+	{{ p_tag }}
+</w:tc>
diff --git a/docx2html/tests/templates/tr.xml b/docx2html/tests/templates/tr.xml
new file mode 100644
index 0000000..6e2f692
--- /dev/null
+++ b/docx2html/tests/templates/tr.xml
@@ -0,0 +1,8 @@
+<w:tr>
+	<w:trPr>
+		<w:cantSplit w:val="false"/>
+	</w:trPr>
+	{% for table_cell in table_cells %}
+		{{ table_cell }}
+	{% endfor %}
+</w:tr>
diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py
new file mode 100644
index 0000000..8ba710c
--- /dev/null
+++ b/docx2html/tests/test_docx.py
@@ -0,0 +1,751 @@
+import mock
+import tempfile
+import shutil
+from os import path
+from zipfile import ZipFile
+from nose.plugins.skip import SkipTest
+from nose.tools import assert_raises
+
+from docx2html.tests import assert_html_equal
+from docx2html import convert
+from docx2html.core import (
+    _get_document_data,
+    DETECT_FONT_SIZE,
+)
+from docx2html.exceptions import (
+    ConversionFailed,
+)
+
+
+def test_extract_html():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'simple.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <p>
+          Simple text
+        </p>
+        <ol data-list-type="decimal">
+          <li>one</li>
+          <li>two</li>
+          <li>three</li>
+        </ol>
+        <table>
+          <tr>
+            <td>Cell1</td>
+            <td>Cell2</td>
+          </tr>
+          <tr>
+            <td>Cell3</td>
+            <td>cell4</td>
+          </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_nested_list():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'nested_lists.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <ol data-list-type="decimal">
+            <li>one</li>
+            <li>two</li>
+            <li>three
+                <ol data-list-type="decimal">
+                    <li>AAA</li>
+                    <li>BBB</li>
+                    <li>CCC
+                        <ol data-list-type="decimal">
+                            <li>alpha</li>
+                        </ol>
+                    </li>
+                </ol>
+            </li>
+            <li>four</li>
+        </ol>
+        <ol data-list-type="decimal">
+            <li>xxx
+                <ol data-list-type="decimal">
+                    <li>yyy</li>
+                </ol>
+            </li>
+        </ol>
+        <ul>
+            <li>www
+                <ul>
+                    <li>zzz</li>
+                </ul>
+            </li>
+        </ul>
+    </html>
+    ''')
+
+
+def test_simple_list():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'simple_lists.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <ol data-list-type="decimal">
+            <li>One</li>
+        </ol>
+        <ul>
+            <li>two</li>
+        </ul>
+    </html>
+    ''')
+
+
+def test_inline_tags():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'inline_tags.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html><p>This sentence has some <strong>bold</strong>, some <em>italics</em> and some <strong>underline</strong>, as well as a <a href="http://www.google.com/">hyperlink</a>.</p></html>''')  # noqa
+
+
+def test_unicode():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'greek_alphabet.docx',
+    )
+    actual_html = convert(file_path)
+    assert actual_html is not None
+
+
+def test_special_chars():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'special_chars.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html><p>&amp; &lt; &gt; <a href="https://www.google.com/?test=1&amp;more=2">link</a></p></html>''')  # noqa
+
+
+def test_table_col_row_span():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'table_col_row_span.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+      <table>
+        <tr>
+          <td colspan="2">AAA</td>
+        </tr>
+        <tr>
+          <td rowspan="2">BBB</td>
+          <td>CCC</td>
+        </tr>
+        <tr>
+          <td>DDD</td>
+        </tr>
+        <tr>
+          <td>EEE</td>
+          <td rowspan="2">FFF</td>
+        </tr>
+        <tr>
+          <td>GGG</td>
+        </tr>
+      </table>
+      <table>
+        <tr>
+          <td>1</td>
+          <td>2</td>
+          <td>3</td>
+          <td>4</td>
+        </tr>
+        <tr>
+          <td>5</td>
+          <td colspan="2" rowspan="2">6</td>
+          <td>7</td>
+        </tr>
+        <tr>
+          <td>8</td>
+          <td>9</td>
+        </tr>
+        <tr>
+          <td>10</td>
+          <td>11</td>
+          <td>12</td>
+          <td>13</td>
+        </tr>
+      </table>
+    </html>
+    ''')
+
+
+def test_nested_table_rowspan():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'nested_table_rowspan.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <table>
+            <tr>
+                <td colspan="2">AAA</td>
+            </tr>
+            <tr>
+                <td>BBB</td>
+                <td>
+                    <table>
+                        <tr>
+                            <td rowspan="2">CCC</td>
+                            <td>DDD</td>
+                        </tr>
+                        <tr>
+                            <td>EEE</td>
+                        </tr>
+                    </table>
+                    <br />
+                </td>
+            </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_nested_tables():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'nested_tables.docx',
+    )
+    actual_html = convert(file_path)
+    # Find out why br tag is there.
+    assert_html_equal(actual_html, '''
+    <html>
+        <table>
+          <tr>
+            <td>AAA</td>
+            <td>BBB</td>
+          </tr>
+          <tr>
+            <td>CCC</td>
+            <td>
+              <table>
+                <tr>
+                  <td>DDD</td>
+                  <td>EEE</td>
+                </tr>
+                <tr>
+                  <td>FFF</td>
+                  <td>GGG</td>
+                </tr>
+              </table>
+              <br />
+            </td>
+          </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_list_in_table():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'list_in_table.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <table>
+          <tr>
+            <td>
+              <ol data-list-type="decimal">
+                <li>AAA</li>
+                <li>BBB</li>
+                <li>CCC</li>
+              </ol>
+            </td>
+          </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_tables_in_lists():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'tables_in_lists.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <ol data-list-type="decimal">
+            <li>AAA</li>
+            <li>BBB<br />
+                <table>
+                    <tr>
+                        <td>CCC</td>
+                        <td>DDD</td>
+                    </tr>
+                    <tr>
+                        <td>EEE</td>
+                        <td>FFF</td>
+                    </tr>
+                </table>
+            </li>
+            <li>GGG</li>
+        </ol>
+    </html>
+    ''')
+
+
+def test_track_changes_on():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'track_changes_on.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html><p>This was some content.</p></html>
+    ''')
+
+
+def test_headers():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'headers.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <h2>This is an H1</h2>
+        <h3>This is an H2</h3>
+        <h4>This is an H3</h4>
+        <h5>This is an H4</h5>
+        <h6>This is an H5</h6>
+        <h6>This is an H6</h6>
+        <h6>This is an H7</h6>
+        <h6>This is an H8</h6>
+        <h6>This is an H9</h6>
+        <h6>This is an H10</h6>
+    </html>
+    ''')
+
+
+def _copy_file_to_tmp_dir(file_path, filename):
+    # Since the images need to be extracted from the docx, copy the file to a
+    # temp directory so we do not clutter up repo.
+    dp = tempfile.mkdtemp()
+    new_file_path = path.join(dp, filename)
+    shutil.copyfile(file_path, new_file_path)
+    return new_file_path, dp
+
+
+def test_split_headers():
+    filename = 'split_header.docx'
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'split_header.docx',
+    )
+    # Work on a copy in a temp directory (see _copy_file_to_tmp_dir) so that
+    # files extracted from the docx stay out of the repo. NOTE(review): no
+    # `preserve_images` flag is passed to convert() -- confirm none is needed.
+    new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename)
+
+    def image_handler(*args, **kwargs):
+        return 'test'
+    actual_html = convert(new_file_path, image_handler=image_handler)
+    assert_html_equal(actual_html, '''
+    <html><h2>AAA</h2><p>BBB</p><h2>CCC</h2></html>
+    ''')
+
+
+def test_has_image():
+    filename = 'has_image.docx'
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'has_image.docx',
+    )
+    # Work on a copy in a temp directory (see _copy_file_to_tmp_dir) so that
+    # files extracted from the docx stay out of the repo. NOTE(review): no
+    # `preserve_images` flag is passed to convert() -- confirm none is needed.
+    new_file_path, dp = _copy_file_to_tmp_dir(file_path, filename)
+
+    actual_html = convert(new_file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+    <p>AAA<img src="%s/word/media/image1.gif" height="55" width="260" /></p>
+    </html>
+    ''' % dp)
+
+
+def test_has_image_using_image_handler():
+    filename = 'has_image.docx'
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'has_image.docx',
+    )
+    # Work on a copy in a temp directory (see _copy_file_to_tmp_dir) so that
+    # files extracted from the docx stay out of the repo. NOTE(review): no
+    # `preserve_images` flag is passed to convert() -- confirm none is needed.
+    new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename)
+
+    def image_handler(*args, **kwargs):
+        return 'test'
+    actual_html = convert(new_file_path, image_handler=image_handler)
+    assert_html_equal(actual_html, '''
+
+    <html><p>AAA<img src="test" height="55" width="260" /></p></html>
+    ''')
+
+
+def test_attachment_is_tiff():
+    filename = 'attachment_is_tiff.docx'
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'attachment_is_tiff.docx',
+    )
+    # Work on a copy in a temp directory (see _copy_file_to_tmp_dir) so that
+    # files extracted from the docx stay out of the repo. The fixture name
+    # says the attachment is a tiff; the checks below expect a gif on disk.
+    new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename)
+
+    # Open the archive *before* the try so a failed open raises its own error
+    # instead of a NameError from `zf.close()` in the finally clause.
+    zf = ZipFile(new_file_path)
+    try:
+        _, meta_data = _get_document_data(zf)
+    finally:
+        zf.close()
+    # Find the path to the image.
+    image_file = None
+    for image_path in meta_data.relationship_dict.values():
+        if image_path.endswith('.gif'):
+            image_file = image_path
+    assert image_file is not None
+    with open(image_file, 'rb') as f:
+        magic_number = f.read(4)
+    # Make sure the image is actually a gif.
+    assert magic_number == b'GIF8', magic_number
+
+
+def test_headers_with_full_line_styles():
+    # Show that if a natural header is completely bold/italics that
+    # bold/italics will get stripped out.
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'headers_with_full_line_styles.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <h2>AAA</h2>
+        <h2>BBB</h2>
+        <h2><strong>C</strong><em>C</em>C</h2>
+    </html>
+    ''')
+
+
+def test_convert_p_to_h():
+    # Show when it is correct to convert a p tag to an h tag based on
+    # bold/italics
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'convert_p_to_h.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <h2>AAA</h2>
+        <h2>BBB</h2>
+        <p>CCC</p>
+        <ol data-list-type="decimal">
+            <li><strong>DDD</strong></li>
+            <li><em>EEE</em></li>
+            <li>FFF</li>
+        </ol>
+        <table>
+            <tr>
+                <td><strong>GGG</strong></td>
+                <td><em>HHH</em></td>
+            </tr>
+            <tr>
+                <td>III</td>
+                <td>JJJ</td>
+            </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_bigger_font_size_to_header():
+    # Show when it is appropriate to convert p tags to h tags based on font
+    # size.
+    if not DETECT_FONT_SIZE:
+        raise SkipTest('Font size detection is disabled.')
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'bigger_font_size_to_header.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <p>Paragraphs:</p>
+        <h2>Header</h2>
+        <p>paragraph 1</p>
+        <p>Lists:</p>
+        <ol data-list-type="decimal">
+            <li>bigger</li>
+            <li>smaller</li>
+        </ol>
+        <p>Tables:</p>
+        <table>
+            <tr>
+                <td>bigger</td>
+                <td>smaller</td>
+            </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_fake_headings_by_length():
+    # Show that converting p tags to h tags has a length limit. If the p tag is
+    # supposed to be converted to an h tag but has more than seven words in the
+    # paragraph do not convert it.
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'fake_headings_by_length.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <h2>Heading.</h2>
+        <h2>Still a heading.</h2>
+        <p>
+        <strong>This is not a heading because it is too many words.</strong>
+        </p>
+    </html>
+    ''')
+
+
+def test_shift_enter():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'shift_enter.docx',
+    )
+
+    # Test just the convert without clean_html to make sure the first
+    # break tag is present.
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <p>AAA<br />BBB</p>
+        <p>CCC</p>
+        <ol data-list-type="decimal">
+            <li>DDD<br />EEE</li>
+            <li>FFF</li>
+        </ol>
+        <table>
+            <tr>
+                <td>GGG<br />HHH</td>
+                <td>III<br />JJJ</td>
+            </tr>
+            <tr>
+                <td>KKK</td>
+                <td>LLL</td>
+            </tr>
+        </table>
+    </html>
+    ''')
+
+
+def test_lists_with_styles():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'lists_with_styles.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <ol data-list-type="decimal">
+            <li>AAA</li>
+            <li>BBB
+                <ol data-list-type="lower-roman">
+                    <li>CCC</li>
+                    <li>DDD
+                        <ol data-list-type="upper-alpha">
+                            <li>EEE
+                                <ol data-list-type="lower-alpha">
+                                    <li>FFF</li>
+                                </ol>
+                            </li>
+                        </ol>
+                    </li>
+                </ol>
+            </li>
+        </ol>
+    </html>
+    ''')
+
+
+def test_list_to_header():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'list_to_header.docx',
+    )
+    actual_html = convert(file_path)
+    # It should be noted that list item `GGG` is upper roman in the word
+    # document to show that only top level upper romans get converted.
+    assert_html_equal(actual_html, '''
+    <html>
+        <h2>AAA</h2>
+        <ol data-list-type="decimal">
+            <li>BBB</li>
+        </ol>
+        <h2>CCC</h2>
+        <ol data-list-type="decimal">
+            <li>DDD</li>
+        </ol>
+        <h2>EEE</h2>
+        <ol data-list-type="decimal">
+            <li>FFF
+                <ol data-list-type="upper-roman">
+                    <li>GGG</li>
+                </ol>
+            </li>
+        </ol>
+    </html>
+    ''')
+
+
+def test_has_title():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'has_title.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''<html><p>Text</p></html>''')
+
+
+def test_upper_alpha_all_bold():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'upper_alpha_all_bold.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, '''
+    <html>
+        <h2>AAA</h2>
+        <h2>BBB</h2>
+        <h2>CCC</h2>
+    </html>
+    ''')
+
+
+def _converter(*args, **kwargs):
+    # Having a converter that does nothing is the same as if abiword fails to
+    # convert.
+    pass
+
+
+def test_converter_broken():
+    file_path = 'test.doc'
+    assert_raises(
+        ConversionFailed,
+        lambda: convert(file_path, converter=_converter),
+    )
+
+
+def test_fall_back():
+    file_path = 'test.doc'
+
+    def fall_back(*args, **kwargs):
+        return 'success'
+    html = convert(file_path, fall_back=fall_back, converter=_converter)
+    assert html == 'success'
+
+
+@mock.patch('docx2html.core.read_html_file')
+@mock.patch('docx2html.core.get_zip_file_handler')
+def test_html_files(patch_zip_handler, patch_read):
+    def raise_assertion(*args, **kwargs):
+        raise AssertionError('Should not have called get_zip_file_handler')
+    patch_zip_handler.side_effect = raise_assertion
+
+    def return_text(*args, **kwargs):
+        return 'test'
+    patch_read.side_effect = return_text
+
+    # Try with an html file
+    file_path = 'test.html'
+
+    html = convert(file_path)
+    assert html == 'test'
+
+    # Try again with an htm file.
+    file_path = 'test.htm'
+
+    html = convert(file_path)
+    assert html == 'test'
diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py
new file mode 100644
index 0000000..52bca28
--- /dev/null
+++ b/docx2html/tests/test_xml.py
@@ -0,0 +1,624 @@
+import mock
+from itertools import chain
+from lxml import etree
+from copy import copy
+
+from docx2html.core import (
+    _is_top_level_upper_roman,
+    create_html,
+    get_style_dict,
+    get_font_size,
+    get_image_id,
+    get_li_nodes,
+    get_namespace,
+    is_last_li,
+)
+from docx2html.tests.document_builder import DocxBuilder as DXB
+from docx2html.tests import (
+    _TranslationTestCase,
+    assert_html_equal,
+)
+
+
+class SimpleListTestCase(_TranslationTestCase):
+    expected_output = '''
+        <html>
+            <ol data-list-type="decimal">
+                <li>AAA</li>
+                <li>BBB</li>
+                <li>CCC</li>
+            </ol>
+        </html>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', 0, 1),
+            ('CCC', 0, 1),
+        ]
+        lis = ''
+        for text, ilvl, numId in li_text:
+            lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+        xml = DXB.xml(lis)
+        return etree.fromstring(xml)
+
+    def test_get_li_nodes(self):
+        tree = self.get_xml()
+        meta_data = self.get_meta_data()
+        w_namespace = get_namespace(tree, 'w')
+        first_p_tag = tree.find('%sp' % w_namespace)
+
+        li_data = get_li_nodes(first_p_tag, meta_data)
+        assert len(list(li_data)) == 3
+
+    def test_is_last_li(self):
+        tree = self.get_xml()
+        meta_data = self.get_meta_data()
+        p_tags = tree.xpath('.//w:p', namespaces=tree.nsmap)
+        result = [is_last_li(p, meta_data, current_numId='1') for p in p_tags]
+        self.assertEqual(
+            result,
+            [False, False, True],
+        )
+
+
+class TableInListTestCase(_TranslationTestCase):
+    expected_output = '''
+        <html>
+            <ol data-list-type="decimal">
+                <li>AAA<br />
+                    <table>
+                        <tr>
+                            <td>BBB</td>
+                            <td>CCC</td>
+                        </tr>
+                        <tr>
+                            <td>DDD</td>
+                            <td>EEE</td>
+                        </tr>
+                    </table>
+                </li>
+                <li>FFF</li>
+            </ol>
+            <p>GGG</p>
+        </html>
+    '''
+
+    def get_xml(self):
+        table = DXB.table(num_rows=2, num_columns=2, text=chain(
+            [DXB.p_tag('BBB')],
+            [DXB.p_tag('CCC')],
+            [DXB.p_tag('DDD')],
+            [DXB.p_tag('EEE')],
+        ))
+
+        # Nest that table in a list.
+        first_li = DXB.li(text='AAA', ilvl=0, numId=1)
+        second = DXB.li(text='FFF', ilvl=0, numId=1)
+        p_tag = DXB.p_tag('GGG')
+
+        body = ''
+        for el in [first_li, table, second, p_tag]:
+            body += el
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+    def test_get_li_nodes_with_nested_table(self):
+        # Create a table
+        tree = self.get_xml()
+        meta_data = self.get_meta_data()
+        w_namespace = get_namespace(tree, 'w')
+        first_p_tag = tree.find('%sp' % w_namespace)
+
+        # Show that list nesting deals with the table nesting
+        li_data = get_li_nodes(first_p_tag, meta_data)
+        assert len(list(li_data)) == 3
+
+    def test_is_last_li(self):
+        tree = self.get_xml()
+        meta_data = self.get_meta_data()
+        result = [is_last_li(el, meta_data, current_numId='1') for el in tree]
+        self.assertEqual(
+            result,
+            # Non-list items (table, trailing p) are reported as False
+            [False, False, True, False],
+        )
+
+
+class RomanNumeralToHeadingTestCase(_TranslationTestCase):
+    numbering_dict = {
+        '1': {
+            0: 'upperRoman',
+            1: 'decimal',
+            2: 'upperRoman',
+        }
+    }
+    expected_output = '''
+        <html>
+            <h2>AAA</h2>
+            <ol data-list-type="decimal">
+                <li>BBB</li>
+            </ol>
+            <h2>CCC</h2>
+            <ol data-list-type="decimal">
+                <li>DDD</li>
+            </ol>
+            <h2>EEE</h2>
+            <ol data-list-type="decimal">
+                <li>FFF
+                    <ol data-list-type="upper-roman">
+                        <li>GGG</li>
+                    </ol>
+                </li>
+            </ol>
+        </html>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', 1, 1),
+            ('CCC', 0, 1),
+            ('DDD', 1, 1),
+            ('EEE', 0, 1),
+            ('FFF', 1, 1),
+            ('GGG', 2, 1),
+        ]
+        body = ''
+        for text, ilvl, numId in li_text:
+            body += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+    def test_is_top_level_upper_roman(self):
+        tree = self.get_xml()
+        w_namespace = get_namespace(tree, 'w')
+        meta_data = self.get_meta_data()
+
+        result = []
+        for p in tree.findall('%sp' % w_namespace):
+            result.append(
+                _is_top_level_upper_roman(p, meta_data)
+            )
+        self.assertEqual(
+            result,
+            [
+                True,  # AAA
+                False,  # BBB
+                True,  # CCC
+                False,  # DDD
+                True,  # EEE
+                False,  # FFF
+                False,  # GGG - notice this is upper roman but not in the root
+            ]
+        )
+
+
+class RomanNumeralToHeadingAllBoldTestCase(_TranslationTestCase):
+    numbering_dict = {
+        '1': {
+            0: 'upperRoman',
+        }
+    }
+    expected_output = '''
+        <html>
+            <h2>AAA</h2>
+            <h2>BBB</h2>
+            <h2>CCC</h2>
+        </html>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', 0, 1),
+            ('CCC', 0, 1),
+        ]
+        body = ''
+        for text, ilvl, numId in li_text:
+            body += DXB.li(text=text, ilvl=ilvl, numId=numId, bold=True)
+
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class ImageTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'media/image1.jpeg',
+        'rId1': 'media/image2.jpeg',
+    }
+    image_sizes = {
+        'rId0': (4, 4),
+        'rId1': (4, 4),
+    }
+    expected_output = '''
+        <html>
+            <p>
+                <img src="media/image1.jpeg" height="4" width="4" />
+            </p>
+            <p>
+                <img src="media/image2.jpeg" height="4" width="4" />
+            </p>
+        </html>
+    '''
+
+    @staticmethod
+    def image_handler(image_id, relationship_dict):
+        return relationship_dict.get(image_id)
+
+    def get_xml(self):
+        drawing = DXB.drawing('rId0')
+        pict = DXB.pict('rId1')
+        tags = [
+            drawing,
+            pict,
+        ]
+        body = ''
+        for el in tags:
+            body += el
+
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+    def test_get_image_id(self):
+        tree = self.get_xml()
+        els = []
+        w_namespace = get_namespace(tree, 'w')
+        for el in tree.iter():
+            if el.tag == '%sdrawing' % w_namespace:
+                els.append(el)
+            if el.tag == '%spict' % w_namespace:
+                els.append(el)
+        image_ids = []
+        for el in els:
+            image_ids.append(get_image_id(el))
+        self.assertEqual(
+            image_ids,
+            [
+                'rId0',
+                'rId1',
+            ]
+        )
+
+    @mock.patch('docx2html.core._get_image_size_from_image')
+    def test_missing_size(self, patched_item):
+        def side_effect(*args, **kwargs):
+            return (6, 6)
+        patched_item.side_effect = side_effect
+        tree = self.get_xml()
+        meta_data = copy(self.get_meta_data())
+        del meta_data.image_sizes['rId1']
+
+        html = create_html(tree, meta_data)
+
+        # Show that the height and width were grabbed from the actual image.
+        assert_html_equal(html, '''
+            <html>
+                <p>
+                    <img src="media/image1.jpeg" height="4" width="4" />
+                </p>
+                <p>
+                    <img src="media/image2.jpeg" height="6" width="6" />
+                </p>
+            </html>
+        ''')
+
+
+class ListWithContinuationTestCase(_TranslationTestCase):
+    expected_output = '''
+        <html>
+            <ol data-list-type="decimal">
+                <li>AAA<br />BBB</li>
+                <li>CCC<br />
+                    <table>
+                        <tr>
+                            <td>DDD</td>
+                            <td>EEE</td>
+                        </tr>
+                        <tr>
+                            <td>FFF</td>
+                            <td>GGG</td>
+                        </tr>
+                    </table>
+                </li>
+                <li>HHH</li>
+            </ol>
+        </html>
+    '''
+
+    def get_xml(self):
+        table = DXB.table(num_rows=2, num_columns=2, text=chain(
+            [DXB.p_tag('DDD')],
+            [DXB.p_tag('EEE')],
+            [DXB.p_tag('FFF')],
+            [DXB.p_tag('GGG')],
+        ))
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=1),
+            DXB.p_tag('BBB'),
+            DXB.li(text='CCC', ilvl=0, numId=1),
+            table,
+            DXB.li(text='HHH', ilvl=0, numId=1),
+        ]
+        body = ''
+        for el in tags:
+            body += el
+
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class PictImageTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'media/image1.jpeg',
+    }
+    image_sizes = {
+        'rId0': (4, 4),
+    }
+    expected_output = '''
+        <html>
+            <p>
+                <img src="media/image1.jpeg" height="4" width="4" />
+            </p>
+        </html>
+    '''
+
+    @staticmethod
+    def image_handler(image_id, relationship_dict):
+        return relationship_dict.get(image_id)
+
+    def get_xml(self):
+        pict = DXB.pict('rId0')
+        tags = [
+            pict,
+        ]
+        body = ''
+        for el in tags:
+            body += el
+
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+    def test_image_id_for_pict(self):
+        tree = self.get_xml()
+
+        # Get all the pict tags
+        pict_tags = tree.xpath('.//w:pict', namespaces=tree.nsmap)
+        self.assertEqual(len(pict_tags), 1)
+
+        # Get the image id for the pict tag.
+        pict_tag = pict_tags[0]
+        image_id = get_image_id(pict_tag)
+        self.assertEqual(image_id, 'rId0')
+
+
+class PictImageMissingIdTestCase(_TranslationTestCase):
+    expected_output = '''
+        <html></html>
+    '''
+
+    def get_xml(self):
+        pict = DXB.pict(None)
+        tags = [
+            pict,
+        ]
+        body = ''
+        for el in tags:
+            body += el
+
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class TableWithInvalidTag(_TranslationTestCase):
+    expected_output = '''
+        <html>
+            <table>
+                <tr>
+                    <td>AAA</td>
+                    <td>BBB</td>
+                </tr>
+                <tr>
+                    <td></td>
+                    <td>DDD</td>
+                </tr>
+            </table>
+        </html>
+    '''
+
+    def get_xml(self):
+        table = DXB.table(num_rows=2, num_columns=2, text=chain(
+            [DXB.p_tag('AAA')],
+            [DXB.p_tag('BBB')],
+            # This tag may have CCC in it, however this tag has no meaning
+            # pertaining to content.
+            ['<w:invalidTag>CCC</w:invalidTag>'],
+            [DXB.p_tag('DDD')],
+        ))
+        body = table
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class HyperlinkStyledTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = '''
+    <html>
+        <p><a href="www.google.com">link</a>.</p>
+    </html>
+    '''
+
+    def get_xml(self):
+        run_tags = []
+        run_tags.append(DXB.r_tag('link', is_bold=True))
+        run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+        run_tags.append(DXB.r_tag('.', is_bold=False))
+        body = DXB.p_tag(run_tags)
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class HyperlinkNoTextTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = '''
+    <html>
+    </html>
+    '''
+
+    def get_xml(self):
+        run_tags = []
+        run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+        body = DXB.p_tag(run_tags)
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class HyperlinkVanillaTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = '''
+    <html>
+        <p><a href="www.google.com">link</a>.</p>
+    </html>
+    '''
+
+    def get_xml(self):
+        run_tags = []
+        run_tags.append(DXB.r_tag('link', is_bold=False))
+        run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+        run_tags.append(DXB.r_tag('.', is_bold=False))
+        body = DXB.p_tag(run_tags)
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class MissingFontInfoTestCase(_TranslationTestCase):
+    styles_dict = {
+        'BodyText': {
+            'header': False, 'font_size': None, 'based_on': 'Normal',
+        },
+    }
+
+    expected_output = '''
+    <html>
+        <p><strong>AAA</strong></p>
+    </html>
+    '''
+
+    def get_xml(self):
+        p_tag = '''
+        <w:p w:rsidR="009C063D" w:rsidRDefault="009C063D">
+            <w:pPr>
+                <w:pStyle w:val="BodyText"/>
+                <w:ind w:left="720"/>
+                <w:rPr>
+                    <w:b w:val="0"/>
+                    <w:bCs w:val="0"/>
+                </w:rPr>
+            </w:pPr>
+            <w:r>
+                <w:rPr>
+                    <w:b w:val="0"/>
+                    <w:bCs w:val="0"/>
+                </w:rPr>
+                <w:t>AAA</w:t>
+            </w:r>
+        </w:p>
+        '''
+        xml = DXB.xml(p_tag)
+        return etree.fromstring(xml)
+
+    def test_get_font_size(self):
+        tree = self.get_xml()
+        w_namespace = get_namespace(tree, 'w')
+        p_tag = tree.find('%sp' % w_namespace)
+        self.assertNotEqual(p_tag, None)
+        self.assertEqual(
+            get_font_size(p_tag, self.styles_dict),
+            None,
+        )
+
+    def test_get_font_size_empty_styles_dict(self):
+        tree = self.get_xml()
+        w_namespace = get_namespace(tree, 'w')
+        p_tag = tree.find('%sp' % w_namespace)
+        self.assertNotEqual(p_tag, None)
+        self.assertEqual(
+            get_font_size(p_tag, {}),
+            None,
+        )
+
+
+class HeaderFooterTagsWithContent(_TranslationTestCase):
+    expected_output = '''
+    <html>
+        <ol data-list-type="decimal">
+            <li>AAA</li>
+        </ol>
+    </html>
+    '''
+
+    def get_xml(self):
+        li = DXB.li(text='AAA', ilvl=0, numId=1)
+        p_tag = DXB.p_tag('BBB')
+        footer_tag = DXB.sectPr_tag(p_tag)
+        body = li + footer_tag
+        xml = DXB.xml(body)
+        return etree.fromstring(xml)
+
+
+class StylesParsingTestCase(_TranslationTestCase):
+    expected_output = '<html></html>'
+
+    def get_xml(self):
+        return etree.fromstring(DXB.xml(''))
+
+    def test_get_headings(self):
+
+        styles = [
+            DXB.style('heading 1', 'heading 1'),
+        ]
+        xml = DXB.styles_xml(styles)
+        styles_xml = etree.fromstring(xml)
+        styles_dict = get_style_dict(styles_xml)
+        self.assertEqual(styles_dict['heading 1']['header'], 'h2')
+
+
+class MangledIlvlTestCase(_TranslationTestCase):
+    expected_output = '''
+    <html>
+        <ol data-list-type="decimal">
+            <li>AAA</li>
+        </ol>
+        <ol data-list-type="decimal">
+            <li>BBB</li>
+        </ol>
+        <ol data-list-type="decimal">
+            <li>CCC</li>
+        </ol>
+    </html>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 2),
+            ('BBB', 1, 1),
+            ('CCC', 0, 1),
+        ]
+        lis = ''
+        for text, ilvl, numId in li_text:
+            lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+        xml = DXB.xml(lis)
+        return etree.fromstring(xml)
diff --git a/setup.py b/setup.py
index 2d692ea..aad2ac4 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import codecs
+import os
 
 try:
     from setuptools import setup, find_packages
@@ -10,11 +10,23 @@
     use_setuptools()
     from setuptools import setup, find_packages  # noqa
 
-long_description = codecs.open("README.md", "r", "utf-8").read()
+def rel_file(*args):
+    """Return *args joined onto the directory containing this file."""
+    here = os.path.dirname(os.path.abspath(__file__))
+    return os.path.join(here, *args)
+
+
+def get_file(filename):
+    with open(rel_file(filename)) as f:
+        return f.read()
+
+
+def get_description():
+    return get_file('README.md') + get_file('CHANGELOG')
 
 setup(
     name="docx2html",
-    version="0.0.1",
+    version="0.1.7",
     description="docx (OOXML) to html converter",
     author="Jason Ward",
     author_email="jason.louard.ward@gmail.com",
@@ -24,16 +36,19 @@
     packages=find_packages(),
     scripts=[],
     zip_safe=False,
-    install_requires=['lxml', 'pillow==1.7.7'],
+    install_requires=['lxml==3.1.0', 'pillow==2.0.0'],
     cmdclass={},
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Programming Language :: Python",
+        "Programming Language :: Python :: 2.6",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3.3",
         "Intended Audience :: Developers",
         "License :: OSI Approved :: BSD License",
         "Operating System :: OS Independent",
         "Topic :: Text Processing :: Markup :: HTML",
         "Topic :: Text Processing :: Markup :: XML",
     ],
-    long_description=long_description,
+    long_description=get_description(),
 )
diff --git a/test_requirements.txt b/test_requirements.txt
new file mode 100644
index 0000000..6d8e78c
--- /dev/null
+++ b/test_requirements.txt
@@ -0,0 +1,3 @@
+nose
+mock
+Jinja2>=2.0