diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8aaedb5 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,12 @@ +language: python +python: + - "2.6" + - "2.7" + - "3.3" +script: ./run_tests.sh +install: + - python setup.py -q install + - pip install -r test_requirements.txt +notifications: + email: + - jason.louard.ward@gmail.com diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..1f78556 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,3 @@ +Jason Ward +Wes Winham +Kyle Gibson diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..7f6a437 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,31 @@ + +Changelog +========= + +* 0.1.7 + * If the indentation level of a set of lists (with the same list id) was + mangled (starting off with a higher indentation level followed by a + lower) then the entire sub list (the list with the lower indentation + level) would not be added to the root list. This would result in removing + the mangled list from the final output. This issue has been addressed. +* 0.1.6 + * Header detection was relying on case. However, it is possible for a lower + case version of headers to show up. Those are now handled correctly. +* 0.1.4 + * Added a function to remove tags, in addition stripped 'sectPr' tags since + they have to do with headers and footers. +* 0.1.3 + * Hyperlinks with no text no longer throw an error + * Fixed a bug with determining the font size with an incomplete styles dict +* 0.1.2 + * Fixed a bug with determining the font size of a paragraph tag +* 0.1.1 + * Added a changelog + * Styles are now stripped from hyperlinks + * jinja2 is now used to render test xml +* 0.1.0 + * Correctly handle tables and paragraphs in lists. Before, if there was a + table in a list it would break the list into two halves, the half before + the table and the half after the table (with the table in between them). Now + if there is a table or paragraph in a list those elements get rolled into + the list. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..493f32d --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include AUTHORS +include CHANGELOG +include LICENSE +include MANIFEST.in +include README.md +include docx2html/fixtures/* diff --git a/README.md b/README.md index 3a4d871..1c4fc7f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,12 @@ +========= docx2html ========= -Convert a docx (OOXML) file to html + +Convert a docx (OOXML) file to semantic HTML. +All of Word's formatting nonsense is stripped away and +you're left with a cleanly formatted version of the content. + Usage ===== @@ -10,11 +15,19 @@ Usage >>> html = convert('path/to/docx/file') -Running Tests -============= +Running Tests for Development +============================= - $ ./run_tests.sh +:: + $ virtualenv path/to/new/virtualenv + $ source path/to/new/virtualenv/bin/activate + $ cd path/to/workspace + $ git clone git://github.com/PolicyStat/docx2html.git + $ cd docx2html + $ pip install . + $ pip install -r test_requirements.txt + $ ./run_tests.sh Description =========== @@ -32,6 +45,8 @@ is a list of what currently works: * Lists * Nested lists * List styles (letters, roman numerals, etc.) + * Tables + * Paragraphs * Tables * Rowspans * Colspans @@ -46,3 +61,29 @@ is a list of what currently works: * Simple headings * Root level lists that are upper case roman numerals get converted to h2 tags + +Handling embedded images +------------------------ + +docx2html allows you to specify how you would like to handle image uploading. +For example, you might want to upload your images to Amazon S3. +Note: This documentation is incomplete, so you might need to read the source. + +:: + + import os.path + from shutil import copyfile + + from docx2html import convert + + def handle_image(image_id, relationship_dict): + image_path = relationship_dict[image_id] + # Now do something to the image. Let's move it somewhere. 
+ _, filename = os.path.split(image_path) + destination_path = os.path.join('/tmp', filename) + copyfile(image_path, destination_path) + + # Return the `src` attribute to be used in the img tag + return 'file://%s' % destination_path + + html = convert('path/to/docx/file', image_handler=handle_image) diff --git a/docx2html/__init__.py b/docx2html/__init__.py index c6328aa..625c109 100644 --- a/docx2html/__init__.py +++ b/docx2html/__init__.py @@ -1,5 +1,5 @@ from docx2html.core import convert __all__ = [ - convert.func_name, + convert.__name__, ] diff --git a/docx2html/converters.py b/docx2html/converters.py new file mode 100644 index 0000000..36bc283 --- /dev/null +++ b/docx2html/converters.py @@ -0,0 +1,17 @@ +import subprocess + + +def convert_with_abiword(docx_path, file_path): + """ + This will convert ``file_path`` to docx and place the converted file at + ``docx_path`` + """ + subprocess.call( + [ + 'abiword', + '--to=docx', + '--to-name', + docx_path, + file_path, + ], + ) diff --git a/docx2html/core.py b/docx2html/core.py index 72f0370..3aafcd4 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1,6 +1,8 @@ import cgi import os -import subprocess +import os.path +import re +import sys from PIL import Image from lxml import etree from lxml.etree import XMLSyntaxError @@ -8,29 +10,42 @@ from collections import namedtuple, defaultdict from zipfile import ZipFile, BadZipfile +from docx2html.exceptions import ( + ConversionFailed, + FileNotDocx, + MalformedDocx, + UnintendedTag, + SyntaxNotSupported, +) + + +PYTHON_VERSION = sys.version[0] + + +def _get_etree_tostring_kwargs(): + tostring_kwargs = { + 'method': 'text', + 'encoding': string_function, + } + return tostring_kwargs + + +def _string_function(): + if PYTHON_VERSION == '2': + return unicode # noqa + elif PYTHON_VERSION == '3': + return str + raise NotImplementedError('Your version of python is not supported') +string_function = _string_function() + DETECT_FONT_SIZE = False EMUS_PER_PIXEL = 9525 -# 
Abiword supported formats -VALID_EXTRACT_EXTENSIONS = [ - '.doc', '.docx', '.dotx', '.docm', '.dotm', '.wri', '.rtf', '.txt', - '.text', '.wpd', '.wp', '.odt', '.ott', '.abw', '.atw', '.pdf', '.html', - '.dot', -] ### # Help functions ### -def is_extractable(path): - """ - Determine if a file is something that we can extract. - """ - _, extension = os.path.splitext(path) - extension = extension.lower() - return (extension in VALID_EXTRACT_EXTENSIONS) - - def replace_ext(file_path, new_ext): """ >>> replace_ext('one/two/three.four.doc', '.html') @@ -129,17 +144,29 @@ def get_font_size(p, styles_dict): return None pStyle = pStyle.get('%sval' % w_namespace) font_size = None - if 'font_size' in styles_dict[pStyle]: + style_value = styles_dict.get(pStyle, None) + if style_value is None: + return None + if 'font_size' in style_value: font_size = styles_dict[pStyle]['font_size'] while font_size is None: old_pStyle = pStyle + # If pStyle is not in the styles_dict then we have to break. if pStyle not in styles_dict: break + # If based on is not in the styles_dict for pStyle then we have to + # break. if 'based_on' not in styles_dict[pStyle]: break + # Try to derive what the font size is based on what the current + # style is based on. pStyle = styles_dict[pStyle]['based_on'] if old_pStyle == pStyle: break + # If pStyle is not in styles_dict then break. 
+ if pStyle not in styles_dict: + break + # We have found a new font size font_size = styles_dict[pStyle]['font_size'] return font_size @@ -159,13 +186,14 @@ def is_natural_header(el, styles_dict): if ( style_id in styles_dict and 'header' in styles_dict[style_id] and - styles_dict[style_id]['header'] - ): + styles_dict[style_id]['header']): return styles_dict[style_id]['header'] @ensure_tag(['p']) def is_header(el, meta_data): + if _is_top_level_upper_roman(el, meta_data): + return 'h2' el_is_natural_header = is_natural_header(el, meta_data.styles_dict) if el_is_natural_header: return el_is_natural_header @@ -187,13 +215,10 @@ def is_header(el, meta_data): # If a paragraph is longer than eight words it is likely not supposed to be # an h tag. - num_words = len( - etree.tostring( - el, - encoding=unicode, - method='text', - ).split(' ') - ) + tostring_kwargs = _get_etree_tostring_kwargs() + text = etree.tostring(el, **tostring_kwargs) + text = string_function(text) + num_words = len(text.split(' ')) if num_words > 8: return False @@ -201,6 +226,7 @@ def is_header(el, meta_data): whole_line_bold, whole_line_italics = whole_line_styled(el) if whole_line_bold or whole_line_italics: return 'h2' + return False @@ -241,7 +267,34 @@ def has_text(p): this is the case we do not want that tag interfering with things like lists. Detect if this tag has any content. 
""" - return etree.tostring(p, encoding=unicode, method='text') != '' + tostring_kwargs = _get_etree_tostring_kwargs() + return '' != etree.tostring(p, **tostring_kwargs).strip() + + +def is_last_li(li, meta_data, current_numId): + """ + Determine if ``li`` is the last list item for a given list + """ + if not is_li(li, meta_data): + return False + w_namespace = get_namespace(li, 'w') + next_el = li + while True: + # If we run out of element this must be the last list item + if next_el is None: + return True + + next_el = next_el.getnext() + # Ignore elements that are not a list item + if not is_li(next_el, meta_data): + continue + + new_numId = get_numId(next_el, w_namespace) + if current_numId != new_numId: + return True + # If we have gotten here then we have found another list item in the + # current list, so ``li`` is not the last li in the list. + return False @ensure_tag(['p']) @@ -249,9 +302,10 @@ def get_li_nodes(li, meta_data): """ Find consecutive li tags that have content that have the same list id. """ - w_namespace = get_namespace(li, 'w') yield li + w_namespace = get_namespace(li, 'w') current_numId = get_numId(li, w_namespace) + starting_ilvl = get_ilvl(li, w_namespace) el = li while True: el = el.getnext() @@ -260,20 +314,24 @@ def get_li_nodes(li, meta_data): # If the tag has no content ignore it. if not has_text(el): continue - # If the next tag is not an li tag then we have found the end of the - # list. - if not is_li(el, meta_data): - break # Stop the lists if you come across a list item that should be a # heading. 
if _is_top_level_upper_roman(el, meta_data): break + if ( + is_li(el, meta_data) and + (starting_ilvl > get_ilvl(el, w_namespace))): + break + # If the list id of the next tag is different that the previous that # means a new list being made (not nested) - numId = get_numId(el, w_namespace) - if current_numId != numId: + if is_last_li(el, meta_data, current_numId): + new_numId = get_numId(el, w_namespace) + if current_numId == new_numId: + # Not a subsequent list. + yield el break yield el @@ -624,21 +682,21 @@ def get_style_dict(tree): # This is a partial document and actual h1 is the document title, which # will be displayed elsewhere. headers = { - 'Heading 1': 'h2', - 'Heading 2': 'h3', - 'Heading 3': 'h4', - 'Heading 4': 'h5', - 'Heading 5': 'h6', - 'Heading 6': 'h6', - 'Heading 7': 'h6', - 'Heading 8': 'h6', - 'Heading 9': 'h6', - 'Heading 10': 'h6', + 'heading 1': 'h2', + 'heading 2': 'h3', + 'heading 3': 'h4', + 'heading 4': 'h5', + 'heading 5': 'h6', + 'heading 6': 'h6', + 'heading 7': 'h6', + 'heading 8': 'h6', + 'heading 9': 'h6', + 'heading 10': 'h6', } if tree is None: return {} w_namespace = get_namespace(tree, 'w') - result = defaultdict(dict) + result = {} for el in tree: style_id = el.get('%sstyleId' % w_namespace) el_result = { @@ -650,7 +708,7 @@ def get_style_dict(tree): name = el.find('%sname' % w_namespace) if name is None: continue - value = name.get('%sval' % w_namespace) + value = name.get('%sval' % w_namespace).lower() if value in headers: el_result['header'] = headers[value] @@ -690,7 +748,7 @@ def get_image_sizes(tree): ext = el.find('%sext' % a_namespace) cx = int(ext.get('cx')) / EMUS_PER_PIXEL cy = int(ext.get('cy')) / EMUS_PER_PIXEL - result[get_image_id(d)] = (cx, cy) + result[get_image_id(d)] = (int(cx), int(cy)) return result @@ -845,13 +903,67 @@ def get_list_data(li_nodes, meta_data): # Store the first list created (the root list) for the return value. 
root_ol = None visited_nodes = [] + list_contents = [] + + def _build_li(list_contents): + data = '
'.join( + string_function(t) for t in list_contents if + t is not None + ) + return etree.XML('
  • %s
  • ' % data) + + def _build_non_li_content(el, meta_data): + w_namespace = get_namespace(el, 'w') + if el.tag == '%stbl' % w_namespace: + new_el, visited_nodes = get_table_data(el, meta_data) + li_content_text = etree.tostring( + new_el, + encoding=string_function, + ) + return li_content_text, visited_nodes + elif el.tag == '%sp' % w_namespace: + return get_p_data(el, meta_data), [el] + if has_text(el): + raise UnintendedTag('Did not expect %s' % el.tag) + + def _merge_lists(ilvl, current_ilvl, ol_dict, current_ol): + for i in reversed(range(ilvl, current_ilvl)): + # Any list that is more indented that ilvl needs to + # be merged to the list before it. + if i not in ol_dict: + continue + if ol_dict[i] is not current_ol: + if ol_dict[i] is current_ol: + continue + ol_dict[i][-1].append(current_ol) + current_ol = ol_dict[i] + + # Clean up finished nested lists. + for key in list(ol_dict): + if key > ilvl: + del ol_dict[key] + return current_ol + for li_node in li_nodes: w_namespace = get_namespace(li_node, 'w') + if not is_li(li_node, meta_data): + # Get the content and visited nodes + new_el, el_visited_nodes = _build_non_li_content( + li_node, + meta_data, + ) + list_contents.append(new_el) + visited_nodes.extend(el_visited_nodes) + continue + if list_contents: + li_el = _build_li(list_contents) + list_contents = [] + current_ol.append(li_el) # Get the data needed to build the current list item - text = get_p_data( + list_contents.append(get_p_data( li_node, meta_data, - ) + )) ilvl = get_ilvl(li_node, w_namespace) numId = get_numId(li_node, w_namespace) list_type = meta_data.numbering_dict[numId].get(ilvl, 'decimal') @@ -870,21 +982,12 @@ def get_list_data(li_nodes, meta_data): # than ilvl and then remove them from the ol_dict else: # Merge any nested lists that need to be merged. - for i in reversed(range(ilvl, current_ilvl)): - # Any list that is more indented that ilvl needs to - # be merged to the list before it. 
- if i not in ol_dict: - continue - if ol_dict[i] is not current_ol: - if ol_dict[i] is current_ol: - continue - ol_dict[i][-1].append(current_ol) - current_ol = ol_dict[i] - - # Clean up finished nested lists. - for key in list(ol_dict): - if key > ilvl: - del ol_dict[key] + current_ol = _merge_lists( + ilvl=ilvl, + current_ilvl=current_ilvl, + ol_dict=ol_dict, + current_ol=current_ol, + ) # Set the root list after the first list is created. if root_ol is None: @@ -903,21 +1006,23 @@ def get_list_data(li_nodes, meta_data): current_ol = ol_dict[ilvl] # Create the li element. - li_el = etree.XML('
  • %s
  • ' % text) - current_ol.append(li_el) visited_nodes.extend(list(li_node.iter())) + # If a list item is the last thing in a document, then you will need to add + # it here. Should probably figure out how to get the above logic to deal + # with it. + if list_contents: + li_el = _build_li(list_contents) + list_contents = [] + current_ol.append(li_el) + # Merge up any nested lists that have not been merged. - for i in reversed(range(0, current_ilvl)): - if i not in ol_dict: - continue - # If we do not do this check it is possible to create an infinite loop - # in etree. - if ol_dict[i] is current_ol: - continue - # append the current ol to the end of the last li tag. - ol_dict[i][-1].append(current_ol) - current_ol = ol_dict[i] + current_ol = _merge_lists( + ilvl=0, + current_ilvl=current_ilvl, + ol_dict=ol_dict, + current_ol=current_ol, + ) return root_ol, visited_nodes @@ -943,13 +1048,9 @@ def get_tr_data(tr, meta_data, row_spans): # ignored. if ( v_merge is not None and - v_merge.get('%sval' % w_namespace) != 'restart' - ): + v_merge.get('%sval' % w_namespace) != 'restart'): continue - # Create the td element with all the text break-joined. - td_el = etree.XML('') - # Loop through each and build a list of all the content. 
texts = [] for td_content in el: @@ -968,14 +1069,22 @@ def get_tr_data(tr, meta_data, row_spans): meta_data, ) visited_nodes.extend(list_visited_nodes) - texts.append(etree.tostring(list_el)) + list_el_text = etree.tostring( + list_el, + encoding=string_function, + ) + texts.append(list_el_text) elif td_content.tag == '%stbl' % w_namespace: table_el, table_visited_nodes = get_table_data( td_content, meta_data, ) visited_nodes.extend(table_visited_nodes) - texts.append(etree.tostring(table_el)) + table_el_text = etree.tostring( + table_el, + encoding=string_function, + ) + texts.append(table_el_text) elif td_content.tag == '%stcPr' % w_namespace: # Do nothing visited_nodes.append(td_content) @@ -988,7 +1097,7 @@ def get_tr_data(tr, meta_data, row_spans): ) texts.append(text) - data = '
    '.join(texts) + data = '
    '.join(t for t in texts if t is not None) td_el = etree.XML('%s' % data) # if there is a colspan then set it here. colspan = get_grid_span(el) @@ -1000,8 +1109,7 @@ def get_tr_data(tr, meta_data, row_spans): # here. if ( v_merge is not None and - v_merge.get('%sval' % w_namespace) == 'restart' - ): + v_merge.get('%sval' % w_namespace) == 'restart'): rowspan = next(row_spans) td_el.set('rowspan', '%d' % rowspan) @@ -1053,7 +1161,8 @@ def handle_t_tag( # The relationship_id is the href if hyperlink_id in meta_data.relationship_dict: href = meta_data.relationship_dict[hyperlink_id] - text = '%s' % (href, text) + # Do not do any styling on hyperlinks + return '%s' % (href, text) # Wrap the text with any modifiers it might have (bold, italics or # underline) el_is_bold = not remove_bold and ( @@ -1084,14 +1193,14 @@ def get_p_data(p, meta_data, is_td=False): """ remove_italics = False remove_bold = False - if not is_td and not is_li(p, meta_data): - # Check to see if the whole line is bold or italics. - whole_line_bold, whole_line_italics = whole_line_styled(p) - p_is_header = bool(is_header(p, meta_data) and not is_li(p, meta_data)) - # Only remove bold or italics if this tag is an h tag. - remove_bold = p_is_header and whole_line_bold - remove_italics = p_is_header and whole_line_italics + # Only remove bold or italics if this tag is an h tag. + # Td elements have the same look and feel as p/h elements. Right now we are + # never putting h tags in td elements, as such if we are in a td we will + # never be stripping bold/italics since that is only done on h tags + if not is_td and is_header(p, meta_data): + # Check to see if the whole line is bold or italics. + remove_bold, remove_italics = whole_line_styled(p) p_text = '' w_namespace = get_namespace(p, 'w') @@ -1136,7 +1245,20 @@ def get_p_data(p, meta_data, is_td=False): # Once we have the hyperlink_id then we need to replace the # hyperlink tag with its child run tag. 
- el = el.find('%sr' % w_namespace) + child_run_tag = el.find('%sr' % w_namespace) + if child_run_tag is None: + if has_text(el): + # If there is text in this hyperlink we need to raise an + # exception so that we don't lose content. + raise SyntaxNotSupported( + 'Hyperlink with text outside run tags not supported.', + ) + # It is very likely that this was a hyperlink tag that had its + # content removed, office does not do a very good job at + # cleaning up old tags, as such this tag has no content and + # should be ignored. + continue + el = child_run_tag # t tags hold all the text content. for child in get_raw_data(el): @@ -1153,6 +1275,9 @@ def get_p_data(p, meta_data, is_td=False): p_text += '
    ' else: # We have an image image_id = get_image_id(child) + if image_id not in meta_data.relationship_dict: + # This image does not have an image_id + continue src = meta_data.image_handler( image_id, meta_data.relationship_dict, @@ -1173,23 +1298,43 @@ def get_p_data(p, meta_data, is_td=False): return p_text +def _strip_tag(tree, tag): + """ + Remove all tags that have the tag name ``tag`` + """ + for el in tree.iter(): + if el.tag == tag: + el.getparent().remove(el) + + def get_zip_file_handler(file_path): return ZipFile(file_path) -def convert(file_path, image_handler=None, fall_back=None): - file_base, extension = os.path.splitext(os.path.basename(file_path)) +def read_html_file(file_path): + with open(file_path) as f: + html = f.read() + return html - if not is_extractable(file_path): - #XXX create better exception, used to be InvalidFileExtension - raise Exception( - 'The file type "%s" is not supported' % extension - ) - if extension == '.html': - with open(file_path) as f: - html = f.read() - return html +def convert(file_path, image_handler=None, fall_back=None, converter=None): + """ + ``file_path`` is a path to the file on the file system that you want to be + converted to html. + ``image_handler`` is a function that takes an image_id and a + relationship_dict to generate the src attribute for images. (see readme + for more details) + ``fall_back`` is a function that takes a ``file_path``. This function will + only be called if for whatever reason the conversion fails. 
+ ``converter`` is a function to convert a document that is not docx to docx + (examples in docx2html.converters) + + Returns html extracted from ``file_path`` + """ + file_base, extension = os.path.splitext(os.path.basename(file_path)) + + if extension == '.html' or extension == '.htm': + return read_html_file(file_path) # Create the converted file as a file in the same dir with the # same name only with a .docx extension @@ -1198,25 +1343,20 @@ def convert(file_path, image_handler=None, fall_back=None): # If the file is already html, just leave it in place. docx_path = file_path else: - # Convert the file to docx - # TODO make this configurable. - subprocess.call( - ['abiword', '--to=docx', '--to-name', docx_path, file_path], - ) + if converter is None: + raise FileNotDocx('The file passed in is not a docx.') + converter(docx_path, file_path) + if not os.path.isfile(docx_path): + if fall_back is None: + raise ConversionFailed('Conversion to docx failed.') + else: + return fall_back(file_path) + try: # Docx files are actually just zip files. zf = get_zip_file_handler(docx_path) except BadZipfile: - # If its a malformed zip file raise InvalidFileExtension - # XXX - raise Exception('This file is not a docx') - except IOError: - # This means that the conversion from abiword failed. - if fall_back is not None: - return fall_back(file_path) - else: - # XXX - raise Exception('Conversion to docx failed.') + raise MalformedDocx('This file is not a docx') # Need to populate the xml based on word/document.xml tree, meta_data = _get_document_data(zf, image_handler) @@ -1230,33 +1370,49 @@ def create_html(tree, meta_data): w_namespace = get_namespace(tree, 'w') visited_nodes = [] + + _strip_tag(tree, '%ssectPr' % w_namespace) for el in tree.iter(): # The way lists are handled could double visit certain elements; keep # track of which elements have been visited and skip any that have been # visited already. 
if el in visited_nodes: continue - if el.tag == '%sp' % w_namespace: - # If this is true we have a bullet in some list + header_value = is_header(el, meta_data) + if is_header(el, meta_data): + p_text = get_p_data(el, meta_data) + if p_text == '': + continue + new_html.append( + etree.XML('<%s>%s' % ( + header_value, + p_text, + header_value, + )) + ) + elif el.tag == '%sp' % w_namespace: + # Strip out titles. + if is_title(el): + continue if is_li(el, meta_data): - # This should be a header instead. - if _is_top_level_upper_roman(el, meta_data): - p_text = get_p_data(el, meta_data) - new_html.append( - etree.XML('

    %s

    ' % p_text) - ) - continue # Parse out the needed info from the node. li_nodes = get_li_nodes(el, meta_data) - list_el, list_visited_nodes = get_list_data( + new_el, list_visited_nodes = get_list_data( li_nodes, meta_data, ) visited_nodes.extend(list_visited_nodes) - new_html.append(list_el) - continue + # Handle generic p tag here. + else: + p_text = get_p_data(el, meta_data) + # If there is not text do not add an empty tag. + if p_text == '': + continue - if el.tag == '%stbl' % w_namespace: + new_el = etree.XML('

    %s

    ' % p_text) + new_html.append(new_el) + + elif el.tag == '%stbl' % w_namespace: table_el, table_visited_nodes = get_table_data( el, meta_data, @@ -1265,33 +1421,29 @@ def create_html(tree, meta_data): new_html.append(table_el) continue - # Handle generic p tag here. - if el.tag == '%sp' % w_namespace: - # Strip out titles. - if is_title(el): - continue - - # If there is not text do not add an empty tag. - p_text = get_p_data(el, meta_data) - if p_text == '': - continue - - # Check to see if its a header - header_value = is_header(el, meta_data) - if header_value: - # Make a header based of the header_value - new_html.append( - etree.XML('<%s>%s' % ( - header_value, - p_text, - header_value, - )) - ) - else: - # Make a paragraph - new_html.append(etree.XML('

    %s

    ' % p_text)) - continue - # Keep track of visited_nodes visited_nodes.append(el) - return etree.tostring(new_html) + result = etree.tostring( + new_html, + method='html', + encoding=string_function, + with_tail=True, + ) + return _make_void_elements_self_close(result) + + +def _make_void_elements_self_close(html): + #XXX Hack not sure how to get etree to do this by default. + void_tags = [ + r'br', + r'img', + ] + html = string_function(html) + for tag in void_tags: + regex = re.compile(r'<%s.*?>' % tag) + matches = regex.findall(html) + for match in matches: + new_tag = match.strip('<>') + new_tag = '<%s />' % new_tag + html = re.sub(match, new_tag, html) + return html diff --git a/docx2html/exceptions.py b/docx2html/exceptions.py new file mode 100644 index 0000000..d50f1a0 --- /dev/null +++ b/docx2html/exceptions.py @@ -0,0 +1,22 @@ +class Docx2HtmlException(Exception): + pass + + +class ConversionFailed(Docx2HtmlException): + pass + + +class FileNotDocx(Docx2HtmlException): + pass + + +class MalformedDocx(Docx2HtmlException): + pass + + +class UnintendedTag(Docx2HtmlException): + pass + + +class SyntaxNotSupported(Docx2HtmlException): + pass diff --git a/docx2html/fixtures/attachment_is_tiff.docx b/docx2html/fixtures/attachment_is_tiff.docx new file mode 100644 index 0000000..774362c Binary files /dev/null and b/docx2html/fixtures/attachment_is_tiff.docx differ diff --git a/docx2html/fixtures/bigger_font_size_to_header.docx b/docx2html/fixtures/bigger_font_size_to_header.docx new file mode 100644 index 0000000..c722888 Binary files /dev/null and b/docx2html/fixtures/bigger_font_size_to_header.docx differ diff --git a/docx2html/fixtures/convert_p_to_h.docx b/docx2html/fixtures/convert_p_to_h.docx new file mode 100644 index 0000000..53769e1 Binary files /dev/null and b/docx2html/fixtures/convert_p_to_h.docx differ diff --git a/docx2html/fixtures/fake_headings_by_length.docx b/docx2html/fixtures/fake_headings_by_length.docx new file mode 100644 index 
0000000..a130f5b Binary files /dev/null and b/docx2html/fixtures/fake_headings_by_length.docx differ diff --git a/docx2html/fixtures/greek_alphabet.docx b/docx2html/fixtures/greek_alphabet.docx new file mode 100644 index 0000000..46ab542 Binary files /dev/null and b/docx2html/fixtures/greek_alphabet.docx differ diff --git a/docx2html/fixtures/has_image.docx b/docx2html/fixtures/has_image.docx new file mode 100644 index 0000000..2ebd0bd Binary files /dev/null and b/docx2html/fixtures/has_image.docx differ diff --git a/docx2html/fixtures/has_missing_image.docx b/docx2html/fixtures/has_missing_image.docx new file mode 100644 index 0000000..996e667 Binary files /dev/null and b/docx2html/fixtures/has_missing_image.docx differ diff --git a/docx2html/fixtures/has_title.docx b/docx2html/fixtures/has_title.docx new file mode 100644 index 0000000..a87d88e Binary files /dev/null and b/docx2html/fixtures/has_title.docx differ diff --git a/docx2html/fixtures/header_footer_problem.docx b/docx2html/fixtures/header_footer_problem.docx new file mode 100644 index 0000000..6bc49a7 Binary files /dev/null and b/docx2html/fixtures/header_footer_problem.docx differ diff --git a/docx2html/fixtures/headers.docx b/docx2html/fixtures/headers.docx new file mode 100644 index 0000000..890104c Binary files /dev/null and b/docx2html/fixtures/headers.docx differ diff --git a/docx2html/fixtures/headers_with_full_line_styles.docx b/docx2html/fixtures/headers_with_full_line_styles.docx new file mode 100644 index 0000000..38d6f6a Binary files /dev/null and b/docx2html/fixtures/headers_with_full_line_styles.docx differ diff --git a/docx2html/fixtures/inline_tags.docx b/docx2html/fixtures/inline_tags.docx new file mode 100644 index 0000000..4aba234 Binary files /dev/null and b/docx2html/fixtures/inline_tags.docx differ diff --git a/docx2html/fixtures/list_in_table.docx b/docx2html/fixtures/list_in_table.docx new file mode 100644 index 0000000..d1a8738 Binary files /dev/null and 
b/docx2html/fixtures/list_in_table.docx differ diff --git a/docx2html/fixtures/list_to_header.docx b/docx2html/fixtures/list_to_header.docx new file mode 100644 index 0000000..f9b3946 Binary files /dev/null and b/docx2html/fixtures/list_to_header.docx differ diff --git a/docx2html/fixtures/lists_with_styles.docx b/docx2html/fixtures/lists_with_styles.docx new file mode 100644 index 0000000..c1c7ecf Binary files /dev/null and b/docx2html/fixtures/lists_with_styles.docx differ diff --git a/docx2html/fixtures/missing_content.docx b/docx2html/fixtures/missing_content.docx new file mode 100644 index 0000000..21bed96 Binary files /dev/null and b/docx2html/fixtures/missing_content.docx differ diff --git a/docx2html/fixtures/nested_lists.docx b/docx2html/fixtures/nested_lists.docx new file mode 100644 index 0000000..f4000df Binary files /dev/null and b/docx2html/fixtures/nested_lists.docx differ diff --git a/docx2html/fixtures/nested_table_rowspan.docx b/docx2html/fixtures/nested_table_rowspan.docx new file mode 100644 index 0000000..b43b8a0 Binary files /dev/null and b/docx2html/fixtures/nested_table_rowspan.docx differ diff --git a/docx2html/fixtures/nested_tables.docx b/docx2html/fixtures/nested_tables.docx new file mode 100644 index 0000000..af704d4 Binary files /dev/null and b/docx2html/fixtures/nested_tables.docx differ diff --git a/docx2html/fixtures/resized_image.docx b/docx2html/fixtures/resized_image.docx new file mode 100644 index 0000000..913099c Binary files /dev/null and b/docx2html/fixtures/resized_image.docx differ diff --git a/docx2html/fixtures/shift_enter.docx b/docx2html/fixtures/shift_enter.docx new file mode 100644 index 0000000..4128c0a Binary files /dev/null and b/docx2html/fixtures/shift_enter.docx differ diff --git a/docx2html/fixtures/simple.docx b/docx2html/fixtures/simple.docx new file mode 100644 index 0000000..f75e5eb Binary files /dev/null and b/docx2html/fixtures/simple.docx differ diff --git a/docx2html/fixtures/simple_lists.docx 
b/docx2html/fixtures/simple_lists.docx new file mode 100644 index 0000000..c09ad74 Binary files /dev/null and b/docx2html/fixtures/simple_lists.docx differ diff --git a/docx2html/fixtures/special_chars.docx b/docx2html/fixtures/special_chars.docx new file mode 100644 index 0000000..b4b9287 Binary files /dev/null and b/docx2html/fixtures/special_chars.docx differ diff --git a/docx2html/fixtures/split_header.docx b/docx2html/fixtures/split_header.docx new file mode 100644 index 0000000..cc4bd5c Binary files /dev/null and b/docx2html/fixtures/split_header.docx differ diff --git a/docx2html/fixtures/table_col_row_span.docx b/docx2html/fixtures/table_col_row_span.docx new file mode 100644 index 0000000..856abfd Binary files /dev/null and b/docx2html/fixtures/table_col_row_span.docx differ diff --git a/docx2html/fixtures/tables_in_lists.docx b/docx2html/fixtures/tables_in_lists.docx new file mode 100644 index 0000000..1185954 Binary files /dev/null and b/docx2html/fixtures/tables_in_lists.docx differ diff --git a/docx2html/fixtures/track_changes_on.docx b/docx2html/fixtures/track_changes_on.docx new file mode 100644 index 0000000..dcb7ba1 Binary files /dev/null and b/docx2html/fixtures/track_changes_on.docx differ diff --git a/docx2html/fixtures/upper_alpha_all_bold.docx b/docx2html/fixtures/upper_alpha_all_bold.docx new file mode 100644 index 0000000..d518b2c Binary files /dev/null and b/docx2html/fixtures/upper_alpha_all_bold.docx differ diff --git a/docx2html/tests.py b/docx2html/tests.py deleted file mode 100644 index e69de29..0000000 diff --git a/docx2html/tests/__init__.py b/docx2html/tests/__init__.py new file mode 100644 index 0000000..87359c8 --- /dev/null +++ b/docx2html/tests/__init__.py @@ -0,0 +1,126 @@ +from __future__ import print_function + +from unittest import TestCase +import re + +from docx2html.core import ( + MetaData, + create_html, + string_function, +) + + +def assert_html_equal(actual_html, expected_html): + actual_collapsed = 
collapse_html(actual_html) + expected_collapsed = collapse_html(expected_html) + print(actual_collapsed) + print(expected_collapsed) + assert actual_collapsed == expected_collapsed + + +def collapse_html(html): + """ + Remove insignificant whitespace from the html. + + >>> print(collapse_html('''\\ + ...

    + ... Heading + ...

    + ... ''')) +

    Heading

    + >>> print(collapse_html('''\\ + ...

    + ... Paragraph with + ... multiple lines. + ...

    + ... ''')) +

    Paragraph with multiple lines.

    + """ + def smart_space(match): + # Put a space in between lines, unless exactly one side of the line + # break butts up against a tag. + before = match.group(1) + after = match.group(2) + space = ' ' + if before == '>' or after == '<': + space = '' + return before + space + after + # Replace newlines and their surrounding whitespace with a single space (or + # empty string) + html = re.sub( + r'(>?)\s*\n\s*( + + {{ body }} + diff --git a/docx2html/tests/templates/drawing.xml b/docx2html/tests/templates/drawing.xml new file mode 100644 index 0000000..9b5e5cd --- /dev/null +++ b/docx2html/tests/templates/drawing.xml @@ -0,0 +1,61 @@ + + + + + + + + + + + 2397125 + + + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docx2html/tests/templates/hyperlink.xml b/docx2html/tests/templates/hyperlink.xml new file mode 100644 index 0000000..8364594 --- /dev/null +++ b/docx2html/tests/templates/hyperlink.xml @@ -0,0 +1,5 @@ + + {% for run_tag in run_tags %} + {{ run_tag }} + {% endfor %} + diff --git a/docx2html/tests/templates/p.xml b/docx2html/tests/templates/p.xml new file mode 100644 index 0000000..ab376ca --- /dev/null +++ b/docx2html/tests/templates/p.xml @@ -0,0 +1,14 @@ + + + + {% if is_list %} + + + + + {% endif %} + + {% for run_tag in run_tags %} + {{ run_tag }} + {% endfor %} + diff --git a/docx2html/tests/templates/pict.xml b/docx2html/tests/templates/pict.xml new file mode 100644 index 0000000..5dfa377 --- /dev/null +++ b/docx2html/tests/templates/pict.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + {% if r_id %}{% endif %} + + + + diff --git a/docx2html/tests/templates/r.xml b/docx2html/tests/templates/r.xml new file mode 100644 index 0000000..660c33c --- /dev/null +++ b/docx2html/tests/templates/r.xml @@ -0,0 +1,8 @@ + + + {% if is_bold %} + + {% endif %} + + {% include 't.xml' %} + diff --git a/docx2html/tests/templates/sectPr.xml b/docx2html/tests/templates/sectPr.xml new file mode 100644 index 
0000000..16a1205 --- /dev/null +++ b/docx2html/tests/templates/sectPr.xml @@ -0,0 +1,3 @@ + + {{ p_tag }} + diff --git a/docx2html/tests/templates/style.xml b/docx2html/tests/templates/style.xml new file mode 100644 index 0000000..5fa9f00 --- /dev/null +++ b/docx2html/tests/templates/style.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/docx2html/tests/templates/styles.xml b/docx2html/tests/templates/styles.xml new file mode 100644 index 0000000..a30e752 --- /dev/null +++ b/docx2html/tests/templates/styles.xml @@ -0,0 +1,6 @@ + + + {% for style in style_tags %} + {{ style }} + {% endfor %} + diff --git a/docx2html/tests/templates/t.xml b/docx2html/tests/templates/t.xml new file mode 100644 index 0000000..92412f7 --- /dev/null +++ b/docx2html/tests/templates/t.xml @@ -0,0 +1 @@ +{{ text }} diff --git a/docx2html/tests/templates/table.xml b/docx2html/tests/templates/table.xml new file mode 100644 index 0000000..e47783b --- /dev/null +++ b/docx2html/tests/templates/table.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + {% for table_row in table_rows %} + {{ table_row }} + {% endfor %} + diff --git a/docx2html/tests/templates/tc.xml b/docx2html/tests/templates/tc.xml new file mode 100644 index 0000000..b9e38ae --- /dev/null +++ b/docx2html/tests/templates/tc.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + {{ p_tag }} + diff --git a/docx2html/tests/templates/tr.xml b/docx2html/tests/templates/tr.xml new file mode 100644 index 0000000..6e2f692 --- /dev/null +++ b/docx2html/tests/templates/tr.xml @@ -0,0 +1,8 @@ + + + + + {% for table_cell in table_cells %} + {{ table_cell }} + {% endfor %} + diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py new file mode 100644 index 0000000..8ba710c --- /dev/null +++ b/docx2html/tests/test_docx.py @@ -0,0 +1,751 @@ +import mock +import tempfile +import shutil +from os import path +from zipfile import ZipFile +from nose.plugins.skip import SkipTest +from nose.tools import assert_raises + 
+from docx2html.tests import assert_html_equal +from docx2html import convert +from docx2html.core import ( + _get_document_data, + DETECT_FONT_SIZE, +) +from docx2html.exceptions import ( + ConversionFailed, +) + + +def test_extract_html(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'simple.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    + Simple text +

    +
      +
    1. one
    2. +
    3. two
    4. +
    5. three
    6. +
    + + + + + + + + + +
    Cell1Cell2
    Cell3cell4
    + + ''') + + +def test_nested_list(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'nested_lists.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +
      +
    1. one
    2. +
    3. two
    4. +
    5. three +
        +
      1. AAA
      2. +
      3. BBB
      4. +
      5. CCC +
          +
        1. alpha
        2. +
        +
      6. +
      +
    6. +
    7. four
    8. +
    +
      +
    1. xxx +
        +
      1. yyy
      2. +
      +
    2. +
    +
      +
    • www +
        +
      • zzz
      • +
      +
    • +
    + + ''') + + +def test_simple_list(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'simple_lists.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +
      +
    1. One
    2. +
    +
      +
    • two
    • +
    + + ''') + + +def test_inline_tags(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'inline_tags.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' +

    This sentence has some bold, some italics and some underline, as well as a hyperlink.

    ''') # noqa + + +def test_unicode(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'greek_alphabet.docx', + ) + actual_html = convert(file_path) + assert actual_html is not None + + +def test_special_chars(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'special_chars.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' +

    & < > link

    ''') # noqa + + +def test_table_col_row_span(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'table_col_row_span.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + + + + + + + + + + + + + + + + + + + +
    AAA
    BBBCCC
    DDD
    EEEFFF
    GGG
    + + + + + + + + + + + + + + + + + + + + + + +
    1234
    567
    89
    10111213
    + + ''') + + +def test_nested_table_rowspan(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'nested_table_rowspan.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + + + + + + + + + +
    AAA
    BBB + + + + + + + + +
    CCCDDD
    EEE
    +
    +
    + + ''') + + +def test_nested_tables(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'nested_tables.docx', + ) + actual_html = convert(file_path) + # Find out why br tag is there. + assert_html_equal(actual_html, ''' + + + + + + + + + + +
    AAABBB
    CCC + + + + + + + + + +
    DDDEEE
    FFFGGG
    +
    +
    + + ''') + + +def test_list_in_table(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'list_in_table.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + + + + + +
    +
      +
    1. AAA
    2. +
    3. BBB
    4. +
    5. CCC
    6. +
    +
    + + ''') + + +def test_tables_in_lists(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'tables_in_lists.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +
      +
    1. AAA
    2. +
    3. BBB
      + + + + + + + + + +
      CCCDDD
      EEEFFF
      +
    4. +
    5. GGG
    6. +
    + + ''') + + +def test_track_changes_on(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'track_changes_on.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' +

    This was some content.

    + ''') + + +def test_headers(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'headers.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    This is an H1

    +

    This is an H2

    +

    This is an H3

    +
    This is an H4
    +
    This is an H5
    +
    This is an H6
    +
    This is an H7
    +
    This is an H8
    +
    This is an H9
    +
    This is an H10
    + + ''') + + +def _copy_file_to_tmp_dir(file_path, filename): + # Since the images need to be extracted from the docx, copy the file to a + # temp directory so we do not clutter up repo. + dp = tempfile.mkdtemp() + new_file_path = path.join(dp, filename) + shutil.copyfile(file_path, new_file_path) + return new_file_path, dp + + +def test_split_headers(): + filename = 'split_header.docx' + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'split_header.docx', + ) + # preserve_images must be true in order for the image to not be removed. + # This is handled in build_import, however here we need to manually set it + # to True. + new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename) + + def image_handler(*args, **kwargs): + return 'test' + actual_html = convert(new_file_path, image_handler=image_handler) + assert_html_equal(actual_html, ''' +

    AAA

    BBB

    CCC

    + ''') + + +def test_has_image(): + filename = 'has_image.docx' + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'has_image.docx', + ) + # preserve_images must be true in order for the image to not be removed. + # This is handled in build_import, however here we need to manually set it + # to True. + new_file_path, dp = _copy_file_to_tmp_dir(file_path, filename) + + actual_html = convert(new_file_path) + assert_html_equal(actual_html, ''' + +

    AAA

    + + ''' % dp) + + +def test_has_image_using_image_handler(): + filename = 'has_image.docx' + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'has_image.docx', + ) + # preserve_images must be true in order for the image to not be removed. + # This is handled in build_import, however here we need to manually set it + # to True. + new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename) + + def image_handler(*args, **kwargs): + return 'test' + actual_html = convert(new_file_path, image_handler=image_handler) + assert_html_equal(actual_html, ''' + +

    AAA

    + ''') + + +def test_attachment_is_tiff(): + filename = 'attachment_is_tiff.docx' + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'attachment_is_tiff.docx', + ) + # preserve_images must be true in order for the image to not be removed. + # This is handled in build_import, however here we need to manually set it + # to True. + new_file_path, _ = _copy_file_to_tmp_dir(file_path, filename) + + # First open the file and verify that the image attachment is a tiff. + try: + zf = ZipFile(new_file_path) + # Get the document data. + _, meta_data = _get_document_data(zf) + finally: + zf.close() + # Find the path to the image. + image_file = None + for file_path in meta_data.relationship_dict.values(): + if file_path.endswith('.gif'): + image_file = file_path + assert image_file is not None + with open(image_file, 'rb') as f: + magic_number = f.read()[:4] + # Make sure the image is actually a gif. + assert magic_number == b'GIF8', magic_number + + +def test_headers_with_full_line_styles(): + # Show that if a natural header is completely bold/italics that + # bold/italics will get stripped out. + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'headers_with_full_line_styles.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    AAA

    +

    BBB

    +

    CCC

    + + ''') + + +def test_convert_p_to_h(): + # Show when it is correct to convert a p tag to an h tag based on + # bold/italics + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'convert_p_to_h.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    AAA

    +

    BBB

    +

    CCC

    +
      +
    1. DDD
    2. +
    3. EEE
    4. +
    5. FFF
    6. +
    + + + + + + + + + +
    GGGHHH
    IIIJJJ
    + + ''') + + +def test_bigger_font_size_to_header(): + # Show when it is appropriate to convert p tags to h tags based on font + # size. + if not DETECT_FONT_SIZE: + raise SkipTest('Font size detection is disabled.') + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'bigger_font_size_to_header.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    Paragraphs:

    +

    Header

    +

    paragraph 1

    +

    Lists:

    +
      +
    1. bigger
    2. +
    3. smaller
    4. +
    +

    Tables:

    + + + + + +
    biggersmaller
    + + ''') + + +def test_fake_headings_by_length(): + # Show that converting p tags to h tags has a length limit. If the p tag is + # supposed to be converted to an h tag but has more than seven words in the + # paragraph do not convert it. + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'fake_headings_by_length.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    Heading.

    +

    Still a heading.

    +

    + This is not a heading because it is too many words. +

    + + ''') + + +def test_shift_enter(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'shift_enter.docx', + ) + + # Test just the convert without clean_html to make sure the first + # break tag is present. + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    AAA
    BBB

    +

    CCC

    +
      +
    1. DDD
      EEE
    2. +
    3. FFF
    4. +
    + + + + + + + + + +
    GGG
    HHH
    III
    JJJ
    KKKLLL
    + + ''') + + +def test_lists_with_styles(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'lists_with_styles.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +
      +
    1. AAA
    2. +
    3. BBB +
        +
      1. CCC
      2. +
      3. DDD +
          +
        1. EEE +
            +
          1. FFF
          2. +
          +
        2. +
        +
      4. +
      +
    4. +
    + + ''') + + +def test_list_to_header(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'list_to_header.docx', + ) + actual_html = convert(file_path) + # It should be noted that list item `GGG` is upper roman in the word + # document to show that only top level upper romans get converted. + assert_html_equal(actual_html, ''' + +

    AAA

    +
      +
    1. BBB
    2. +
    +

    CCC

    +
      +
    1. DDD
    2. +
    +

    EEE

    +
      +
    1. FFF +
        +
      1. GGG
      2. +
      +
    2. +
    + + ''') + + +def test_has_title(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'has_title.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, '''

    Text

    ''') + + +def test_upper_alpha_all_bold(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'upper_alpha_all_bold.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, ''' + +

    AAA

    +

    BBB

    +

    CCC

    + + ''') + + +def _converter(*args, **kwargs): + # Having a converter that does nothing is the same as if abiword fails to + # convert. + pass + + +def test_converter_broken(): + file_path = 'test.doc' + assert_raises( + ConversionFailed, + lambda: convert(file_path, converter=_converter), + ) + + +def test_fall_back(): + file_path = 'test.doc' + + def fall_back(*args, **kwargs): + return 'success' + html = convert(file_path, fall_back=fall_back, converter=_converter) + assert html == 'success' + + +@mock.patch('docx2html.core.read_html_file') +@mock.patch('docx2html.core.get_zip_file_handler') +def test_html_files(patch_zip_handler, patch_read): + def raise_assertion(*args, **kwargs): + raise AssertionError('Should not have called get_zip_file_handler') + patch_zip_handler.side_effect = raise_assertion + + def return_text(*args, **kwargs): + return 'test' + patch_read.side_effect = return_text + + # Try with an html file + file_path = 'test.html' + + html = convert(file_path) + assert html == 'test' + + # Try again with an htm file. + file_path = 'test.htm' + + html = convert(file_path) + assert html == 'test' diff --git a/docx2html/tests/test_xml.py b/docx2html/tests/test_xml.py new file mode 100644 index 0000000..52bca28 --- /dev/null +++ b/docx2html/tests/test_xml.py @@ -0,0 +1,624 @@ +import mock +from itertools import chain +from lxml import etree +from copy import copy + +from docx2html.core import ( + _is_top_level_upper_roman, + create_html, + get_style_dict, + get_font_size, + get_image_id, + get_li_nodes, + get_namespace, + is_last_li, +) +from docx2html.tests.document_builder import DocxBuilder as DXB +from docx2html.tests import ( + _TranslationTestCase, + assert_html_equal, +) + + +class SimpleListTestCase(_TranslationTestCase): + expected_output = ''' + +
      +
    1. AAA
    2. +
    3. BBB
    4. +
    5. CCC
    6. +
    + + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', 0, 1), + ('CCC', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return etree.fromstring(xml) + + def test_get_li_nodes(self): + tree = self.get_xml() + meta_data = self.get_meta_data() + w_namespace = get_namespace(tree, 'w') + first_p_tag = tree.find('%sp' % w_namespace) + + li_data = get_li_nodes(first_p_tag, meta_data) + assert len(list(li_data)) == 3 + + def test_is_last_li(self): + tree = self.get_xml() + meta_data = self.get_meta_data() + p_tags = tree.xpath('.//w:p', namespaces=tree.nsmap) + result = [is_last_li(p, meta_data, current_numId='1') for p in p_tags] + self.assertEqual( + result, + [False, False, True], + ) + + +class TableInListTestCase(_TranslationTestCase): + expected_output = ''' + +
      +
    1. AAA
      + + + + + + + + + +
      BBBCCC
      DDDEEE
      +
    2. +
    3. FFF
    4. +
    +

    GGG

    + + ''' + + def get_xml(self): + table = DXB.table(num_rows=2, num_columns=2, text=chain( + [DXB.p_tag('BBB')], + [DXB.p_tag('CCC')], + [DXB.p_tag('DDD')], + [DXB.p_tag('EEE')], + )) + + # Nest that table in a list. + first_li = DXB.li(text='AAA', ilvl=0, numId=1) + second = DXB.li(text='FFF', ilvl=0, numId=1) + p_tag = DXB.p_tag('GGG') + + body = '' + for el in [first_li, table, second, p_tag]: + body += el + xml = DXB.xml(body) + return etree.fromstring(xml) + + def test_get_li_nodes_with_nested_table(self): + # Create a table + tree = self.get_xml() + meta_data = self.get_meta_data() + w_namespace = get_namespace(tree, 'w') + first_p_tag = tree.find('%sp' % w_namespace) + + # Show that list nesting deals with the table nesting + li_data = get_li_nodes(first_p_tag, meta_data) + assert len(list(li_data)) == 3 + + def test_is_last_li(self): + tree = self.get_xml() + meta_data = self.get_meta_data() + result = [is_last_li(el, meta_data, current_numId='1') for el in tree] + self.assertEqual( + result, + # None list items are ignored + [False, False, True, False], + ) + + +class RomanNumeralToHeadingTestCase(_TranslationTestCase): + numbering_dict = { + '1': { + 0: 'upperRoman', + 1: 'decimal', + 2: 'upperRoman', + } + } + expected_output = ''' + +

    AAA

    +
      +
    1. BBB
    2. +
    +

    CCC

    +
      +
    1. DDD
    2. +
    +

    EEE

    +
      +
    1. FFF +
        +
      1. GGG
      2. +
      +
    2. +
    + + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', 1, 1), + ('CCC', 0, 1), + ('DDD', 1, 1), + ('EEE', 0, 1), + ('FFF', 1, 1), + ('GGG', 2, 1), + ] + body = '' + for text, ilvl, numId in li_text: + body += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(body) + return etree.fromstring(xml) + + def test_is_top_level_upper_roman(self): + tree = self.get_xml() + w_namespace = get_namespace(tree, 'w') + meta_data = self.get_meta_data() + + result = [] + for p in tree.findall('%sp' % w_namespace): + result.append( + _is_top_level_upper_roman(p, meta_data) + ) + self.assertEqual( + result, + [ + True, # AAA + False, # BBB + True, # CCC + False, # DDD + True, # EEE + False, # FFF + False, # GGG - notice this is upper roman but not in the root + ] + ) + + +class RomanNumeralToHeadingAllBoldTestCase(_TranslationTestCase): + numbering_dict = { + '1': { + 0: 'upperRoman', + } + } + expected_output = ''' + +

    AAA

    +

    BBB

    +

    CCC

    + + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', 0, 1), + ('CCC', 0, 1), + ] + body = '' + for text, ilvl, numId in li_text: + body += DXB.li(text=text, ilvl=ilvl, numId=numId, bold=True) + + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class ImageTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'media/image1.jpeg', + 'rId1': 'media/image2.jpeg', + } + image_sizes = { + 'rId0': (4, 4), + 'rId1': (4, 4), + } + expected_output = ''' + +

    + +

    +

    + +

    + + ''' + + @staticmethod + def image_handler(image_id, relationship_dict): + return relationship_dict.get(image_id) + + def get_xml(self): + drawing = DXB.drawing('rId0') + pict = DXB.pict('rId1') + tags = [ + drawing, + pict, + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return etree.fromstring(xml) + + def test_get_image_id(self): + tree = self.get_xml() + els = [] + w_namespace = get_namespace(tree, 'w') + for el in tree.iter(): + if el.tag == '%sdrawing' % w_namespace: + els.append(el) + if el.tag == '%spict' % w_namespace: + els.append(el) + image_ids = [] + for el in els: + image_ids.append(get_image_id(el)) + self.assertEqual( + image_ids, + [ + 'rId0', + 'rId1', + ] + ) + + @mock.patch('docx2html.core._get_image_size_from_image') + def test_missing_size(self, patched_item): + def side_effect(*args, **kwargs): + return (6, 6) + patched_item.side_effect = side_effect + tree = self.get_xml() + meta_data = copy(self.get_meta_data()) + del meta_data.image_sizes['rId1'] + + html = create_html(tree, meta_data) + + # Show that the height and width were grabbed from the actual image. + assert_html_equal(html, ''' + +

    + +

    +

    + +

    + + ''') + + +class ListWithContinuationTestCase(_TranslationTestCase): + expected_output = ''' + +
      +
    1. AAA
      BBB
    2. +
    3. CCC
      + + + + + + + + + +
      DDDEEE
      FFFGGG
      +
    4. +
    5. HHH
    6. +
    + + ''' + + def get_xml(self): + table = DXB.table(num_rows=2, num_columns=2, text=chain( + [DXB.p_tag('DDD')], + [DXB.p_tag('EEE')], + [DXB.p_tag('FFF')], + [DXB.p_tag('GGG')], + )) + tags = [ + DXB.li(text='AAA', ilvl=0, numId=1), + DXB.p_tag('BBB'), + DXB.li(text='CCC', ilvl=0, numId=1), + table, + DXB.li(text='HHH', ilvl=0, numId=1), + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class PictImageTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'media/image1.jpeg', + } + image_sizes = { + 'rId0': (4, 4), + } + expected_output = ''' + +

    + +

    + + ''' + + @staticmethod + def image_handler(image_id, relationship_dict): + return relationship_dict.get(image_id) + + def get_xml(self): + pict = DXB.pict('rId0') + tags = [ + pict, + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return etree.fromstring(xml) + + def test_image_id_for_pict(self): + tree = self.get_xml() + + # Get all the pict tags + pict_tags = tree.xpath('.//w:pict', namespaces=tree.nsmap) + self.assertEqual(len(pict_tags), 1) + + # Get the image id for the pict tag. + pict_tag = pict_tags[0] + image_id = get_image_id(pict_tag) + self.assertEqual(image_id, 'rId0') + + +class PictImageMissingIdTestCase(_TranslationTestCase): + expected_output = ''' + + ''' + + def get_xml(self): + pict = DXB.pict(None) + tags = [ + pict, + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class TableWithInvalidTag(_TranslationTestCase): + expected_output = ''' + + + + + + + + + + +
    AAABBB
    DDD
    + + ''' + + def get_xml(self): + table = DXB.table(num_rows=2, num_columns=2, text=chain( + [DXB.p_tag('AAA')], + [DXB.p_tag('BBB')], + # This tag may have CCC in it, however this tag has no meaning + # pertaining to content. + ['CCC'], + [DXB.p_tag('DDD')], + )) + body = table + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class HyperlinkStyledTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = ''' + +

    link.

    + + ''' + + def get_xml(self): + run_tags = [] + run_tags.append(DXB.r_tag('link', is_bold=True)) + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + run_tags.append(DXB.r_tag('.', is_bold=False)) + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class HyperlinkNoTextTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = ''' + + + ''' + + def get_xml(self): + run_tags = [] + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class HyperlinkVanillaTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = ''' + +

    link.

    + + ''' + + def get_xml(self): + run_tags = [] + run_tags.append(DXB.r_tag('link', is_bold=False)) + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + run_tags.append(DXB.r_tag('.', is_bold=False)) + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class MissingFontInfoTestCase(_TranslationTestCase): + styles_dict = { + 'BodyText': { + 'header': False, 'font_size': None, 'based_on': 'Normal', + }, + } + + expected_output = ''' + +

    AAA

    + + ''' + + def get_xml(self): + p_tag = ''' + + + + + + + + + + + + + + + AAA + + + ''' + xml = DXB.xml(p_tag) + return etree.fromstring(xml) + + def test_get_font_size(self): + tree = self.get_xml() + w_namespace = get_namespace(tree, 'w') + p_tag = tree.find('%sp' % w_namespace) + self.assertNotEqual(p_tag, None) + self.assertEqual( + get_font_size(p_tag, self.styles_dict), + None, + ) + + def test_get_font_size_empty_styles_dict(self): + tree = self.get_xml() + w_namespace = get_namespace(tree, 'w') + p_tag = tree.find('%sp' % w_namespace) + self.assertNotEqual(p_tag, None) + self.assertEqual( + get_font_size(p_tag, {}), + None, + ) + + +class HeaderFooterTagsWithContent(_TranslationTestCase): + expected_output = ''' + +
      +
    1. AAA
    2. +
    + + ''' + + def get_xml(self): + li = DXB.li(text='AAA', ilvl=0, numId=1) + p_tag = DXB.p_tag('BBB') + footer_tag = DXB.sectPr_tag(p_tag) + body = li + footer_tag + xml = DXB.xml(body) + return etree.fromstring(xml) + + +class StylesParsingTestCase(_TranslationTestCase): + expected_output = '' + + def get_xml(self): + return etree.fromstring(DXB.xml('')) + + def test_get_headings(self): + + styles = [ + DXB.style('heading 1', 'heading 1'), + ] + xml = DXB.styles_xml(styles) + styles_xml = etree.fromstring(xml) + styles_dict = get_style_dict(styles_xml) + self.assertEqual(styles_dict['heading 1']['header'], 'h2') + + +class MangledIlvlTestCase(_TranslationTestCase): + expected_output = ''' + +
      +
    1. AAA
    2. +
    +
      +
    1. BBB
    2. +
    +
      +
    1. CCC
    2. +
    + + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 2), + ('BBB', 1, 1), + ('CCC', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return etree.fromstring(xml) diff --git a/setup.py b/setup.py index 2d692ea..aad2ac4 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -import codecs +import os try: from setuptools import setup, find_packages @@ -10,11 +10,23 @@ use_setuptools() from setuptools import setup, find_packages # noqa -long_description = codecs.open("README.md", "r", "utf-8").read() +rel_file = lambda *args: os.path.join( + os.path.dirname( + os.path.abspath(__file__), + ), *args) + + +def get_file(filename): + with open(rel_file(filename)) as f: + return f.read() + + +def get_description(): + return get_file('README.md') + get_file('CHANGELOG') setup( name="docx2html", - version="0.0.1", + version="0.1.7", description="docx (OOXML) to html converter", author="Jason Ward", author_email="jason.louard.ward@gmail.com", @@ -24,16 +36,19 @@ packages=find_packages(), scripts=[], zip_safe=False, - install_requires=['lxml', 'pillow==1.7.7'], + install_requires=['lxml==3.1.0', 'pillow==2.0.0'], cmdclass={}, classifiers=[ "Development Status :: 3 - Alpha", "Programming Language :: Python", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.3", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Topic :: Text Processing :: Markup :: HTML", "Topic :: Text Processing :: Markup :: XML", ], - long_description=long_description, + long_description=get_description(), ) diff --git a/test_requirements.txt b/test_requirements.txt new file mode 100644 index 0000000..6d8e78c --- /dev/null +++ b/test_requirements.txt @@ -0,0 +1,3 @@ +nose +mock +Jinja2>=2.0