diff --git a/.travis.yml b/.travis.yml index 4a31043..e969c4f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.5" - "2.6" - "2.7" script: ./run_tests.sh diff --git a/docx2html/converters.py b/docx2html/converters.py new file mode 100644 index 0000000..36bc283 --- /dev/null +++ b/docx2html/converters.py @@ -0,0 +1,17 @@ +import subprocess + + +def convert_with_abiword(docx_path, file_path): + """ + This will convert ``file_path`` to docx and place the converted file at + ``docx_path`` + """ + subprocess.call( + [ + 'abiword', + '--to=docx', + '--to-name', + docx_path, + file_path, + ], + ) diff --git a/docx2html/core.py b/docx2html/core.py index e4752b2..a09803a 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1,7 +1,6 @@ -from __future__ import with_statement import cgi import os -import subprocess +import os.path from PIL import Image from lxml import etree from lxml.etree import XMLSyntaxError @@ -9,29 +8,20 @@ from collections import namedtuple, defaultdict from zipfile import ZipFile, BadZipfile +from docx2html.exceptions import ( + ConversionFailed, + FileNotDocx, + MalformedDocx, +) + DETECT_FONT_SIZE = False EMUS_PER_PIXEL = 9525 -# Abiword supported formats -VALID_EXTRACT_EXTENSIONS = [ - '.doc', '.docx', '.dotx', '.docm', '.dotm', '.wri', '.rtf', '.txt', - '.text', '.wpd', '.wp', '.odt', '.ott', '.abw', '.atw', '.pdf', '.html', - '.dot', -] ### # Help functions ### -def is_extractable(path): - """ - Determine if a file is something that we can extract. - """ - _, extension = os.path.splitext(path) - extension = extension.lower() - return (extension in VALID_EXTRACT_EXTENSIONS) - - def replace_ext(file_path, new_ext): """ >>> replace_ext('one/two/three.four.doc', '.html') @@ -1178,15 +1168,22 @@ def get_zip_file_handler(file_path): return ZipFile(file_path) -def convert(file_path, image_handler=None, fall_back=None): +def convert(file_path, image_handler=None, fall_back=None, converter=None): + """ + ``file_path`` is a path to the file on the file system that you want to be + converted to html. + ``image_handler`` is a function that takes an image_id and a + relationship_dict to generate the src attribute for images. (see readme + for more details) + ``fall_back`` is a function that takes a ``file_path``. This function will + only be called if for whatever reason the conversion fails. + ``converter`` is a function to convert a document that is not docx to docx + (examples in docx2html.converters) + + Returns html extracted from ``file_path`` + """ file_base, extension = os.path.splitext(os.path.basename(file_path)) - if not is_extractable(file_path): - #XXX create better exception, used to be InvalidFileExtension - raise Exception( - 'The file type "%s" is not supported' % extension - ) - if extension == '.html': with open(file_path) as f: html = f.read() @@ -1199,25 +1196,20 @@ def convert(file_path, image_handler=None, fall_back=None): # If the file is already html, just leave it in place. docx_path = file_path else: - # Convert the file to docx - # TODO make this configurable. - subprocess.call( - ['abiword', '--to=docx', '--to-name', docx_path, file_path], - ) + if converter is None: + raise FileNotDocx('The file passed in is not a docx.') + converter(docx_path, file_path) + if not os.path.isfile(docx_path): + if fall_back is None: + raise ConversionFailed('Conversion to docx failed.') + else: + return fall_back(file_path) + try: # Docx files are actually just zip files. zf = get_zip_file_handler(docx_path) except BadZipfile: - # If its a malformed zip file raise InvalidFileExtension - # XXX - raise Exception('This file is not a docx') - except IOError: - # This means that the conversion from abiword failed. - if fall_back is not None: - return fall_back(file_path) - else: - # XXX - raise Exception('Conversion to docx failed.') + raise MalformedDocx('This file is not a docx') # Need to populate the xml based on word/document.xml tree, meta_data = _get_document_data(zf, image_handler) diff --git a/docx2html/exceptions.py b/docx2html/exceptions.py new file mode 100644 index 0000000..c8e01bf --- /dev/null +++ b/docx2html/exceptions.py @@ -0,0 +1,14 @@ +class Docx2HtmlException(Exception): + pass + + +class ConversionFailed(Docx2HtmlException): + pass + + +class FileNotDocx(Docx2HtmlException): + pass + + +class MalformedDocx(Docx2HtmlException): + pass diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py index 81639b0..6591ba4 100644 --- a/docx2html/tests/test_docx.py +++ b/docx2html/tests/test_docx.py @@ -1,9 +1,9 @@ -from __future__ import with_statement import tempfile import shutil from os import path from zipfile import ZipFile from nose.plugins.skip import SkipTest +from nose.tools import assert_raises from docx2html.tests import collapse_html from docx2html import convert @@ -11,6 +11,9 @@ _get_document_data, DETECT_FONT_SIZE, ) +from docx2html.exceptions import ( + ConversionFailed, +) def assert_html_equal(actual_html, expected_html): @@ -694,3 +697,26 @@ def test_has_title(): ) actual_html = convert(file_path) assert_html_equal(actual_html, '''
Text
''') + + +def _converter(*args, **kwargs): + # Having a converter that does nothing is the same as if abiword fails to + # convert. + pass + + +def test_converter_broken(): + file_path = 'test.doc' + assert_raises( + ConversionFailed, + lambda: convert(file_path, converter=_converter), + ) + + +def test_fall_back(): + file_path = 'test.doc' + + def fall_back(*args, **kwargs): + return 'success' + html = convert(file_path, fall_back=fall_back, converter=_converter) + assert html == 'success'