From dbdd85ac9028c5b54fb1806332509a3df0623eac Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 13:42:25 -0500 Subject: [PATCH 01/10] refs #1: added tests for making sure exceptions are being raised and fall back is being used. --- docx2html/tests/test_docx.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py index 259437f..00f1b9b 100644 --- a/docx2html/tests/test_docx.py +++ b/docx2html/tests/test_docx.py @@ -10,6 +10,9 @@ _get_document_data, DETECT_FONT_SIZE, ) +from docx2html.errors import ( + ConversionFailed, +) def assert_html_equal(actual_html, expected_html): @@ -693,3 +696,22 @@ def test_has_title(): ) actual_html = convert(file_path) assert_html_equal(actual_html, '''

Text

''') + + +def test_missing_converter(): + file_path = 'test.doc' + try: + convert(file_path) + except ConversionFailed: + pass + else: + raise AssertionError('ConversionFailed was not raised') + + +def test_fall_back(): + file_path = 'test.doc' + + def fall_back(*args, **kwargs): + return 'success' + html = convert(file_path, fall_back=fall_back) + assert html == 'success' From 58e437a694fb1eee61c3a8c5008cb11928c6a886 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 13:43:02 -0500 Subject: [PATCH 02/10] refs #1: added custom exceptions --- docx2html/errors.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 docx2html/errors.py diff --git a/docx2html/errors.py b/docx2html/errors.py new file mode 100644 index 0000000..e94a093 --- /dev/null +++ b/docx2html/errors.py @@ -0,0 +1,14 @@ +class Docx2HtmlException(Exception): + pass + + +class InvalidFileExtension(Docx2HtmlException): + pass + + +class ConversionFailed(Docx2HtmlException): + pass + + +class MissingConverter(Docx2HtmlException): + pass From 3981dfd51e156e8cad7f5b4a775649ea2cb78222 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 13:43:14 -0500 Subject: [PATCH 03/10] refs #1: refactor, used custom exceptions --- docx2html/core.py | 56 +++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/docx2html/core.py b/docx2html/core.py index 72f0370..5168110 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -8,29 +8,20 @@ from collections import namedtuple, defaultdict from zipfile import ZipFile, BadZipfile +from docx2html.errors import ( + ConversionFailed, + InvalidFileExtension, + MissingConverter, +) + DETECT_FONT_SIZE = False EMUS_PER_PIXEL = 9525 -# Abiword supported formats -VALID_EXTRACT_EXTENSIONS = [ - '.doc', '.docx', '.dotx', '.docm', '.dotm', '.wri', '.rtf', '.txt', - '.text', '.wpd', '.wp', '.odt', '.ott', '.abw', '.atw', '.pdf', '.html', - '.dot', -] ### # Help functions ### -def is_extractable(path): - """ - Determine if a file is something that we can extract. - """ - _, extension = os.path.splitext(path) - extension = extension.lower() - return (extension in VALID_EXTRACT_EXTENSIONS) - - def replace_ext(file_path, new_ext): """ >>> replace_ext('one/two/three.four.doc', '.html') @@ -1177,15 +1168,9 @@ def get_zip_file_handler(file_path): return ZipFile(file_path) -def convert(file_path, image_handler=None, fall_back=None): +def convert(file_path, image_handler=None, fall_back=None, converter=None): file_base, extension = os.path.splitext(os.path.basename(file_path)) - if not is_extractable(file_path): - #XXX create better exception, used to be InvalidFileExtension - raise Exception( - 'The file type "%s" is not supported' % extension - ) - if extension == '.html': with open(file_path) as f: html = f.read() @@ -1198,25 +1183,34 @@ def convert(file_path, image_handler=None, fall_back=None): # If the file is already html, just leave it in place. docx_path = file_path else: - # Convert the file to docx - # TODO make this configurable. - subprocess.call( - ['abiword', '--to=docx', '--to-name', docx_path, file_path], - ) + if converter is None: + def converter(file_path): + subprocess.call( + [ + 'abiword', + '--to=docx', + '--to-name', + docx_path, + file_path, + ], + ) + else: + raise MissingConverter( + 'pass in a converter for filetypes that are not docx.' + ) + try: # Docx files are actually just zip files. zf = get_zip_file_handler(docx_path) except BadZipfile: # If its a malformed zip file raise InvalidFileExtension - # XXX - raise Exception('This file is not a docx') + raise InvalidFileExtension('This file is not a docx') except IOError: # This means that the conversion from abiword failed. if fall_back is not None: return fall_back(file_path) else: - # XXX - raise Exception('Conversion to docx failed.') + raise ConversionFailed('Conversion to docx failed.') # Need to populate the xml based on word/document.xml tree, meta_data = _get_document_data(zf, image_handler) From 6ae04a3dc5546d65efcb414f5263449bd024a803 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 13:45:38 -0500 Subject: [PATCH 04/10] refs #1: removed MissingConverter, created a fall back in stead --- docx2html/core.py | 6 +----- docx2html/errors.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/docx2html/core.py b/docx2html/core.py index 5168110..1664171 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -11,7 +11,6 @@ from docx2html.errors import ( ConversionFailed, InvalidFileExtension, - MissingConverter, ) DETECT_FONT_SIZE = False @@ -1194,10 +1193,7 @@ def converter(file_path): file_path, ], ) - else: - raise MissingConverter( - 'pass in a converter for filetypes that are not docx.' - ) + converter(file_path) try: # Docx files are actually just zip files. diff --git a/docx2html/errors.py b/docx2html/errors.py index e94a093..b2fd09b 100644 --- a/docx2html/errors.py +++ b/docx2html/errors.py @@ -8,7 +8,3 @@ class InvalidFileExtension(Docx2HtmlException): class ConversionFailed(Docx2HtmlException): pass - - -class MissingConverter(Docx2HtmlException): - pass From dbce7ae48bae3fb4c7989e97456087a2d79cbfe6 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 16:01:48 -0500 Subject: [PATCH 05/10] Revert "backward compatibility for the win." This reverts commit e1e209454e6efe35eb5552239266b963c3fc9b90. --- docx2html/core.py | 1 - docx2html/tests/test_docx.py | 1 - 2 files changed, 2 deletions(-) diff --git a/docx2html/core.py b/docx2html/core.py index 0b7bf38..1664171 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1,4 +1,3 @@ -from __future__ import with_statement import cgi import os import subprocess diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py index 87535c6..00f1b9b 100644 --- a/docx2html/tests/test_docx.py +++ b/docx2html/tests/test_docx.py @@ -1,4 +1,3 @@ -from __future__ import with_statement import tempfile import shutil from os import path From 42e832500c688c82834220650f4a18df10bf7e91 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 16:02:06 -0500 Subject: [PATCH 06/10] refs #1: dropped support for python 2.5 --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4a31043..e969c4f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.5" - "2.6" - "2.7" script: ./run_tests.sh From cece70c70942d3622708770036910331051d3dce Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Fri, 11 Jan 2013 17:13:50 -0500 Subject: [PATCH 07/10] refs #1: put in dummy converters in the test --- docx2html/tests/test_docx.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py index 00f1b9b..58b29f1 100644 --- a/docx2html/tests/test_docx.py +++ b/docx2html/tests/test_docx.py @@ -698,10 +698,16 @@ def test_has_title(): assert_html_equal(actual_html, '''

Text

''') -def test_missing_converter(): +def _converter(*args, **kwargs): + # Having a converter that does nothing is the same as if abiword fails to + # convert. + pass + + +def test_converter_broken(): file_path = 'test.doc' try: - convert(file_path) + convert(file_path, converter=_converter) except ConversionFailed: pass else: @@ -713,5 +719,5 @@ def test_fall_back(): def fall_back(*args, **kwargs): return 'success' - html = convert(file_path, fall_back=fall_back) + html = convert(file_path, fall_back=fall_back, converter=_converter) assert html == 'success' From cab9512a0ec67f1c825a04b3f1df8b3e586e27f3 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 14 Jan 2013 12:09:33 -0500 Subject: [PATCH 08/10] refs #1: name change --- docx2html/{errors.py => exceptions.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docx2html/{errors.py => exceptions.py} (100%) diff --git a/docx2html/errors.py b/docx2html/exceptions.py similarity index 100% rename from docx2html/errors.py rename to docx2html/exceptions.py From 3623e0000a20ec99bd4bbd11a7640a9151481606 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 14 Jan 2013 12:09:51 -0500 Subject: [PATCH 09/10] refs #1: made changes based on review notes --- docx2html/converters.py | 17 +++++++++++++++++ docx2html/core.py | 34 ++++++++++++---------------------- docx2html/exceptions.py | 8 ++++++-- docx2html/tests/test_docx.py | 13 ++++++------- 4 files changed, 41 insertions(+), 31 deletions(-) create mode 100644 docx2html/converters.py diff --git a/docx2html/converters.py b/docx2html/converters.py new file mode 100644 index 0000000..36bc283 --- /dev/null +++ b/docx2html/converters.py @@ -0,0 +1,17 @@ +import subprocess + + +def convert_with_abiword(docx_path, file_path): + """ + This will convert ``file_path`` to docx and place the converted file at + ``docx_path`` + """ + subprocess.call( + [ + 'abiword', + '--to=docx', + '--to-name', + docx_path, + file_path, + ], + ) diff --git a/docx2html/core.py b/docx2html/core.py index 1664171..1c5d8ca 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1,6 +1,6 @@ import cgi import os -import subprocess +import os.path from PIL import Image from lxml import etree from lxml.etree import XMLSyntaxError @@ -8,9 +8,10 @@ from collections import namedtuple, defaultdict from zipfile import ZipFile, BadZipfile -from docx2html.errors import ( +from docx2html.exceptions import ( ConversionFailed, - InvalidFileExtension, + FileNotDocx, + MalformedDocx, ) DETECT_FONT_SIZE = False @@ -1183,30 +1184,19 @@ def convert(file_path, image_handler=None, fall_back=None, converter=None): docx_path = file_path else: if converter is None: - def converter(file_path): - subprocess.call( - [ - 'abiword', - '--to=docx', - '--to-name', - docx_path, - file_path, - ], - ) - converter(file_path) + raise FileNotDocx('The file passed in is not a docx.') + converter(docx_path, file_path) + if not os.path.isfile(docx_path): + if fall_back is None: + raise ConversionFailed('Conversion to docx failed.') + else: + return fall_back(file_path) try: # Docx files are actually just zip files. zf = get_zip_file_handler(docx_path) except BadZipfile: - # If its a malformed zip file raise InvalidFileExtension - raise InvalidFileExtension('This file is not a docx') - except IOError: - # This means that the conversion from abiword failed. - if fall_back is not None: - return fall_back(file_path) - else: - raise ConversionFailed('Conversion to docx failed.') + raise MalformedDocx('This file is not a docx') # Need to populate the xml based on word/document.xml tree, meta_data = _get_document_data(zf, image_handler) diff --git a/docx2html/exceptions.py b/docx2html/exceptions.py index b2fd09b..c8e01bf 100644 --- a/docx2html/exceptions.py +++ b/docx2html/exceptions.py @@ -2,9 +2,13 @@ class Docx2HtmlException(Exception): pass -class InvalidFileExtension(Docx2HtmlException): +class ConversionFailed(Docx2HtmlException): pass -class ConversionFailed(Docx2HtmlException): +class FileNotDocx(Docx2HtmlException): + pass + + +class MalformedDocx(Docx2HtmlException): pass diff --git a/docx2html/tests/test_docx.py b/docx2html/tests/test_docx.py index 58b29f1..6591ba4 100644 --- a/docx2html/tests/test_docx.py +++ b/docx2html/tests/test_docx.py @@ -3,6 +3,7 @@ from os import path from zipfile import ZipFile from nose.plugins.skip import SkipTest +from nose.tools import assert_raises from docx2html.tests import collapse_html from docx2html import convert @@ -10,7 +11,7 @@ _get_document_data, DETECT_FONT_SIZE, ) -from docx2html.errors import ( +from docx2html.exceptions import ( ConversionFailed, ) @@ -706,12 +707,10 @@ def _converter(*args, **kwargs): def test_converter_broken(): file_path = 'test.doc' - try: - convert(file_path, converter=_converter) - except ConversionFailed: - pass - else: - raise AssertionError('ConversionFailed was not raised') + assert_raises( + ConversionFailed, + lambda: convert(file_path, converter=_converter), + ) def test_fall_back(): From 57a8086065a4d46539c827ad56b6b332c6f4eb4b Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 14 Jan 2013 12:14:11 -0500 Subject: [PATCH 10/10] refs #1: added a docstring --- docx2html/core.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docx2html/core.py b/docx2html/core.py index 1c5d8ca..a09803a 100644 --- a/docx2html/core.py +++ b/docx2html/core.py @@ -1169,6 +1169,19 @@ def get_zip_file_handler(file_path): def convert(file_path, image_handler=None, fall_back=None, converter=None): + """ + ``file_path`` is a path to the file on the file system that you want to be + converted to html. + ``image_handler`` is a function that takes an image_id and a + relationship_dict to generate the src attribute for images. (see readme + for more details) + ``fall_back`` is a function that takes a ``file_path``. This function will + only be called if for whatever reason the conversion fails. + ``converter`` is a function to convert a document that is not docx to docx + (examples in docx2html.converters) + + Returns html extracted from ``file_path`` + """ file_base, extension = os.path.splitext(os.path.basename(file_path)) if extension == '.html':