diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3d00690 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.idea +*.pyc +build +dist +upload.sh diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f815681 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,11 @@ +language: python +python: + - "2.7" + - "3.3" + - "3.4" + - "3.5" +# command to install dependencies +install: + - python setup.py -q install +# command to run tests +script: cd tests && python test-hello.py \ No newline at end of file diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..b42dce8 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,7 @@ +# file GENERATED by distutils, do NOT edit +LICENSE.txt +setup.cfg +setup.py +bin/docx2txt +docxpy/__init__.py +docxpy/docxreader.py diff --git a/README.md b/README.md deleted file mode 100644 index d1b360b..0000000 --- a/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# python-docx2txt # - -A pure python-based utility to extract text from docx files. - -The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). It can however also extract text from header, footer and hyperlinks. __It can now also extract images.__ - -## How to install? ## -```bash -pip install docx2txt -``` - -## How to run? ## - -a. From command line: -```bash -# extract text -docx2txt file.docx -# extract text and images -docx2txt -i /tmp/img_dir file.docx -``` -b. From python: -```python -import docx2txt - -# extract text -text = docx2txt.process("file.docx") - -# extract text and write images in /tmp/img_dir -text = docx2txt.process("file.docx", "/tmp/img_dir") -``` diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..63d3eed --- /dev/null +++ b/README.rst @@ -0,0 +1,57 @@ +docxpy +====== + +|image0| |PyPI| + +This project is forked from +`ankushshah89/python-docx2txt <https://github.com/ankushshah89/python-docx2txt>`__. +A new feature is added: extract the hyperlinks and its corresponding +texts. 
+ +It is a pure python-based utility to extract text from docx files. The +code is taken and adapted from +`python-docx <https://github.com/python-openxml/python-docx>`__. It can +however also extract **text** from header, footer and **hyperlinks**. It +can now also extract **images**. + +How to install? +--------------- + +.. code:: bash + + pip install docxpy + +How to run? +----------- + +a. From command line: + +.. code:: bash + + # extract text + docx2txt file.docx + # extract text and images + docx2txt -i /tmp/img_dir file.docx + +b. From python: + +.. code:: python + + import docxpy + + file = 'file.docx' + + # extract text + text = docxpy.process(file) + + # extract text and write images in /tmp/img_dir + text = docxpy.process(file, "/tmp/img_dir") + + + # if you want the hyperlinks + doc = docxpy.DOCReader(file) + doc.process() # process file + hyperlinks = doc.data['links'] + +.. |image0| image:: https://travis-ci.org/badbye/docxpy.svg?branch=master +.. |PyPI| image:: https://img.shields.io/pypi/pyversions/scrapy-corenlp.svg?style=flat-square diff --git a/bin/docx2txt b/bin/docx2txt index 62157c2..6a4ba27 100755 --- a/bin/docx2txt +++ b/bin/docx2txt @@ -1,9 +1,10 @@ #! /usr/bin/env python -import docx2txt +import docxpy + if __name__ == '__main__': - import sys - args = docx2txt.process_args() - text = docx2txt.process(args.docx, args.img_dir) - sys.stdout.write(text.encode('utf-8')) + args = docxpy.process_args() + text = docxpy.process(args.docx, args.img_dir) + print(text.encode('utf-8')) + diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py deleted file mode 100644 index 778804c..0000000 --- a/docx2txt/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .docx2txt import process -from .docx2txt import process_args - -VERSION = '0.6' diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py deleted file mode 100755 index d48f9e5..0000000 --- a/docx2txt/docx2txt.py +++ /dev/null @@ -1,113 +0,0 @@ -#! 
/usr/bin/env python - -import argparse -import re -import xml.etree.ElementTree as ET -import zipfile -import os -import sys - - -nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} - - -def process_args(): - parser = argparse.ArgumentParser(description='A pure python-based utility ' - 'to extract text and images ' - 'from docx files.') - parser.add_argument("docx", help="path of the docx file") - parser.add_argument('-i', '--img_dir', help='path of directory ' - 'to extract images') - - args = parser.parse_args() - - if not os.path.exists(args.docx): - print('File {} does not exist.'.format(args.docx)) - sys.exit(1) - - if args.img_dir is not None: - if not os.path.exists(args.img_dir): - try: - os.makedirs(args.img_dir) - except OSError: - print("Unable to create img_dir {}".format(args.img_dir)) - sys.exit(1) - return args - - -def qn(tag): - """ - Stands for 'qualified name', a utility function to turn a namespace - prefixed tag name into a Clark-notation qualified tag name for lxml. For - example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. - Source: https://github.com/python-openxml/python-docx/ - """ - prefix, tagroot = tag.split(':') - uri = nsmap[prefix] - return '{{{}}}{}'.format(uri, tagroot) - - -def xml2text(xml): - """ - A string representing the textual content of this run, with content - child elements like ```` translated to their Python - equivalent. 
- Adapted from: https://github.com/python-openxml/python-docx/ - """ - text = u'' - root = ET.fromstring(xml) - for child in root.iter(): - if child.tag == qn('w:t'): - t_text = child.text - text += t_text if t_text is not None else '' - elif child.tag == qn('w:tab'): - text += '\t' - elif child.tag in (qn('w:br'), qn('w:cr')): - text += '\n' - elif child.tag == qn("w:p"): - text += '\n\n' - return text - - -def process(docx, img_dir=None): - text = u'' - - # unzip the docx in memory - zipf = zipfile.ZipFile(docx) - filelist = zipf.namelist() - - # get header text - # there can be 3 header files in the zip - header_xmls = 'word/header[0-9]*.xml' - for fname in filelist: - if re.match(header_xmls, fname): - text += xml2text(zipf.read(fname)) - - # get main text - doc_xml = 'word/document.xml' - text += xml2text(zipf.read(doc_xml)) - - # get footer text - # there can be 3 footer files in the zip - footer_xmls = 'word/footer[0-9]*.xml' - for fname in filelist: - if re.match(footer_xmls, fname): - text += xml2text(zipf.read(fname)) - - if img_dir is not None: - # extract images - for fname in filelist: - _, extension = os.path.splitext(fname) - if extension in [".jpg", ".jpeg", ".png", ".bmp"]: - dst_fname = os.path.join(img_dir, os.path.basename(fname)) - with open(dst_fname, "w") as dst_f: - dst_f.write(zipf.read(fname)) - - zipf.close() - return text.strip() - - -if __name__ == '__main__': - args = process_args() - text = process(args.docx, args.img_dir) - sys.stdout.write(text.encode('utf-8')) diff --git a/docxpy/__init__.py b/docxpy/__init__.py new file mode 100644 index 0000000..9b080dd --- /dev/null +++ b/docxpy/__init__.py @@ -0,0 +1,5 @@ +from .docxreader import process +from .docxreader import process_args +from .docxreader import DOCReader + +VERSION = '0.8.2' diff --git a/docxpy/docxreader.py b/docxpy/docxreader.py new file mode 100755 index 0000000..3eac2e0 --- /dev/null +++ b/docxpy/docxreader.py @@ -0,0 +1,140 @@ +#! 
/usr/bin/env python + +import argparse +import xml.etree.ElementTree as ET +import zipfile +import os +import sys +import re + + +def process_args(): + parser = argparse.ArgumentParser(description='A pure python-based utility ' + 'to extract text and images ' + 'from docx files.') + parser.add_argument("docx", help="path of the docx file") + parser.add_argument('-i', '--img_dir', help='path of directory ' + 'to extract images') + + args = parser.parse_args() + + if not os.path.exists(args.docx): + print('File {} does not exist.'.format(args.docx)) + sys.exit(1) + + if args.img_dir is not None: + if not os.path.exists(args.img_dir): + try: + os.makedirs(args.img_dir) + except OSError: + print("Unable to create img_dir {}".format(args.img_dir)) + sys.exit(1) + return args + + +def qn(tag): + """ + Stands for 'qualified name', a utility function to turn a namespace + prefixed tag name into a Clark-notation qualified tag name for lxml. For + example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``. 
+ Source: https://github.com/python-openxml/python-docx/ + """ + nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} + prefix, tagroot = tag.split(':') + uri = nsmap[prefix] + return '{{{}}}{}'.format(uri, tagroot) + + +class DOCReader(object): + def __init__(self, docx, img_dir=None): + if not os.path.exists(docx): + raise Exception('Can not file document: %s' % docx) + self.file = docx + self.img_dir = img_dir + self.data = {'links': []} # save header, footer, document, links + self.links = {} + + # read file + self.zipf = zipfile.ZipFile(self.file) + self.filelist = self.zipf.namelist() + + # parse hyperlinks + hyperlink_document = 'word/_rels/document.xml.rels' + if hyperlink_document in self.filelist: + self.process_hyperlink(self.zipf.read(hyperlink_document)) + + def process_hyperlink(self, doc): + """ + external hyperlink from a string of xml document(typically the `word/_rels/document.xml.rels` file) + """ + root = ET.fromstring(doc) + nodes = [node.attrib for node in root] + nodes = filter(lambda x: x.get('TargetMode', '') == 'External', nodes) + self.links = {node['Id']: node['Target'] for node in nodes} + + def xml2text(self, xml): + """ + A string representing the textual content of this run, with content + child elements like ```` translated to their Python + equivalent. 
+ Adapted from: https://github.com/python-openxml/python-docx/ + """ + text = u'' + root = ET.fromstring(xml) + for child in root.iter(): + attr = child.attrib + for k, v in attr.items(): + if k.endswith('id') and v in self.links: + self.data['links'].append((ET.tostring(child, encoding='utf-8', method='text'), self.links[v])) + if child.tag == qn('w:t'): + t_text = child.text + text += t_text if t_text is not None else '' + elif child.tag == qn('w:tab'): + text += '\t' + elif child.tag in (qn('w:br'), qn('w:cr')): + text += '\n' + elif child.tag == qn("w:p"): + text += '\n\n' + return text + + def process(self): + text = u'' + # get header text + # there can be 3 header files in the zip + header_xmls = re.compile('word/header[0-9]*.xml') + self.data['header'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if header_xmls.match(fname)] + text += '\n'.join(self.data['header']) + + # get main text + doc_xml = 'word/document.xml' + self.data['document'] = self.xml2text(self.zipf.read(doc_xml)) + text += self.data['document'] + + # get footer text + # there can be 3 footer files in the zip + footer_xmls = re.compile('word/footer[0-9]*.xml') + self.data['footer'] = [self.xml2text(self.zipf.read(fname)) for fname in self.filelist if footer_xmls.match(fname)] + text += '\n'.join(self.data['footer']) + + if self.img_dir is not None: + # extract images + for fname in self.filelist: + _, extension = os.path.splitext(fname) + if extension in [".jpg", ".jpeg", ".png", ".bmp"]: + dst_fname = os.path.join(self.img_dir, os.path.basename(fname)) + with open(dst_fname, "w") as dst_f: + dst_f.write(self.zipf.read(fname)) + self.zipf.close() + return text.strip() + + +def process(docx, img_dir=None): + obj = DOCReader(docx, img_dir=img_dir) + res = obj.process() + return res + + +if __name__ == '__main__': + args = process_args() + text = process(args.docx, args.img_dir) + print(text.encode('utf-8')) diff --git a/setup.py b/setup.py index f0c5c10..2cb5490 100644 
--- a/setup.py +++ b/setup.py @@ -1,20 +1,34 @@ +import os import glob from distutils.core import setup +from docxpy import VERSION # get all of the scripts scripts = glob.glob('bin/*') + +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + + setup( - name='docx2txt', - packages=['docx2txt'], - version='0.6', - description='A pure python-based utility to extract text and images ' + name='docxpy', + packages=['docxpy'], + version=VERSION, + description='A pure python-based utility to extract text, hyperlinks and images' 'from docx files.', - author='Ankush Shah', - author_email='ankush.shah.nitk@gmail.com', - url='https://github.com/ankushshah89/python-docx2txt', - download_url='https://github.com/ankushshah89/python-docx2txt/tarball/0.6', - keywords=['python', 'docx', 'text', 'images', 'extract'], - scripts=scripts, - classifiers=[], + long_description=open("README.rst").read(), + author='Ankush Shah, Yalei Du', + author_email='yaleidu@163.com', + url='https://github.com/badbye/docxpy', + keywords=['python', 'docx', 'text', 'links', 'images', 'extract'], + scripts=scripts, + test_suite='nose.collector', + tests_require=['nose'], + classifiers=[ + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5" + ] ) diff --git a/tests/Hello.docx b/tests/Hello.docx new file mode 100644 index 0000000..3b2f14a Binary files /dev/null and b/tests/Hello.docx differ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test-hello.py b/tests/test-hello.py new file mode 100644 index 0000000..80c81e0 --- /dev/null +++ b/tests/test-hello.py @@ -0,0 +1,26 @@ +import unittest +from docxpy import DOCReader + + +class Test(unittest.TestCase): + def setUp(self): + self.file = DOCReader('Hello.docx') + self.file.process() + + def test_file_data(self): + 
self.assertIsInstance(self.file.data, dict) + self.assertTrue('header' in self.file.data) + self.assertTrue('footer' in self.file.data) + self.assertTrue('document' in self.file.data) + + def test_hyperlinks(self): + links = self.file.data['links'] + self.assertEqual(links, [('This is a hyperlink.'.encode('utf-8'), 'https://www.google.com/')]) + + def test_text(self): + text = self.file.data['document'].replace('\n', '') + self.assertEqual(text, 'TitleThis is a hyperlink.') + + +if __name__ == '__main__': + unittest.main()