diff --git a/docx/__init__.py b/docx/__init__.py index 59756c02..5b85616d 100644 --- a/docx/__init__.py +++ b/docx/__init__.py @@ -10,6 +10,7 @@ from docx.opc.constants import CONTENT_TYPE as CT, RELATIONSHIP_TYPE as RT from docx.opc.part import PartFactory from docx.opc.parts.coreprops import CorePropertiesPart +from docx.opc.parts.customprops import CustomPropertiesPart from docx.parts.document import DocumentPart from docx.parts.hdrftr import FooterPart, HeaderPart @@ -27,6 +28,7 @@ def part_class_selector(content_type, reltype): PartFactory.part_class_selector = part_class_selector PartFactory.part_type_for[CT.OPC_CORE_PROPERTIES] = CorePropertiesPart +PartFactory.part_type_for[CT.OPC_CUSTOM_PROPERTIES] = CustomPropertiesPart PartFactory.part_type_for[CT.WML_DOCUMENT_MAIN] = DocumentPart PartFactory.part_type_for[CT.WML_FOOTER] = FooterPart PartFactory.part_type_for[CT.WML_HEADER] = HeaderPart diff --git a/docx/document.py b/docx/document.py index 6493c458..bb42cb84 100644 --- a/docx/document.py +++ b/docx/document.py @@ -101,6 +101,14 @@ def core_properties(self): """ return self._part.core_properties + @property + def custom_properties(self): + """ + A |CustomProperties| object providing read/write access to the custom + properties of this document. + """ + return self._part.custom_properties + @property def inline_shapes(self): """ diff --git a/docx/opc/constants.py b/docx/opc/constants.py index b90aa394..1bcf16f6 100644 --- a/docx/opc/constants.py +++ b/docx/opc/constants.py @@ -77,6 +77,9 @@ class CONTENT_TYPE(object): OPC_CORE_PROPERTIES = ( 'application/vnd.openxmlformats-package.core-properties+xml' ) + OPC_CUSTOM_PROPERTIES = ( + 'application/vnd.openxmlformats-officedocument.custom-properties+xml' + ) OPC_DIGITAL_SIGNATURE_CERTIFICATE = ( 'application/vnd.openxmlformats-package.digital-signature-certificat' 'e' diff --git a/docx/opc/customprops.py b/docx/opc/customprops.py new file mode 100644 index 00000000..95e3fb7b --- /dev/null +++ b/docx/opc/customprops.py @@ -0,0 +1,71 @@ +# encoding: utf-8 + +""" +The :mod:`pptx.packaging` module coheres around the concerns of reading and +writing presentations to and from a .pptx file. +""" + +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) + +import numbers +from lxml import etree + +NS_VT = "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes" + +class CustomProperties(object): + """ + Corresponds to part named ``/docProps/custom.xml``, containing the custom + document properties for this document package. + """ + def __init__(self, element): + self._element = element + + def __getitem__( self, item ): + prop = self.lookup(item) + if prop is not None : + # print(etree.tostring(prop, pretty_print=True)) + elm = prop[0] + if elm.tag == '{%s}i4' % NS_VT: + try: + return int(elm.text) + except: + return elm.text + elif elm.tag == '{%s}bool' % NS_VT: + return True if elm.text == '1' else False + return elm.text + + def __setitem__( self, key, value ): + prop = self.lookup(key) + if prop is None : + elmType = 'lpwstr' + if isinstance(value, bool): + elmType = 'bool' + value = str(1 if value else 0) + elif isinstance(value, numbers.Number): + elmType = 'i4' + value = str(int(value)) + prop = etree.SubElement( self._element, "property" ) + elm = etree.SubElement(prop, '{%s}%s' %(NS_VT, elmType), nsmap = {'vt':NS_VT} ) + elm.text = value + prop.set("name", key) + prop.set("fmtid", "{D5CDD505-2E9C-101B-9397-08002B2CF9AE}") + prop.set("pid", "%s" % str(len(self._element) + 1)) + else: + elm = prop[0] + if elm.tag == '{%s}i4' % NS_VT: + elm.text = str(int(value)) + elif elm.tag == '{%s}bool' % NS_VT: + elm.text = str(1 if value else 0) + else: + elm.text = '%s' % str(value) + + def __len__( self ): + return len(self._element) + + def lookup(self, item): + for child in self._element : + if child.get("name") == item : + return child + return None \ No newline at end of file diff --git a/docx/opc/package.py b/docx/opc/package.py index 7ba87bab..bc51893b 100644 --- a/docx/opc/package.py +++ b/docx/opc/package.py @@ -8,6 +8,7 @@ from docx.opc.packuri import PACKAGE_URI, PackURI from docx.opc.part import PartFactory from docx.opc.parts.coreprops import CorePropertiesPart +from docx.opc.parts.customprops import CustomPropertiesPart from docx.opc.pkgreader import PackageReader from docx.opc.pkgwriter import PackageWriter from docx.opc.rel import Relationships @@ -41,6 +42,14 @@ def core_properties(self): """ return self._core_properties_part.core_properties + @property + def custom_properties(self): + """ + |CustomProperties| object providing read/write access to the Dublin + Core properties for this document. + """ + return self._custom_properties_part.custom_properties + def iter_rels(self): """ Generate exactly one reference to each relationship in the package by @@ -184,6 +193,19 @@ def _core_properties_part(self): self.relate_to(core_properties_part, RT.CORE_PROPERTIES) return core_properties_part + @property + def _custom_properties_part(self): + """ + |CustomPropertiesPart| object related to this package. Creates + a default custom properties part if one is not present (not common). + """ + try: + return self.part_related_by(RT.CUSTOM_PROPERTIES) + except KeyError: + custom_properties_part = CustomPropertiesPart.default(self) + self.relate_to(custom_properties_part, RT.CUSTOM_PROPERTIES) + return custom_properties_part + class Unmarshaller(object): """Hosts static methods for unmarshalling a package from a |PackageReader|.""" diff --git a/docx/opc/parts/customprops.py b/docx/opc/parts/customprops.py new file mode 100644 index 00000000..e6ec3616 --- /dev/null +++ b/docx/opc/parts/customprops.py @@ -0,0 +1,74 @@ +# encoding: utf-8 + +""" +Custom properties part, corresponds to ``/docProps/custom.xml`` part in package. +""" + +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) + +from lxml import etree + +from datetime import datetime + +from ..constants import CONTENT_TYPE as CT +from ..customprops import CustomProperties +from ...oxml.customprops import CT_CustomProperties +from ..packuri import PackURI +from ..part import XmlPart + +# configure XML parser +parser_lookup = etree.ElementDefaultClassLookup(element=CT_CustomProperties) +ct_parser = etree.XMLParser(remove_blank_text=True) +ct_parser.set_element_class_lookup(parser_lookup) + + +def ct_parse_xml(xml): + """ + Return root lxml element obtained by parsing XML character string in + *xml*, which can be either a Python 2.x string or unicode. The custom + parser is used, so custom element classes are produced for elements in + *xml* that have them. + """ + root_element = etree.fromstring(xml, ct_parser) + return root_element + + + +class CustomPropertiesPart(XmlPart): + """ + Corresponds to part named ``/docProps/custom.xml``, containing the custom + document properties for this document package. + """ + @classmethod + def default(cls, package): + """ + Return a new |CustomPropertiesPart| object initialized with default + values for its base properties. + """ + custom_properties_part = cls._new(package) + custom_properties = custom_properties_part.custom_properties + return custom_properties_part + + @property + def custom_properties(self): + """ + A |CustomProperties| object providing read/write access to the custom + properties contained in this custom properties part. + """ + return CustomProperties(self.element) + + @classmethod + def load(cls, partname, content_type, blob, package): + element = ct_parse_xml(blob) + return cls(partname, content_type, element, package) + + @classmethod + def _new(cls, package): + partname = PackURI('/docProps/custom.xml') + content_type = CT.OPC_CUSTOM_PROPERTIES + customProperties = CT_CustomProperties.new() + return CustomPropertiesPart( + partname, content_type, customProperties, package + ) diff --git a/docx/oxml/__init__.py b/docx/oxml/__init__.py index 093c1b45..d7680cbb 100644 --- a/docx/oxml/__init__.py +++ b/docx/oxml/__init__.py @@ -39,6 +39,14 @@ def register_element_cls(tag, cls): namespace = element_class_lookup.get_namespace(nsmap[nspfx]) namespace[tagroot] = cls +def register_element_cls_ns(tag, ns, cls): + """ + Register *cls* to be constructed when the oxml parser encounters an + element with matching *tag*. *tag* is a string of the form + ``nspfx:tagroot``, e.g. ``'w:document'``. + """ + namespace = element_class_lookup.get_namespace(ns) + namespace[tag] = cls def OxmlElement(nsptag_str, attrs=None, nsdecls=None): """ @@ -72,7 +80,10 @@ def OxmlElement(nsptag_str, attrs=None, nsdecls=None): from .coreprops import CT_CoreProperties # noqa register_element_cls('cp:coreProperties', CT_CoreProperties) -from .document import CT_Body, CT_Document # noqa +from .customprops import CT_CustomProperties +register_element_cls_ns('Properties', 'http://schemas.openxmlformats.org/officeDocument/2006/custom-properties', CT_CustomProperties) + +from .document import CT_Body, CT_Document register_element_cls('w:body', CT_Body) register_element_cls('w:document', CT_Document) diff --git a/docx/oxml/customprops.py b/docx/oxml/customprops.py new file mode 100644 index 00000000..3be69e4d --- /dev/null +++ b/docx/oxml/customprops.py @@ -0,0 +1,156 @@ +# encoding: utf-8 + +""" +lxml custom element classes for core properties-related XML elements. +""" + +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) + +import re + +from datetime import datetime, timedelta +from lxml import etree +from .ns import nsdecls, qn +from .xmlchemy import BaseOxmlElement, ZeroOrOne +from . import parse_xml + +class CT_CustomProperties(BaseOxmlElement): + """ + ```` element, the root element of the Custom Properties + part stored as ``/docProps/custom.xml``. String elements are + limited in length to 255 unicode characters. + """ + + _customProperties_tmpl = ( + '\n' % nsdecls('vt') + ) + + @classmethod + def new(cls): + """ + Return a new ```` element + """ + xml = cls._customProperties_tmpl + customProperties = parse_xml(xml) + return customProperties + + def _datetime_of_element(self, property_name): + element = getattr(self, property_name) + if element is None: + return None + datetime_str = element.text + try: + return self._parse_W3CDTF_to_datetime(datetime_str) + except ValueError: + # invalid datetime strings are ignored + return None + + def _get_or_add(self, prop_name): + """ + Return element returned by 'get_or_add_' method for *prop_name*. + """ + get_or_add_method_name = 'get_or_add_%s' % prop_name + get_or_add_method = getattr(self, get_or_add_method_name) + element = get_or_add_method() + return element + + @classmethod + def _offset_dt(cls, dt, offset_str): + """ + Return a |datetime| instance that is offset from datetime *dt* by + the timezone offset specified in *offset_str*, a string like + ``'-07:00'``. + """ + match = cls._offset_pattern.match(offset_str) + if match is None: + raise ValueError( + "'%s' is not a valid offset string" % offset_str + ) + sign, hours_str, minutes_str = match.groups() + sign_factor = -1 if sign == '+' else 1 + hours = int(hours_str) * sign_factor + minutes = int(minutes_str) * sign_factor + td = timedelta(hours=hours, minutes=minutes) + return dt + td + + _offset_pattern = re.compile('([+-])(\d\d):(\d\d)') + + @classmethod + def _parse_W3CDTF_to_datetime(cls, w3cdtf_str): + # valid W3CDTF date cases: + # yyyy e.g. '2003' + # yyyy-mm e.g. '2003-12' + # yyyy-mm-dd e.g. '2003-12-31' + # UTC timezone e.g. '2003-12-31T10:14:55Z' + # numeric timezone e.g. '2003-12-31T10:14:55-08:00' + templates = ( + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%d', + '%Y-%m', + '%Y', + ) + # strptime isn't smart enough to parse literal timezone offsets like + # '-07:30', so we have to do it ourselves + parseable_part = w3cdtf_str[:19] + offset_str = w3cdtf_str[19:] + dt = None + for tmpl in templates: + try: + dt = datetime.strptime(parseable_part, tmpl) + except ValueError: + continue + if dt is None: + tmpl = "could not parse W3CDTF datetime string '%s'" + raise ValueError(tmpl % w3cdtf_str) + if len(offset_str) == 6: + return cls._offset_dt(dt, offset_str) + return dt + + def _set_element_datetime(self, prop_name, value): + """ + Set date/time value of child element having *prop_name* to *value*. + """ + if not isinstance(value, datetime): + tmpl = ( + "property requires object, got %s" + ) + raise ValueError(tmpl % type(value)) + element = self._get_or_add(prop_name) + dt_str = value.strftime('%Y-%m-%dT%H:%M:%SZ') + element.text = dt_str + if prop_name in ('created', 'modified'): + # These two require an explicit 'xsi:type="dcterms:W3CDTF"' + # attribute. The first and last line are a hack required to add + # the xsi namespace to the root element rather than each child + # element in which it is referenced + self.set(qn('xsi:foo'), 'bar') + element.set(qn('xsi:type'), 'dcterms:W3CDTF') + del self.attrib[qn('xsi:foo')] + + def _set_element_text(self, prop_name, value): + """ + Set string value of *name* property to *value*. + """ + value = str(value) + if len(value) > 255: + tmpl = ( + "exceeded 255 char limit for property, got:\n\n'%s'" + ) + raise ValueError(tmpl % value) + element = self._get_or_add(prop_name) + element.text = value + + def _text_of_element(self, property_name): + """ + Return the text in the element matching *property_name*, or an empty + string if the element is not present or contains no text. + """ + element = getattr(self, property_name) + if element is None: + return '' + if element.text is None: + return '' + return element.text + diff --git a/docx/oxml/ns.py b/docx/oxml/ns.py index 6b086128..42d5e2a8 100644 --- a/docx/oxml/ns.py +++ b/docx/oxml/ns.py @@ -19,6 +19,7 @@ "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture", "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", "sl": "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + 'vt' : "http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes", "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", 'w14': "http://schemas.microsoft.com/office/word/2010/wordml", "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", diff --git a/docx/parts/document.py b/docx/parts/document.py index 59d0b7a7..27d61d56 100644 --- a/docx/parts/document.py +++ b/docx/parts/document.py @@ -44,6 +44,14 @@ def core_properties(self): """ return self.package.core_properties + @property + def custom_properties(self): + """ + A |CustomProperties| object providing read/write access to the custom + properties of this document. + """ + return self.package.custom_properties + @property def document(self): """ diff --git a/features/doc-customprops.feature b/features/doc-customprops.feature new file mode 100644 index 00000000..2dcec28a --- /dev/null +++ b/features/doc-customprops.feature @@ -0,0 +1,28 @@ +Feature: Read and write custom document properties + In order to find documents and make them manageable by digital means + As a developer using python-docx + I need to access and modify the Dublin Core metadata for a document + + + Scenario: read the custom properties of a document + Given a document having known custom properties + Then I can access the custom properties object + And the custom property values match the known values + + + Scenario: change the custom properties of a document + Given a document having known custom properties + When I assign new values to the custom properties + Then the custom property values match the new values + + + Scenario: a default custom properties part is added if doc doesn't have one + Given a document having no custom properties part + When I access the custom properties object + Then a custom properties part with no values is added + + + Scenario: set custom properties on a document that doesn't have one + Given a document having no custom properties part + When I assign new values to the custom properties + Then the custom property values match the new values diff --git a/features/steps/customprops.py b/features/steps/customprops.py new file mode 100644 index 00000000..d3c06c9d --- /dev/null +++ b/features/steps/customprops.py @@ -0,0 +1,89 @@ +# encoding: utf-8 + +""" +Gherkin step implementations for custom properties-related features. +""" + +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) + +from datetime import datetime, timedelta + +from behave import given, then, when + +from docx import Document +from docx.opc.customprops import CustomProperties + +from helpers import test_docx + + +# given =================================================== + +@given('a document having known custom properties') +def given_a_document_having_known_custom_properties(context): + context.document = Document(test_docx('doc-customprops')) + + +@given('a document having no custom properties part') +def given_a_document_having_no_custom_properties_part(context): + context.document = Document(test_docx('doc-no-customprops')) + + +# when ==================================================== + +@when('I access the custom properties object') +def when_I_access_the_custom_properties_object(context): + context.document.custom_properties + + +@when("I assign new values to the custom properties") +def when_I_assign_new_values_to_the_custom_properties(context): + context.propvals = ( + ('CustomPropBool', False), + ('CustomPropInt', 1), + ('CustomPropString', 'Lorem ipsum'), + ) + custom_properties = context.document.custom_properties + for name, value in context.propvals: + custom_properties[name] = value + + +# then ==================================================== + +@then('a custom properties part with no values is added') +def then_a_custom_properties_part_with_no_values_is_added(context): + custom_properties = context.document.custom_properties + assert len(custom_properties) == 0 + + +@then('I can access the custom properties object') +def then_I_can_access_the_custom_properties_object(context): + document = context.document + custom_properties = document.custom_properties + assert isinstance(custom_properties, CustomProperties) + + +@then('the custom property values match the known values') +def then_the_custom_property_values_match_the_known_values(context): + known_propvals = ( + ('CustomPropBool', True), + ('CustomPropInt', 13), + ('CustomPropString', 'Test String'), + ) + custom_properties = context.document.custom_properties + for name, expected_value in known_propvals: + value = custom_properties[name] + assert value == expected_value, ( + "got '%s' for custom property '%s'" % (value, name) + ) + + +@then('the custom property values match the new values') +def then_the_custom_property_values_match_the_new_values(context): + custom_properties = context.document.custom_properties + for name, expected_value in context.propvals: + value = custom_properties[name] + assert value == expected_value, ( + "got '%s' for custom property '%s'" % (value, name) + ) diff --git a/features/steps/test_files/doc-customprops.docx b/features/steps/test_files/doc-customprops.docx new file mode 100644 index 00000000..a3dc7a02 Binary files /dev/null and b/features/steps/test_files/doc-customprops.docx differ diff --git a/features/steps/test_files/doc-no-customprops.docx b/features/steps/test_files/doc-no-customprops.docx new file mode 100644 index 00000000..588bf557 Binary files /dev/null and b/features/steps/test_files/doc-no-customprops.docx differ diff --git a/tests/opc/parts/test_customprops.py b/tests/opc/parts/test_customprops.py new file mode 100644 index 00000000..bfd42ecc --- /dev/null +++ b/tests/opc/parts/test_customprops.py @@ -0,0 +1,51 @@ +# encoding: utf-8 + +""" +Unit test suite for the docx.opc.parts.customprops module +""" + +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) + +from datetime import datetime, timedelta + +import pytest + +from docx.opc.customprops import CustomProperties +from docx.opc.parts.customprops import CustomPropertiesPart +from docx.oxml.customprops import CT_CustomProperties + +from ...unitutil.mock import class_mock, instance_mock + + +class DescribeCustomPropertiesPart(object): + + def it_provides_access_to_its_custom_props_object(self, customprops_fixture): + custom_properties_part, CustomProperties_ = customprops_fixture + custom_properties = custom_properties_part.custom_properties + CustomProperties_.assert_called_once_with(custom_properties_part.element) + assert isinstance(custom_properties, CustomProperties) + + def it_can_create_a_default_custom_properties_part(self): + custom_properties_part = CustomPropertiesPart.default(None) + assert isinstance(custom_properties_part, CustomPropertiesPart) + custom_properties = custom_properties_part.custom_properties + assert len(custom_properties) == 0 + + # fixtures --------------------------------------------- + + @pytest.fixture + def customprops_fixture(self, element_, CustomProperties_): + custom_properties_part = CustomPropertiesPart(None, None, element_, None) + return custom_properties_part, CustomProperties_ + + # fixture components ----------------------------------- + + @pytest.fixture + def CustomProperties_(self, request): + return class_mock(request, 'docx.opc.parts.customprops.CustomProperties') + + @pytest.fixture + def element_(self, request): + return instance_mock(request, CT_CustomProperties) diff --git a/tests/opc/test_customprops.py b/tests/opc/test_customprops.py new file mode 100644 index 00000000..8f3482d0 --- /dev/null +++ b/tests/opc/test_customprops.py @@ -0,0 +1,90 @@ +# encoding: utf-8 + +""" +Unit test suite for the docx.opc.customprops module +""" + +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) + +import pytest + +from datetime import datetime + +from docx.opc.customprops import CustomProperties +from docx.oxml.customprops import CT_CustomProperties +from docx.oxml import parse_xml +from lxml import etree + +class DescribeCustomProperties(object): + + def it_can_read_existing_prop_values(self, prop_get_fixture): + custom_properties, prop_name, exp_value = prop_get_fixture + actual_value = custom_properties[prop_name] + assert actual_value == exp_value + + def it_can_change_existing_prop_values(self): + pass + + def it_can_set_new_prop_values(self, prop_set_fixture): + custom_properties, prop_name, value, exp_xml = prop_set_fixture + custom_properties[prop_name] = value + assert custom_properties._element.xml == exp_xml + + # fixtures ------------------------------------------------------- + + @pytest.fixture(params=[ + ('CustomPropString', 'Test String'), + ('CustomPropBool', True), + ('CustomPropInt', 13), + ('CustomPropFoo', None), + ]) + def prop_get_fixture(self, request, custom_properties_default): + prop_name, expected_value = request.param + return custom_properties_default, prop_name, expected_value + + @pytest.fixture(params=[ + ('CustomPropString', 'lpwstr', 'Hi there!', 'Hi there!'), + ('CustomPropBool', 'bool', '0', False), + ('CustomPropInt', 'i4', '5', 5), + ]) + def prop_set_fixture(self, request, custom_properties_blank): + prop_name, str_type, str_value, value = request.param + expected_xml = self.customProperties(prop_name, str_type, str_value) + return custom_properties_blank, prop_name, value, expected_xml + + # fixture components --------------------------------------------- + + def customProperties(self, prop_name, str_type, str_value): + tmpl = ( + '\n' + ' \n' + ' %s\n' + ' \n' + '' + ) + return tmpl %(prop_name, str_type, str_value, str_type) + + @pytest.fixture + def custom_properties_blank(self): + element = parse_xml( + '' + '\n' + ) + return CustomProperties(element) + + @pytest.fixture + def custom_properties_default(self): + element = parse_xml( + b'\n' + b'\n' + b' 1\n' + b' 13\n' + b' Test String\n' + b'\n' + ) + return CustomProperties(element)