From 6e9ac924da95c5b3df323c35b0ff990c621c89f1 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 11 Sep 2024 11:38:27 +0200 Subject: [PATCH 001/191] docs/conf.py: use absolute path to VERSION --- docs/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 3ab2e1826f..05bde3519e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,8 @@ # import os # import sys # # sys.path.insert(0, os.path.abspath('..')) -with open('VERSION', encoding='utf-8') as f: +from pathlib import Path +with open(Path(__file__).parent.parent / 'VERSION', encoding='utf-8') as f: VERSION = f.read() From 60b223a296f7ab534963f96d404866622eb6653a Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 11 Sep 2024 11:39:26 +0200 Subject: [PATCH 002/191] update apidocs --- .../ocrd_network/ocrd_network.client_utils.rst | 7 +++++++ .../ocrd_network/ocrd_network.logging_utils.rst | 7 +++++++ .../ocrd_network.rabbitmq_utils.helpers.rst | 7 +++++++ .../ocrd_network.rabbitmq_utils.rst | 1 + docs/api/ocrd_network/ocrd_network.rst | 8 ++++---- .../ocrd_network.runtime_data.config_parser.rst | 7 +++++++ ..._network.runtime_data.connection_clients.rst | 7 +++++++ .../ocrd_network.runtime_data.deployer.rst | 7 +++++++ .../ocrd_network.runtime_data.hosts.rst | 7 +++++++ ...ocrd_network.runtime_data.network_agents.rst | 7 +++++++ ...rd_network.runtime_data.network_services.rst | 7 +++++++ .../ocrd_network/ocrd_network.runtime_data.rst | 17 +++++++++++++++-- .../ocrd_network.tcp_to_uds_mets_proxy.rst | 7 +++++++ 13 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 docs/api/ocrd_network/ocrd_network.client_utils.rst create mode 100644 docs/api/ocrd_network/ocrd_network.logging_utils.rst create mode 100644 docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst create mode 100644 docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst create mode 100644 docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst diff --git a/docs/api/ocrd_network/ocrd_network.client_utils.rst b/docs/api/ocrd_network/ocrd_network.client_utils.rst new file mode 100644 index 0000000000..973e27cdb5 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.client_utils.rst @@ -0,0 +1,7 @@ +ocrd\_network.client\_utils module +================================== + +.. automodule:: ocrd_network.client_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.logging_utils.rst b/docs/api/ocrd_network/ocrd_network.logging_utils.rst new file mode 100644 index 0000000000..561ce00193 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.logging_utils.rst @@ -0,0 +1,7 @@ +ocrd\_network.logging\_utils module +=================================== + +.. 
automodule:: ocrd_network.logging_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst new file mode 100644 index 0000000000..e13ff897a9 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.helpers.rst @@ -0,0 +1,7 @@ +ocrd\_network.rabbitmq\_utils.helpers module +============================================ + +.. automodule:: ocrd_network.rabbitmq_utils.helpers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst index 36b581a337..63fd6f89aa 100644 --- a/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst +++ b/docs/api/ocrd_network/ocrd_network.rabbitmq_utils.rst @@ -15,5 +15,6 @@ Submodules ocrd_network.rabbitmq_utils.connector ocrd_network.rabbitmq_utils.constants ocrd_network.rabbitmq_utils.consumer + ocrd_network.rabbitmq_utils.helpers ocrd_network.rabbitmq_utils.ocrd_messages ocrd_network.rabbitmq_utils.publisher diff --git a/docs/api/ocrd_network/ocrd_network.rst b/docs/api/ocrd_network/ocrd_network.rst index ae12ae1f5d..d61da39313 100644 --- a/docs/api/ocrd_network/ocrd_network.rst +++ b/docs/api/ocrd_network/ocrd_network.rst @@ -15,6 +15,7 @@ Subpackages ocrd_network.cli ocrd_network.models ocrd_network.rabbitmq_utils + ocrd_network.runtime_data Submodules ---------- @@ -23,17 +24,16 @@ Submodules :maxdepth: 4 ocrd_network.client + ocrd_network.client_utils ocrd_network.constants ocrd_network.database - ocrd_network.deployer - ocrd_network.deployment_utils - ocrd_network.logging + ocrd_network.logging_utils ocrd_network.param_validators ocrd_network.process_helpers ocrd_network.processing_server ocrd_network.processing_worker ocrd_network.processor_server - ocrd_network.runtime_data ocrd_network.server_cache ocrd_network.server_utils + ocrd_network.tcp_to_uds_mets_proxy ocrd_network.utils diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst new file mode 100644 index 0000000000..e56ad31f89 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.config_parser.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.config\_parser module +================================================= + +.. automodule:: ocrd_network.runtime_data.config_parser + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst new file mode 100644 index 0000000000..2fd62e5ef2 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.connection_clients.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.connection\_clients module +====================================================== + +.. automodule:: ocrd_network.runtime_data.connection_clients + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst new file mode 100644 index 0000000000..62abe20db3 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.deployer.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.deployer module +=========================================== + +.. 
automodule:: ocrd_network.runtime_data.deployer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst new file mode 100644 index 0000000000..8f9001c381 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.hosts.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.hosts module +======================================== + +.. automodule:: ocrd_network.runtime_data.hosts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst new file mode 100644 index 0000000000..1a597caad1 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_agents.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.network\_agents module +================================================== + +.. automodule:: ocrd_network.runtime_data.network_agents + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst new file mode 100644 index 0000000000..d72e67c9d6 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.network_services.rst @@ -0,0 +1,7 @@ +ocrd\_network.runtime\_data.network\_services module +==================================================== + +.. automodule:: ocrd_network.runtime_data.network_services + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.runtime_data.rst b/docs/api/ocrd_network/ocrd_network.runtime_data.rst index fefa00b492..cdf45f6b6e 100644 --- a/docs/api/ocrd_network/ocrd_network.runtime_data.rst +++ b/docs/api/ocrd_network/ocrd_network.runtime_data.rst @@ -1,7 +1,20 @@ -ocrd\_network.runtime\_data module -================================== +ocrd\_network.runtime\_data package +=================================== .. automodule:: ocrd_network.runtime_data :members: :undoc-members: :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + ocrd_network.runtime_data.config_parser + ocrd_network.runtime_data.connection_clients + ocrd_network.runtime_data.deployer + ocrd_network.runtime_data.hosts + ocrd_network.runtime_data.network_agents + ocrd_network.runtime_data.network_services diff --git a/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst new file mode 100644 index 0000000000..fa6e607f94 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.tcp_to_uds_mets_proxy.rst @@ -0,0 +1,7 @@ +ocrd\_network.tcp\_to\_uds\_mets\_proxy module +============================================== + +.. 
automodule:: ocrd_network.tcp_to_uds_mets_proxy + :members: + :undoc-members: + :show-inheritance: From dfeddbd6728523cb78a879246bd07f02cdbc7ab7 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 12 Sep 2024 18:44:26 +0200 Subject: [PATCH 003/191] Fix typos (most of them found by codespell) Signed-off-by: Stefan Weil --- Makefile | 2 +- src/ocrd/mets_server.py | 2 +- src/ocrd_network/processing_server.py | 2 +- tests/network/test_modules_mets_server_proxy.py | 2 +- tests/test_resolver.py | 2 +- tests/test_resource_manager.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 4997066d1b..70b047c8a9 100644 --- a/Makefile +++ b/Makefile @@ -178,7 +178,7 @@ build: # (Re)install the tool install: #build - # not stricttly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 + # not strictly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 $(PIP) install -U pip wheel $(PIP_INSTALL) . $(PIP_INSTALL_CONFIG_OPTION) @# workaround for shapely#1598 diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0d4c0a0785..c73dbb9b99 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -303,7 +303,7 @@ def add_file( class MpxReq: - """This class wrapps the request bodies needed for the tcp forwarding + """This class wraps the request bodies needed for the tcp forwarding For every mets-server-call like find_files or workspace_path a special request_body is needed to call `MetsServerProxy.forward_tcp_request`. These are created by this functions. diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..a9948ccf80 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -320,7 +320,7 @@ async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict """Forward mets-server-request A processor calls a mets related method like add_file with ClientSideOcrdMets. This sends - a request to this endpoint. This request contains all infomation neccessary to make a call + a request to this endpoint. This request contains all information necessary to make a call to the uds-mets-server. This information is used by `MetsServerProxy` to make a the call to the local (local for the processing-server) reachable the uds-mets-server. 
""" diff --git a/tests/network/test_modules_mets_server_proxy.py b/tests/network/test_modules_mets_server_proxy.py index 8b8c0d35f7..f19d7e415e 100644 --- a/tests/network/test_modules_mets_server_proxy.py +++ b/tests/network/test_modules_mets_server_proxy.py @@ -119,7 +119,7 @@ def test_find_files(start_uds_mets_server): {"file_grp": test_file_group} ) response_dict = MetsServerProxy().forward_tcp_request(request_body=request_body) - assert len(response_dict["files"]) == 3, "Expected to find exatly 3 matching files" + assert len(response_dict["files"]) == 3, "Expected to find exactly 3 matching files" request_body = MpxReq.find_files( TEST_WORKSPACE_DIR, {"file_grp": test_non_existing_file_group} diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 16dfd03d56..aa0d802926 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -118,7 +118,7 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): """ - Fail with clobber_mets=False, succeeed with clobber_mets=True + Fail with clobber_mets=False, succeed with clobber_mets=True """ # arrange diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 653167e10a..286f6ea6b0 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -80,7 +80,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): assert mgr.userdir == tmp_path -def test_resources_manager_config_explicite(tmp_path): +def test_resources_manager_config_explicit(tmp_path): # act from ocrd.resource_manager import OcrdResourceManager From c597de69b65ca1ef46723fe512c5303360572c9d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Sep 2024 22:29:38 +0200 Subject: [PATCH 004/191] add ocrd-filter processor --- pyproject.toml | 1 + .../processor/builtin/dummy/ocrd-tool.json | 50 +++++++ .../processor/builtin/filter_processor.py | 135 ++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 src/ocrd/processor/builtin/filter_processor.py diff --git a/pyproject.toml b/pyproject.toml index 5a081bb91e..0e643c23ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ Issues = "https://github.com/OCR-D/core/issues" [project.scripts] ocrd = "ocrd.cli:cli" ocrd-dummy = "ocrd.processor.builtin.dummy_processor:cli" +ocrd-filter = "ocrd.processor.builtin.filter_processor:cli" [tool.setuptools] include-package-data = true diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index ef4a4810fe..2f65f58ea3 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -16,6 +16,56 @@ "description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)" } } + }, + "ocrd-filter": { + "executable": "ocrd-filter", + "description": "Bare-bones processor can be dynamically configured to remove segments based on XPath queries", + "steps": ["recognition/post-correction"], + "categories": ["Quality assurance"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "parameters": { + "type": { + "type": "string", + "default": "all", + "enum": [ + "all", + "region", + "line", + "word", + "glyph", + "NoiseRegion", + "LineDrawingRegion", + "AdvertRegion", + "ImageRegion", + "ChartRegion", + "MusicRegion", + "GraphicRegion", + "UnknownRegion", + "CustomRegion", + "SeparatorRegion", + 
"MathsRegion", + "TextRegion", + "MapRegion", + "ChemRegion", + "TableRegion", + "TextLine", + "Word", + "Glyph" + ], + "description": "Which type of segments to remove from. Either a precise element name ('TextRegion', 'TextLine') or an alias ('all', 'region', 'line', 'word', 'glyph')." + }, + "query": { + "type": "string", + "default": "", + "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'TextEquiv/@conf < 0.7'. Or low layout confidence, 'Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." + }, + "plot": { + "type": "boolean", + "default": false, + "description": "Whether to extract an image for each filtered segment and write to the output fileGrp." + } + } } } } diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py new file mode 100644 index 0000000000..b5c1fa9ad1 --- /dev/null +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -0,0 +1,135 @@ +# pylint: disable=missing-module-docstring,invalid-name +from typing import Optional + +from lxml import etree +import click + +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd_models.ocrd_file import OcrdFileType +from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_utils import ( + make_file_id, + MIME_TO_EXT, + MIMETYPE_PAGE, + xywh_from_points, + parse_json_string_with_comments, + resource_string, + config +) +from ocrd_modelfactory import page_from_file + +def pc_area(ctxt, node): + # FIXME find out why this gets passed as list + node = node[0] + coords = node.find(f'{node.prefix}:Coords', node.nsmap) + if coords is None: + return 0 + points = coords.attrib['points'] + xywh = xywh_from_points(points) + return xywh['w'] * xywh['h'] + +def pc_text(ctxt, node): + # FIXME find out why this gets passed as list + node = node[0] + equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) + if equiv is None: + return '' + string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) + if string is None: + return '' + return string.text + +class FilterProcessor(Processor): + + def setup(self): + ns = etree.FunctionNamespace(None) + ns['pixelarea'] = pc_area + # cannot use text() - conflicts with builtin fn + ns['textequiv'] = pc_text + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """ + Remove segments based on flexible selection criteria. + + Open and deserialise PAGE input file, then iterate over the segment hierarchy + down to the level required for ``type``. + + Remove any segments of type ``type`` which also evaluate the XPath predicate ``query`` + to true (or non-empty). + + If ``plot`` is `true`, then extract and write an image file for all removed segments + to the output fileGrp (without reference to the PAGE). + + Produce a new PAGE output file by serialising the resulting hierarchy. 
+ """ + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + root = pcgts.etree + NS = {'re': 'http://exslt.org/regular-expressions', + 'pc': root.nsmap[root.prefix], + root.prefix: root.nsmap[root.prefix]} + segtype = self.parameter['type'] + segpred = self.parameter['query'] + if segtype == 'region': + segments = pcgts.get_Page().get_AllRegions() + elif segtype == 'line': + segments = pcgts.get_Page().get_AllTextLines() + elif segtype == 'word': + lines = pcgts.get_Page().get_AllTextLines() + segments = [word for line in lines for word in line.get_Word() or []] + elif segtype == 'glyph': + lines = pcgts.get_Page().get_AllTextLines() + segments = [glyph for line in lines for word in line.get_Word() or [] for glyph in word.get_Glyph() or []] + else: + nodes = [node.attrib['id'] for node in pcgts.etree.xpath(f'//pc:{segtype}', namespaces=NS)] + regions = pcgts.get_Page().get_AllRegions() + textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] + lines = [line for region in textregions for line in region.get_TextLine() or []] + words = [word for line in lines for word in line.get_Word() or []] + glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] + segments = [segment for segment in regions + lines + words + glyphs + if segment.id in nodes or segtype == 'all'] + if not(len(segments)): + self.logger.info("no matches") + return result + if self.parameter['plot']: + page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) + for segment in segments: + node = pcgts.mapping[id(segment)] + if not segpred or node.xpath(segpred, namespaces=NS): + segtype = segment.original_tagname_ + self.logger.info("matched %s segment %s", segtype, segment.id) + parent = segment.parent_object_ + partype = parent.__class__.__name__.replace('Type', '') + if partype == 'Page': + getattr(parent, 'get_' + segtype)().remove(segment) + elif partype.endswith('Region'): + if segtype.endswith('Region'): + getattr(parent, 'get_' + segtype)().remove(segment) + else: + parent.TextLine.remove(segment) + elif partype == 'TextLine': + parent.Word.remove(segment) + elif partype == 'Word': + parent.Glyph.remove(segment) + else: + raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") + segment.parent_object_ = None + if self.parameter['plot']: + segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) + result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) + return result + + @property + def metadata_filename(self): + return 'processor/builtin/dummy/ocrd-tool.json' + + @property + def executable(self): + return 'ocrd-filter' + +@click.command() +@ocrd_cli_options +def cli(*args, **kwargs): + return ocrd_cli_wrap_processor(FilterProcessor, *args, **kwargs) From 465ebdb128899f9bddde119d0319adfe71e9f13f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:26:32 +0200 Subject: [PATCH 005/191] ocrd-filter: also remove removed segments from ReadingOrder --- src/ocrd/processor/builtin/dummy/ocrd-tool.json | 2 +- src/ocrd/processor/builtin/filter_processor.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 2f65f58ea3..97174a4073 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -58,7 +58,7 @@ "query": { "type": "string", "default": "", - 
"description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'TextEquiv/@conf < 0.7'. Or low layout confidence, 'Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." + "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." }, "plot": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index b5c1fa9ad1..70e4a0c3dc 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -93,6 +93,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if not(len(segments)): self.logger.info("no matches") return result + rodict = pcgts.get_Page().get_ReadingOrderGroups() if self.parameter['plot']: page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) for segment in segments: @@ -116,6 +117,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional else: raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") segment.parent_object_ = None + if segtype.endswith('Region') and segment.id in rodict: + # remove from ReadingOrder as well + roelem = rodict[segment.id] + rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', '')) + rorefs.remove(roelem) + roelem.parent_object_ = None + del rodict[segment.id] if self.parameter['plot']: segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) From 6983fd6ee7e832b1a97ad3d61803a038485e3197 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 10 Sep 2024 19:16:05 +0200 Subject: [PATCH 006/191] ocrd-filter: register XPath functions under PAGE prefix/NS, precompile, avoid buggy lxml global registration mechanism --- .../processor/builtin/dummy/ocrd-tool.json | 2 +- .../processor/builtin/filter_processor.py | 169 ++++++++++++------ 2 files changed, 119 insertions(+), 52 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 97174a4073..3d73169ec6 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -58,7 +58,7 @@ "query": { "type": "string", "default": "", - "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pixelarea()' for the number of pixels of the bounding box, and 'textequiv()' for the first TextEquiv unicode string. 
For example, to remove high pixel-to-character rate, set 'pixelarea(.) div string-length(textequiv(.)) > 500'." + "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pc:area()' for the number of pixels of the bounding box, and 'pc:text()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pc:area(.) div string-length(pc:text(.)) > 500'." }, "plot": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index 70e4a0c3dc..1db9c0b131 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -8,6 +8,7 @@ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.constants import NAMESPACES from ocrd_utils import ( make_file_id, MIME_TO_EXT, @@ -19,34 +20,110 @@ ) from ocrd_modelfactory import page_from_file -def pc_area(ctxt, node): - # FIXME find out why this gets passed as list - node = node[0] - coords = node.find(f'{node.prefix}:Coords', node.nsmap) - if coords is None: - return 0 - points = coords.attrib['points'] - xywh = xywh_from_points(points) - return xywh['w'] * xywh['h'] - -def pc_text(ctxt, node): - # FIXME find out why this gets passed as list - node = node[0] - equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) - if equiv is None: - return '' - string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) - if string is None: - return '' - return string.text +def xpath(func, *, ns_uri: Optional[str] = None, ns_prefix: Optional[str] = ''): + ns = etree.FunctionNamespace(ns_uri) + if ns_prefix: + # FIXME: this crashes lxml (even with just a single thread) when called repeatedly + # we work around this by using the `extensions` kwarg to XPath init in setup() below + # (i.e. registerLocalFunctions instead of registerGlobalFunctions) + #ns.prefix = ns_prefix + raise NotImplementedError() + name = func.__name__.replace('_', '-') + if ns_prefix and name.startswith(ns_prefix): + name = name[len(ns_prefix):] + if name.startswith('-'): + name = name[1:] + ns[name] = func + return func -class FilterProcessor(Processor): +def pc_xpath(func): + return xpath(func, ns_uri=NAMESPACES['page'], ns_prefix='pc') + +#@pc_xpath +def pc_area(ctxt, nodes): + """ + Extract Coords/@points from all nodes, calculate the bounding + box, and accumulate areas. + """ + area = 0 + for node in nodes: + coords = node.find(f'{node.prefix}:Coords', node.nsmap) + if coords is None: + continue + points = coords.attrib['points'] + xywh = xywh_from_points(points) + area += xywh['w'] * xywh['h'] + return area + +#@pc_xpath +def pc_text(ctxt, nodes): + """ + Extract TextEquiv/Unicode from all nodes, then concatenate + (interspersed with spaces or newlines). 
+ """ + text = '' + for node in nodes: + if text and node.tag.endswith('Region'): + text += '\n' + if text and node.tag.endswith('Line'): + text += '\n' + if text and node.tag.endswith('Word'): + text += ' ' + equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) + if equiv is None: + continue + string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) + if string is None: + continue + text += str(string.text) + return text +_SEGTYPES = [ + "NoiseRegion", + "LineDrawingRegion", + "AdvertRegion", + "ImageRegion", + "ChartRegion", + "MusicRegion", + "GraphicRegion", + "UnknownRegion", + "CustomRegion", + "SeparatorRegion", + "MathsRegion", + "TextRegion", + "MapRegion", + "ChemRegion", + "TableRegion", + "TextLine", + "Word", + "Glyph" +] + +class FilterProcessor(Processor): def setup(self): - ns = etree.FunctionNamespace(None) - ns['pixelarea'] = pc_area - # cannot use text() - conflicts with builtin fn - ns['textequiv'] = pc_text + NS = {'re': 'http://exslt.org/regular-expressions', + 'pc': NAMESPACES['page']} + extensions = {(NAMESPACES['page'], 'area'): pc_area, + (NAMESPACES['page'], 'text'): pc_text} + segtype = self.parameter['type'] + if segtype == 'all': + segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES) + elif segtype == 'region': + segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES if segtype.endswith('Region')) + elif segtype == 'line': + segtype = '//pc:TextLine' + elif segtype == 'word': + segtype = '//pc:Word' + elif segtype == 'glyph': + segtype = '//pc:Glyph' + else: + segtype = '//pc:' + segtype + self.segtypexpath = etree.XPath(segtype, namespaces=NS, extensions=extensions) + segpred = self.parameter['query'] + if segpred: + self.segpredxpath = etree.XPath(segpred, namespaces=NS, extensions=extensions) + else: + self.segpredxpath = lambda: True def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ @@ -65,31 +142,18 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - root = pcgts.etree - NS = {'re': 'http://exslt.org/regular-expressions', - 'pc': root.nsmap[root.prefix], - root.prefix: root.nsmap[root.prefix]} - segtype = self.parameter['type'] - segpred = self.parameter['query'] - if segtype == 'region': - segments = pcgts.get_Page().get_AllRegions() - elif segtype == 'line': - segments = pcgts.get_Page().get_AllTextLines() - elif segtype == 'word': - lines = pcgts.get_Page().get_AllTextLines() - segments = [word for line in lines for word in line.get_Word() or []] - elif segtype == 'glyph': - lines = pcgts.get_Page().get_AllTextLines() - segments = [glyph for line in lines for word in line.get_Word() or [] for glyph in word.get_Glyph() or []] - else: - nodes = [node.attrib['id'] for node in pcgts.etree.xpath(f'//pc:{segtype}', namespaces=NS)] - regions = pcgts.get_Page().get_AllRegions() - textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] - lines = [line for region in textregions for line in region.get_TextLine() or []] - words = [word for line in lines for word in line.get_Word() or []] - glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] - segments = [segment for segment in regions + lines + words + glyphs - if segment.id in nodes or segtype == 'all'] + nodes = [node.attrib['id'] for node in self.segtypexpath(pcgts.etree)] + if self.segtypexpath.error_log: + self.logger.error(self.segtypexpath.error_log) + # get PAGE objects from 
matching etree nodes + # FIXME: this should be easier (OcrdPage should have id lookup mechanism) + regions = pcgts.get_Page().get_AllRegions() + textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] + lines = [line for region in textregions for line in region.get_TextLine() or []] + words = [word for line in lines for word in line.get_Word() or []] + glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] + segments = [segment for segment in regions + lines + words + glyphs + if segment.id in nodes] if not(len(segments)): self.logger.info("no matches") return result @@ -98,7 +162,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) for segment in segments: node = pcgts.mapping[id(segment)] - if not segpred or node.xpath(segpred, namespaces=NS): + assert isinstance(node, etree._Element) + if self.segpredxpath(node): segtype = segment.original_tagname_ self.logger.info("matched %s segment %s", segtype, segment.id) parent = segment.parent_object_ @@ -127,6 +192,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if self.parameter['plot']: segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) + if self.segpredxpath.error_log: + self.logger.error(self.segpredxpath.error_log) return result @property From 62712364d3e38dd0ebfd324917515a30a07b461b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 00:00:26 +0200 Subject: [PATCH 007/191] ocrd-filter: simplify parameters (just 'select' instead of 'type' and 'query'), use 'elementpath.XPathParser.external_function' with global registration instead of 'etree.FunctionNamespace' with local extension --- requirements.txt | 1 + .../processor/builtin/dummy/ocrd-tool.json | 36 +---- .../processor/builtin/filter_processor.py | 125 ++++++++---------- 3 files changed, 58 insertions(+), 104 deletions(-) diff --git a/requirements.txt b/requirements.txt index e78c186618..1c14260ae5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ click >=7 cryptography < 43.0.0 Deprecated == 1.2.0 docker +elementpath fastapi>=0.78.0 filetype Flask diff --git a/src/ocrd/processor/builtin/dummy/ocrd-tool.json b/src/ocrd/processor/builtin/dummy/ocrd-tool.json index 3d73169ec6..c79afcacbd 100644 --- a/src/ocrd/processor/builtin/dummy/ocrd-tool.json +++ b/src/ocrd/processor/builtin/dummy/ocrd-tool.json @@ -25,40 +25,10 @@ "input_file_grp_cardinality": 1, "output_file_grp_cardinality": 1, "parameters": { - "type": { + "select": { "type": "string", - "default": "all", - "enum": [ - "all", - "region", - "line", - "word", - "glyph", - "NoiseRegion", - "LineDrawingRegion", - "AdvertRegion", - "ImageRegion", - "ChartRegion", - "MusicRegion", - "GraphicRegion", - "UnknownRegion", - "CustomRegion", - "SeparatorRegion", - "MathsRegion", - "TextRegion", - "MapRegion", - "ChemRegion", - "TableRegion", - "TextLine", - "Word", - "Glyph" - ], - "description": "Which type of segments to remove from. Either a precise element name ('TextRegion', 'TextLine') or an alias ('all', 'region', 'line', 'word', 'glyph')." - }, - "query": { - "type": "string", - "default": "", - "description": "Which segments to select for removal (XPath predicate). Matches unconditionally, if empty. 
For example, to remove elements with low text confidence, set 'pc:TextEquiv/@conf < 0.7'. Or low layout confidence, 'pc:Coords/@conf < 0.7'.\nSupports extra predicates 'pc:area()' for the number of pixels of the bounding box, and 'pc:text()' for the first TextEquiv unicode string. For example, to remove high pixel-to-character rate, set 'pc:area(.) div string-length(pc:text(.)) > 500'." + "default": "//*[ends-with(local-name(),'Region')]", + "description": "Which segments to select for removal. An XPath 2.0 query expression (path and optional predicates), with 'pc' as namespace prefix for PAGE-XML and our extension functions (see help text). Only selection of segment hierarchy elements is allowed (so e.g. `*` would be equivalent to `pc:NoiseRegion|pc:LineDrawingRegion|pc:AdvertRegion|pc:ImageRegion|pc:ChartRegion|pc:MusicRegion|pc:GraphicRegion|pc:UnknownRegion|pc:CustomRegion|pc:SeparatorRegion|pc:MathsRegion|pc:TextRegion|pc:MapRegion|pc:ChemRegion|pc:TableRegion|pc:TextLine|pc:Word|pc:Glyph`, but `pc:MetadataItem` or `pc:Border` or `pc:Coords` would not match).\nFor example, to remove words or glyphs with low text confidence, select '(pc:Word|pc:Glyph)[pc:TextEquiv/@conf < 0.7]'. Or low layout confidence, '*[pc:Coords/@conf < 0.7]'.\nTo remove high pixel-to-character rate, select '*[pc:pixelarea(.) div string-length(pc:textequiv(.)) > 10000]'." }, "plot": { "type": "boolean", diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index 1db9c0b131..a8beb09da3 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -2,6 +2,7 @@ from typing import Optional from lxml import etree +import elementpath import click from ocrd import Processor, OcrdPageResult, OcrdPageResultImage @@ -20,33 +21,31 @@ ) from ocrd_modelfactory import page_from_file +PARSER = elementpath.XPath2Parser(namespaces={**NAMESPACES, 'pc': NAMESPACES['page']}) + def xpath(func, *, ns_uri: Optional[str] = None, ns_prefix: Optional[str] = ''): - ns = etree.FunctionNamespace(ns_uri) - if ns_prefix: - # FIXME: this crashes lxml (even with just a single thread) when called repeatedly - # we work around this by using the `extensions` kwarg to XPath init in setup() below - # (i.e. registerLocalFunctions instead of registerGlobalFunctions) - #ns.prefix = ns_prefix - raise NotImplementedError() name = func.__name__.replace('_', '-') if ns_prefix and name.startswith(ns_prefix): name = name[len(ns_prefix):] if name.startswith('-'): name = name[1:] - ns[name] = func + # register + PARSER.external_function(func, name=name, prefix=ns_prefix) return func def pc_xpath(func): return xpath(func, ns_uri=NAMESPACES['page'], ns_prefix='pc') -#@pc_xpath -def pc_area(ctxt, nodes): +@pc_xpath +def pc_pixelarea(nodes): """ Extract Coords/@points from all nodes, calculate the bounding box, and accumulate areas. """ area = 0 for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value coords = node.find(f'{node.prefix}:Coords', node.nsmap) if coords is None: continue @@ -55,14 +54,16 @@ def pc_area(ctxt, nodes): area += xywh['w'] * xywh['h'] return area -#@pc_xpath -def pc_text(ctxt, nodes): +@pc_xpath +def pc_textequiv(nodes): """ Extract TextEquiv/Unicode from all nodes, then concatenate (interspersed with spaces or newlines). 
""" text = '' for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value if text and node.tag.endswith('Region'): text += '\n' if text and node.tag.endswith('Line'): @@ -101,39 +102,26 @@ def pc_text(ctxt, nodes): class FilterProcessor(Processor): def setup(self): - NS = {'re': 'http://exslt.org/regular-expressions', - 'pc': NAMESPACES['page']} - extensions = {(NAMESPACES['page'], 'area'): pc_area, - (NAMESPACES['page'], 'text'): pc_text} - segtype = self.parameter['type'] - if segtype == 'all': - segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES) - elif segtype == 'region': - segtype = '|'.join('//pc:' + segtype for segtype in _SEGTYPES if segtype.endswith('Region')) - elif segtype == 'line': - segtype = '//pc:TextLine' - elif segtype == 'word': - segtype = '//pc:Word' - elif segtype == 'glyph': - segtype = '//pc:Glyph' - else: - segtype = '//pc:' + segtype - self.segtypexpath = etree.XPath(segtype, namespaces=NS, extensions=extensions) - segpred = self.parameter['query'] - if segpred: - self.segpredxpath = etree.XPath(segpred, namespaces=NS, extensions=extensions) - else: - self.segpredxpath = lambda: True + token = PARSER.parse(self.parameter['select']) + def select(root): + context = elementpath.XPathContext(root) + return token.get_results(context) + self.selectxpath = select def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ - Remove segments based on flexible selection criteria. + Remove PAGE segment hierarchy elements based on flexible selection criteria. Open and deserialise PAGE input file, then iterate over the segment hierarchy - down to the level required for ``type``. + down to the level required for ``select`` (which could be multiple levels at once). + + Remove any segments matching XPath query ``select`` from that hierarchy (and from + the `ReadingOrder` if it is a region type). - Remove any segments of type ``type`` which also evaluate the XPath predicate ``query`` - to true (or non-empty). + \b + Besides full XPath 2.0 syntax, this supports extra predicates: + - `pc:pixelarea()` for the number of pixels of the bounding box (or sum area on node sets), + - `pc:textequiv()` for the first TextEquiv unicode string (or concatenated string on node sets). If ``plot`` is `true`, then extract and write an image file for all removed segments to the output fileGrp (without reference to the PAGE). 
@@ -142,9 +130,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - nodes = [node.attrib['id'] for node in self.segtypexpath(pcgts.etree)] - if self.segtypexpath.error_log: - self.logger.error(self.segtypexpath.error_log) + nodes = [node.attrib['id'] for node in self.selectxpath(pcgts.etree) if 'id' in node.attrib] # get PAGE objects from matching etree nodes # FIXME: this should be easier (OcrdPage should have id lookup mechanism) regions = pcgts.get_Page().get_AllRegions() @@ -163,37 +149,34 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for segment in segments: node = pcgts.mapping[id(segment)] assert isinstance(node, etree._Element) - if self.segpredxpath(node): - segtype = segment.original_tagname_ - self.logger.info("matched %s segment %s", segtype, segment.id) - parent = segment.parent_object_ - partype = parent.__class__.__name__.replace('Type', '') - if partype == 'Page': + segtype = segment.original_tagname_ + self.logger.info("matched %s segment %s", segtype, segment.id) + parent = segment.parent_object_ + partype = parent.__class__.__name__.replace('Type', '') + if partype == 'Page': + getattr(parent, 'get_' + segtype)().remove(segment) + elif partype.endswith('Region'): + if segtype.endswith('Region'): getattr(parent, 'get_' + segtype)().remove(segment) - elif partype.endswith('Region'): - if segtype.endswith('Region'): - getattr(parent, 'get_' + segtype)().remove(segment) - else: - parent.TextLine.remove(segment) - elif partype == 'TextLine': - parent.Word.remove(segment) - elif partype == 'Word': - parent.Glyph.remove(segment) else: - raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") - segment.parent_object_ = None - if segtype.endswith('Region') and segment.id in rodict: - # remove from ReadingOrder as well - roelem = rodict[segment.id] - rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', '')) - rorefs.remove(roelem) - roelem.parent_object_ = None - del rodict[segment.id] - if self.parameter['plot']: - segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) - result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) - if self.segpredxpath.error_log: - self.logger.error(self.segpredxpath.error_log) + parent.TextLine.remove(segment) + elif partype == 'TextLine': + parent.Word.remove(segment) + elif partype == 'Word': + parent.Glyph.remove(segment) + else: + raise Exception(f"unexpected type ({partype}) of parent for matched segment ({segtype})") + segment.parent_object_ = None + if segtype.endswith('Region') and segment.id in rodict: + # remove from ReadingOrder as well + roelem = rodict[segment.id] + rorefs = getattr(roelem.parent_object_, roelem.__class__.__name__.replace('Type', '')) + rorefs.remove(roelem) + roelem.parent_object_ = None + del rodict[segment.id] + if self.parameter['plot']: + segment_image, _ = self.workspace.image_from_segment(segment, page_image, page_coords) + result.images.append(OcrdPageResultImage(segment_image, segment.id + '.IMG', None)) return result @property From 09cad0ff5434418536f68f11b91598db015786e2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 01:21:24 +0200 Subject: [PATCH 008/191] ocrd_models.OcrdPage: add XPath 2.0 parser and extended functions --- src/ocrd_models/ocrd_page.py | 15 +++++++++ src/ocrd_models/xpath_functions.py | 51 
++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 src/ocrd_models/xpath_functions.py diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 3f0cc690fa..6a8ea4586f 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -4,6 +4,7 @@ from io import StringIO from typing import Dict, Union from lxml import etree as ET +from elementpath import XPath2Parser, XPathContext __all__ = [ 'parse', @@ -132,6 +133,7 @@ ) from .constants import NAMESPACES +from .xpath_functions import pc_functions # add docstrings parse.__doc__ = ( @@ -195,6 +197,19 @@ def __init__( self.etree = etree self.mapping = mapping self.revmap = revmap + self.xpath_parser = XPath2Parser(namespaces={ + 'page': NAMESPACES['page'], + 'pc': NAMESPACES['page']}) + for func in pc_functions: + name = func.__name__.replace('_', '-') + if name.startswith('pc-'): + name = name[3:] + elif name.startswith('pc'): + name = name[2:] + # register + self.xpath_parser.external_function(func, name=name, prefix='pc') + self.xpath_context = XPathContext(self.etree) + self.xpath = lambda expression: self.xpath_parser.parse(expression).get_results(self.xpath_context) def __getattr__(self, name): return getattr(self._pcgts, name) diff --git a/src/ocrd_models/xpath_functions.py b/src/ocrd_models/xpath_functions.py new file mode 100644 index 0000000000..c204811cae --- /dev/null +++ b/src/ocrd_models/xpath_functions.py @@ -0,0 +1,51 @@ +from ocrd_utils import xywh_from_points + +pc_functions = [] + +def _export(func): + pc_functions.append(func) + return func + +@_export +def pc_pixelarea(nodes): + """ + Extract Coords/@points from all nodes, calculate the bounding + box, and accumulate areas. + """ + area = 0 + for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value + coords = node.find(f'{node.prefix}:Coords', node.nsmap) + if coords is None: + continue + points = coords.attrib['points'] + xywh = xywh_from_points(points) + area += xywh['w'] * xywh['h'] + return area + +@_export +def pc_textequiv(nodes): + """ + Extract TextEquiv/Unicode from all nodes, then concatenate + (interspersed with spaces or newlines). 
+ """ + text = '' + for node in nodes: + # FIXME: find out why we need to go to the parent here + node = node.parent.value + if text and node.tag.endswith('Region'): + text += '\n' + if text and node.tag.endswith('Line'): + text += '\n' + if text and node.tag.endswith('Word'): + text += ' ' + equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) + if equiv is None: + continue + string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) + if string is None: + continue + text += str(string.text) + return text + From 634384931ca758f82a949f722fdb24d6c5ae0d2f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 01:22:07 +0200 Subject: [PATCH 009/191] ocrd-filter: adapt (just delegate to OcrdPage.xpath) --- src/ocrd/processor/builtin/dummy_processor.py | 3 - .../processor/builtin/filter_processor.py | 105 +----------------- 2 files changed, 4 insertions(+), 104 deletions(-) diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index 72a260968f..bf7e2940b8 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -13,9 +13,6 @@ make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, - parse_json_string_with_comments, - resource_string, - config ) from ocrd_modelfactory import page_from_file diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index a8beb09da3..10b5572c3f 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -2,112 +2,13 @@ from typing import Optional from lxml import etree -import elementpath import click from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_file import OcrdFileType -from ocrd_models.ocrd_page import OcrdPage, to_xml -from ocrd_models.constants import NAMESPACES -from ocrd_utils import ( - make_file_id, - MIME_TO_EXT, - MIMETYPE_PAGE, - xywh_from_points, - parse_json_string_with_comments, - resource_string, - config -) -from ocrd_modelfactory import page_from_file - -PARSER = elementpath.XPath2Parser(namespaces={**NAMESPACES, 'pc': NAMESPACES['page']}) - -def xpath(func, *, ns_uri: Optional[str] = None, ns_prefix: Optional[str] = ''): - name = func.__name__.replace('_', '-') - if ns_prefix and name.startswith(ns_prefix): - name = name[len(ns_prefix):] - if name.startswith('-'): - name = name[1:] - # register - PARSER.external_function(func, name=name, prefix=ns_prefix) - return func - -def pc_xpath(func): - return xpath(func, ns_uri=NAMESPACES['page'], ns_prefix='pc') - -@pc_xpath -def pc_pixelarea(nodes): - """ - Extract Coords/@points from all nodes, calculate the bounding - box, and accumulate areas. - """ - area = 0 - for node in nodes: - # FIXME: find out why we need to go to the parent here - node = node.parent.value - coords = node.find(f'{node.prefix}:Coords', node.nsmap) - if coords is None: - continue - points = coords.attrib['points'] - xywh = xywh_from_points(points) - area += xywh['w'] * xywh['h'] - return area - -@pc_xpath -def pc_textequiv(nodes): - """ - Extract TextEquiv/Unicode from all nodes, then concatenate - (interspersed with spaces or newlines). 
- """ - text = '' - for node in nodes: - # FIXME: find out why we need to go to the parent here - node = node.parent.value - if text and node.tag.endswith('Region'): - text += '\n' - if text and node.tag.endswith('Line'): - text += '\n' - if text and node.tag.endswith('Word'): - text += ' ' - equiv = node.find(f'{node.prefix}:TextEquiv', node.nsmap) - if equiv is None: - continue - string = equiv.find(f'{node.prefix}:Unicode', node.nsmap) - if string is None: - continue - text += str(string.text) - return text - -_SEGTYPES = [ - "NoiseRegion", - "LineDrawingRegion", - "AdvertRegion", - "ImageRegion", - "ChartRegion", - "MusicRegion", - "GraphicRegion", - "UnknownRegion", - "CustomRegion", - "SeparatorRegion", - "MathsRegion", - "TextRegion", - "MapRegion", - "ChemRegion", - "TableRegion", - "TextLine", - "Word", - "Glyph" -] +from ocrd_models import OcrdPage class FilterProcessor(Processor): - def setup(self): - token = PARSER.parse(self.parameter['select']) - def select(root): - context = elementpath.XPathContext(root) - return token.get_results(context) - self.selectxpath = select - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Remove PAGE segment hierarchy elements based on flexible selection criteria. @@ -130,7 +31,9 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - nodes = [node.attrib['id'] for node in self.selectxpath(pcgts.etree) if 'id' in node.attrib] + nodes = [node.attrib['id'] + for node in pcgts.xpath(self.parameter['select']) + if 'id' in node.attrib] # get PAGE objects from matching etree nodes # FIXME: this should be easier (OcrdPage should have id lookup mechanism) regions = pcgts.get_Page().get_AllRegions() From c47ae77cf57b2fbe4227cc3c9541de5a7d0f1031 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 23:50:45 +0200 Subject: [PATCH 010/191] OcrdPage: migrate to newest generateds, adapt user methods, re-generate --- requirements_test.txt | 2 +- src/ocrd_models/ocrd_page_generateds.py | 4251 +++++++++++------ src/ocrd_page_user_methods.py | 2 +- ...upType.py => _exportChildren_GroupType.py} | 11 +- 4 files changed, 2823 insertions(+), 1443 deletions(-) rename src/ocrd_page_user_methods/{exportChildren_GroupType.py => _exportChildren_GroupType.py} (65%) diff --git a/requirements_test.txt b/requirements_test.txt index a6a87918fc..585bb53954 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,7 +1,7 @@ autopep8 cryptography < 43.0.0 pytest >= 4.0.0 -generateDS == 2.35.20 +generateDS == 2.44.1 pytest-benchmark >= 3.2.3 pytest-timeout coverage >= 4.5.2 diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index f2b7c0551e..97d5a800b6 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # -# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. +# Generated Sun Sep 15 21:49:27 2024 by generateDS.py version 2.44.1. 
# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: @@ -24,21 +24,23 @@ # core # +import sys +try: + ModulenotfoundExp_ = ModuleNotFoundError +except NameError: + ModulenotfoundExp_ = ImportError from itertools import zip_longest import os -import sys import re as re_ import base64 import datetime as datetime_ import decimal as decimal_ -try: - from lxml import etree as etree_ -except ImportError: - from xml.etree import ElementTree as etree_ +from lxml import etree as etree_ Validate_simpletypes_ = True SaveElementTreeNode = True +TagNamePrefix = "" if sys.version_info.major == 2: BaseStrType_ = basestring else: @@ -97,7 +99,7 @@ def parsexmlstring_(instring, parser=None, **kwargs): # Additionally, the generatedsnamespaces module can contain a python # dictionary named GenerateDSNamespaceTypePrefixes that associates element # types with the namespace prefixes that are to be added to the -# "xsi:type" attribute value. See the exportAttributes method of +# "xsi:type" attribute value. See the _exportAttributes method of # any generated element type and the generation of "xsi:type" for an # example of the use of this table. # An example table: @@ -112,11 +114,11 @@ def parsexmlstring_(instring, parser=None, **kwargs): try: from generatedsnamespaces import GenerateDSNamespaceDefs as GenerateDSNamespaceDefs_ -except ImportError: +except ModulenotfoundExp_ : GenerateDSNamespaceDefs_ = {} try: from generatedsnamespaces import GenerateDSNamespaceTypePrefixes as GenerateDSNamespaceTypePrefixes_ -except ImportError: +except ModulenotfoundExp_ : GenerateDSNamespaceTypePrefixes_ = {} # @@ -127,7 +129,7 @@ def parsexmlstring_(instring, parser=None, **kwargs): # try: from generatedscollector import GdsCollector as GdsCollector_ -except ImportError: +except ModulenotfoundExp_ : class GdsCollector_(object): @@ -161,7 +163,7 @@ def write_messages(self, outstream): try: from enum import Enum -except ImportError: +except ModulenotfoundExp_ : Enum = object # @@ -174,7 +176,7 @@ def write_messages(self, outstream): class GeneratedsSuper(object): __hash__ = object.__hash__ - tzoff_pattern = re_.compile(r'(\+|-)((0\d|1[0-3]):[0-5]\d|14:00)$') + tzoff_pattern = re_.compile('(\\+|-)((0[0-9]|1[0-3]):[0-5][0-9]|14:00)$') class _FixedOffsetTZ(datetime_.tzinfo): def __init__(self, offset, name): self.__offset = datetime_.timedelta(minutes=offset) @@ -185,6 +187,33 @@ def tzname(self, dt): return self.__name def dst(self, dt): return None + def __str__(self): + settings = { + 'str_pretty_print': True, + 'str_indent_level': 0, + 'str_namespaceprefix': '', + 'str_name': self.__class__.__name__, + 'str_namespacedefs': '', + } + for n in settings: + if hasattr(self, n): + settings[n] = getattr(self, n) + if sys.version_info.major == 2: + from StringIO import StringIO + else: + from io import StringIO + output = StringIO() + self.export( + output, + settings['str_indent_level'], + pretty_print=settings['str_pretty_print'], + namespaceprefix_=settings['str_namespaceprefix'], + name_=settings['str_name'], + namespacedef_=settings['str_namespacedefs'] + ) + strval = output.getvalue() + output.close() + return strval def gds_format_string(self, input_data, input_name=''): return input_data def gds_parse_string(self, input_data, node=None, input_name=''): @@ -195,11 +224,11 @@ def gds_validate_string(self, input_data, node=None, input_name=''): else: return input_data def gds_format_base64(self, input_data, input_name=''): - return base64.b64encode(input_data) + return 
base64.b64encode(input_data).decode('ascii') def gds_validate_base64(self, input_data, node=None, input_name=''): return input_data def gds_format_integer(self, input_data, input_name=''): - return '%d' % input_data + return '%d' % int(input_data) def gds_parse_integer(self, input_data, node=None, input_name=''): try: ival = int(input_data) @@ -213,6 +242,8 @@ def gds_validate_integer(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires integer value') return value def gds_format_integer_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_integer_list( self, input_data, node=None, input_name=''): @@ -221,10 +252,14 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer valuess') + raise_parse_error(node, 'Requires sequence of integer values') return values def gds_format_float(self, input_data, input_name=''): - return ('%.15f' % input_data).rstrip('0') + value = ('%.15f' % float(input_data)).rstrip('0') + if value.endswith('.'): + value += '0' + return value + def gds_parse_float(self, input_data, node=None, input_name=''): try: fval_ = float(input_data) @@ -238,6 +273,8 @@ def gds_validate_float(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires float value') return value def gds_format_float_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_float_list( self, input_data, node=None, input_name=''): @@ -249,7 +286,12 @@ def gds_validate_float_list( raise_parse_error(node, 'Requires sequence of float values') return values def gds_format_decimal(self, input_data, input_name=''): - return ('%s' % input_data).rstrip('0') + return_value = '%s' % input_data + if '.' 
in return_value: + return_value = return_value.rstrip('0') + if return_value.endswith('.'): + return_value = return_value.rstrip('.') + return return_value def gds_parse_decimal(self, input_data, node=None, input_name=''): try: decimal_value = decimal_.Decimal(input_data) @@ -263,7 +305,9 @@ def gds_validate_decimal(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires decimal value') return value def gds_format_decimal_list(self, input_data, input_name=''): - return '%s' % ' '.join(input_data) + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] + return ' '.join([self.gds_format_decimal(item) for item in input_data]) def gds_validate_decimal_list( self, input_data, node=None, input_name=''): values = input_data.split() @@ -274,7 +318,7 @@ def gds_validate_decimal_list( raise_parse_error(node, 'Requires sequence of decimal values') return values def gds_format_double(self, input_data, input_name=''): - return '%e' % input_data + return '%s' % input_data def gds_parse_double(self, input_data, node=None, input_name=''): try: fval_ = float(input_data) @@ -288,6 +332,8 @@ def gds_validate_double(self, input_data, node=None, input_name=''): raise_parse_error(node, 'Requires double or float value') return value def gds_format_double_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_double_list( self, input_data, node=None, input_name=''): @@ -302,6 +348,7 @@ def gds_validate_double_list( def gds_format_boolean(self, input_data, input_name=''): return ('%s' % input_data).lower() def gds_parse_boolean(self, input_data, node=None, input_name=''): + input_data = input_data.strip() if input_data in ('true', '1'): bval = True elif input_data in ('false', '0'): @@ -317,11 +364,14 @@ def gds_validate_boolean(self, input_data, node=None, input_name=''): '(one of True, 1, False, 0)') return input_data def gds_format_boolean_list(self, input_data, input_name=''): + if len(input_data) > 0 and not isinstance(input_data[0], BaseStrType_): + input_data = [str(s) for s in input_data] return '%s' % ' '.join(input_data) def gds_validate_boolean_list( self, input_data, node=None, input_name=''): values = input_data.split() for value in values: + value = self.gds_parse_boolean(value, node, input_name) if value not in (True, 1, False, 0, ): raise_parse_error( node, @@ -478,6 +528,7 @@ def gds_validate_simple_patterns(self, patterns, target): # The target value must match at least one of the patterns # in order for the test to succeed. 
found1 = True + target = str(target) for patterns1 in patterns: found2 = False for patterns2 in patterns1: @@ -563,7 +614,7 @@ def get_path_(self, node): path_list.reverse() path = '/'.join(path_list) return path - Tag_strip_pattern_ = re_.compile(r'\{.*\}') + Tag_strip_pattern_ = re_.compile(r'{.*}') def get_path_list_(self, node, path_list): if node is None: return @@ -723,6 +774,7 @@ def quote_attrib(inStr): s1 = s1.replace('&', '&') s1 = s1.replace('<', '<') s1 = s1.replace('>', '>') + s1 = s1.replace('\n', ' ') if '"' in s1: if "'" in s1: s1 = '"%s"' % s1.replace('"', """) @@ -768,7 +820,10 @@ def find_attr_value_(attr_name, node): value = attrs.get(attr_name) elif len(attr_parts) == 2: prefix, name = attr_parts - namespace = node.nsmap.get(prefix) + if prefix == 'xml': + namespace = 'http://www.w3.org/XML/1998/namespace' + else: + namespace = node.nsmap.get(prefix) if namespace is not None: value = attrs.get('{%s}%s' % (namespace, name, )) return value @@ -849,7 +904,7 @@ def exportSimple(self, outfile, level, name): self.name, base64.b64encode(self.value), self.name)) - def to_etree(self, element, mapping_=None, nsmap_=None): + def to_etree(self, element, mapping_=None, reverse_mapping_=None, nsmap_=None): if self.category == MixedContainer.CategoryText: # Prevent exporting empty content as empty lines. if self.value.strip(): @@ -869,7 +924,7 @@ def to_etree(self, element, mapping_=None, nsmap_=None): subelement.text = self.to_etree_simple() else: # category == MixedContainer.CategoryComplex self.value.to_etree(element) - def to_etree_simple(self, mapping_=None, nsmap_=None): + def to_etree_simple(self, mapping_=None, reverse_mapping_=None, nsmap_=None): if self.content_type == MixedContainer.TypeString: text = self.value elif (self.content_type == MixedContainer.TypeInteger or @@ -942,11 +997,10 @@ def _cast(typ, value): return value return typ(value) + # -# Data representation classes. 
+# Start enum classes # - - class AlignSimpleType(str, Enum): LEFT='left' CENTRE='centre' @@ -1013,6 +1067,200 @@ class GroupTypeSimpleType(str, Enum): OTHER='other' +class LanguageSimpleType(str, Enum): + """LanguageSimpleType -- ISO 639.x 2016-07-14 + + """ + ABKHAZ='Abkhaz' + AFAR='Afar' + AFRIKAANS='Afrikaans' + AKAN='Akan' + ALBANIAN='Albanian' + AMHARIC='Amharic' + ARABIC='Arabic' + ARAGONESE='Aragonese' + ARMENIAN='Armenian' + ASSAMESE='Assamese' + AVARIC='Avaric' + AVESTAN='Avestan' + AYMARA='Aymara' + AZERBAIJANI='Azerbaijani' + BAMBARA='Bambara' + BASHKIR='Bashkir' + BASQUE='Basque' + BELARUSIAN='Belarusian' + BENGALI='Bengali' + BIHARI='Bihari' + BISLAMA='Bislama' + BOSNIAN='Bosnian' + BRETON='Breton' + BULGARIAN='Bulgarian' + BURMESE='Burmese' + CAMBODIAN='Cambodian' + CANTONESE='Cantonese' + CATALAN='Catalan' + CHAMORRO='Chamorro' + CHECHEN='Chechen' + CHICHEWA='Chichewa' + CHINESE='Chinese' + CHUVASH='Chuvash' + CORNISH='Cornish' + CORSICAN='Corsican' + CREE='Cree' + CROATIAN='Croatian' + CZECH='Czech' + DANISH='Danish' + DIVEHI='Divehi' + DUTCH='Dutch' + DZONGKHA='Dzongkha' + ENGLISH='English' + ESPERANTO='Esperanto' + ESTONIAN='Estonian' + EWE='Ewe' + FAROESE='Faroese' + FIJIAN='Fijian' + FINNISH='Finnish' + FRENCH='French' + FULA='Fula' + GAELIC='Gaelic' + GALICIAN='Galician' + GANDA='Ganda' + GEORGIAN='Georgian' + GERMAN='German' + GREEK='Greek' + GUARANÍ='Guaraní' + GUJARATI='Gujarati' + HAITIAN='Haitian' + HAUSA='Hausa' + HEBREW='Hebrew' + HERERO='Herero' + HINDI='Hindi' + HIRI_MOTU='Hiri Motu' + HUNGARIAN='Hungarian' + ICELANDIC='Icelandic' + IDO='Ido' + IGBO='Igbo' + INDONESIAN='Indonesian' + INTERLINGUA='Interlingua' + INTERLINGUE='Interlingue' + INUKTITUT='Inuktitut' + INUPIAQ='Inupiaq' + IRISH='Irish' + ITALIAN='Italian' + JAPANESE='Japanese' + JAVANESE='Javanese' + KALAALLISUT='Kalaallisut' + KANNADA='Kannada' + KANURI='Kanuri' + KASHMIRI='Kashmiri' + KAZAKH='Kazakh' + KHMER='Khmer' + KIKUYU='Kikuyu' + KINYARWANDA='Kinyarwanda' + KIRUNDI='Kirundi' + KOMI='Komi' + KONGO='Kongo' + KOREAN='Korean' + KURDISH='Kurdish' + KWANYAMA='Kwanyama' + KYRGYZ='Kyrgyz' + LAO='Lao' + LATIN='Latin' + LATVIAN='Latvian' + LIMBURGISH='Limburgish' + LINGALA='Lingala' + LITHUANIAN='Lithuanian' + LUBA_KATANGA='Luba-Katanga' + LUXEMBOURGISH='Luxembourgish' + MACEDONIAN='Macedonian' + MALAGASY='Malagasy' + MALAY='Malay' + MALAYALAM='Malayalam' + MALTESE='Maltese' + MANX='Manx' + MĀORI='Māori' + MARATHI='Marathi' + MARSHALLESE='Marshallese' + MONGOLIAN='Mongolian' + NAURU='Nauru' + NAVAJO='Navajo' + NDONGA='Ndonga' + NEPALI='Nepali' + NORTH_NDEBELE='North Ndebele' + NORTHERN_SAMI='Northern Sami' + NORWEGIAN='Norwegian' + NORWEGIAN_BOKMÅL='Norwegian Bokmål' + NORWEGIAN_NYNORSK='Norwegian Nynorsk' + NUOSU='Nuosu' + OCCITAN='Occitan' + OJIBWE='Ojibwe' + OLD_CHURCH_SLAVONIC='Old Church Slavonic' + ORIYA='Oriya' + OROMO='Oromo' + OSSETIAN='Ossetian' + PĀLI='Pāli' + PANJABI='Panjabi' + PASHTO='Pashto' + PERSIAN='Persian' + POLISH='Polish' + PORTUGUESE='Portuguese' + PUNJABI='Punjabi' + QUECHUA='Quechua' + ROMANIAN='Romanian' + ROMANSH='Romansh' + RUSSIAN='Russian' + SAMOAN='Samoan' + SANGO='Sango' + SANSKRIT='Sanskrit' + SARDINIAN='Sardinian' + SERBIAN='Serbian' + SHONA='Shona' + SINDHI='Sindhi' + SINHALA='Sinhala' + SLOVAK='Slovak' + SLOVENE='Slovene' + SOMALI='Somali' + SOUTH_NDEBELE='South Ndebele' + SOUTHERN_SOTHO='Southern Sotho' + SPANISH='Spanish' + SUNDANESE='Sundanese' + SWAHILI='Swahili' + SWATI='Swati' + SWEDISH='Swedish' + TAGALOG='Tagalog' + TAHITIAN='Tahitian' + TAJIK='Tajik' + 
TAMIL='Tamil' + TATAR='Tatar' + TELUGU='Telugu' + THAI='Thai' + TIBETAN='Tibetan' + TIGRINYA='Tigrinya' + TONGA='Tonga' + TSONGA='Tsonga' + TSWANA='Tswana' + TURKISH='Turkish' + TURKMEN='Turkmen' + TWI='Twi' + UIGHUR='Uighur' + UKRAINIAN='Ukrainian' + URDU='Urdu' + UZBEK='Uzbek' + VENDA='Venda' + VIETNAMESE='Vietnamese' + VOLAPÜK='Volapük' + WALLOON='Walloon' + WELSH='Welsh' + WESTERN_FRISIAN='Western Frisian' + WOLOF='Wolof' + XHOSA='Xhosa' + YIDDISH='Yiddish' + YORUBA='Yoruba' + ZHUANG='Zhuang' + ZULU='Zulu' + OTHER='other' + + class PageTypeSimpleType(str, Enum): FRONTCOVER='front-cover' BACKCOVER='back-cover' @@ -1025,7 +1273,9 @@ class PageTypeSimpleType(str, Enum): class ProductionSimpleType(str, Enum): - """Text production type""" + """ProductionSimpleType -- Text production type + + """ PRINTED='printed' TYPEWRITTEN='typewritten' HANDWRITTENCURSIVE='handwritten-cursive' @@ -1041,6 +1291,193 @@ class ReadingDirectionSimpleType(str, Enum): BOTTOMTOTOP='bottom-to-top' +class ScriptSimpleType(str, Enum): + """ScriptSimpleType -- iso15924 2016-07-14 + + """ + ADLM_ADLAM='Adlm - Adlam' + AFAK_AFAKA='Afak - Afaka' + AGHB_CAUCASIAN_ALBANIAN='Aghb - Caucasian Albanian' + AHOM_AHOM_TAI_AHOM='Ahom - Ahom, Tai Ahom' + ARAB_ARABIC='Arab - Arabic' + ARAN_ARABIC_NASTALIQVARIANT='Aran - Arabic (Nastaliq variant)' + ARMI_IMPERIAL_ARAMAIC='Armi - Imperial Aramaic' + ARMN_ARMENIAN='Armn - Armenian' + AVST_AVESTAN='Avst - Avestan' + BALI_BALINESE='Bali - Balinese' + BAMU_BAMUM='Bamu - Bamum' + BASS_BASSA_VAH='Bass - Bassa Vah' + BATK_BATAK='Batk - Batak' + BENG_BENGALI='Beng - Bengali' + BHKS_BHAIKSUKI='Bhks - Bhaiksuki' + BLIS_BLISSYMBOLS='Blis - Blissymbols' + BOPO_BOPOMOFO='Bopo - Bopomofo' + BRAH_BRAHMI='Brah - Brahmi' + BRAI_BRAILLE='Brai - Braille' + BUGI_BUGINESE='Bugi - Buginese' + BUHD_BUHID='Buhd - Buhid' + CAKM_CHAKMA='Cakm - Chakma' + CANS_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS='Cans - Unified Canadian Aboriginal Syllabics' + CARI_CARIAN='Cari - Carian' + CHAM_CHAM='Cham - Cham' + CHER_CHEROKEE='Cher - Cherokee' + CIRT_CIRTH='Cirt - Cirth' + COPT_COPTIC='Copt - Coptic' + CPRT_CYPRIOT='Cprt - Cypriot' + CYRL_CYRILLIC='Cyrl - Cyrillic' + CYRS_CYRILLIC_OLD_CHURCH_SLAVONICVARIANT='Cyrs - Cyrillic (Old Church Slavonic variant)' + DEVA_DEVANAGARI_NAGARI='Deva - Devanagari (Nagari)' + DSRT_DESERET_MORMON='Dsrt - Deseret (Mormon)' + DUPL_DUPLOYANSHORTHAND_DUPLOYANSTENOGRAPHY='Dupl - Duployan shorthand, Duployan stenography' + EGYD_EGYPTIANDEMOTIC='Egyd - Egyptian demotic' + EGYH_EGYPTIANHIERATIC='Egyh - Egyptian hieratic' + EGYP_EGYPTIANHIEROGLYPHS='Egyp - Egyptian hieroglyphs' + ELBA_ELBASAN='Elba - Elbasan' + ETHI_ETHIOPIC='Ethi - Ethiopic' + GEOK_KHUTSURI_ASOMTAVRULIAND_NUSKHURI='Geok - Khutsuri (Asomtavruli and Nuskhuri)' + GEOR_GEORGIAN_MKHEDRULI='Geor - Georgian (Mkhedruli)' + GLAG_GLAGOLITIC='Glag - Glagolitic' + GOTH_GOTHIC='Goth - Gothic' + GRAN_GRANTHA='Gran - Grantha' + GREK_GREEK='Grek - Greek' + GUJR_GUJARATI='Gujr - Gujarati' + GURU_GURMUKHI='Guru - Gurmukhi' + HANB_HANWITH_BOPOMOFO='Hanb - Han with Bopomofo' + HANG_HANGUL='Hang - Hangul' + HANI_HAN_HANZI_KANJI_HANJA='Hani - Han (Hanzi, Kanji, Hanja)' + HANO_HANUNOO_HANUNÓO='Hano - Hanunoo (Hanunóo)' + HANS_HAN_SIMPLIFIEDVARIANT='Hans - Han (Simplified variant)' + HANT_HAN_TRADITIONALVARIANT='Hant - Han (Traditional variant)' + HATR_HATRAN='Hatr - Hatran' + HEBR_HEBREW='Hebr - Hebrew' + HIRA_HIRAGANA='Hira - Hiragana' + HLUW_ANATOLIAN_HIEROGLYPHS='Hluw - Anatolian Hieroglyphs' + HMNG_PAHAWH_HMONG='Hmng - Pahawh Hmong' + 
HRKT_JAPANESESYLLABARIES='Hrkt - Japanese syllabaries' + HUNG_OLD_HUNGARIAN_HUNGARIAN_RUNIC='Hung - Old Hungarian (Hungarian Runic)' + INDS_INDUS_HARAPPAN='Inds - Indus (Harappan)' + ITAL_OLD_ITALIC_ETRUSCAN_OSCANETC='Ital - Old Italic (Etruscan, Oscan etc.)' + JAMO_JAMO='Jamo - Jamo' + JAVA_JAVANESE='Java - Javanese' + JPAN_JAPANESE='Jpan - Japanese' + JURC_JURCHEN='Jurc - Jurchen' + KALI_KAYAH_LI='Kali - Kayah Li' + KANA_KATAKANA='Kana - Katakana' + KHAR_KHAROSHTHI='Khar - Kharoshthi' + KHMR_KHMER='Khmr - Khmer' + KHOJ_KHOJKI='Khoj - Khojki' + KITL_KHITANLARGESCRIPT='Kitl - Khitan large script' + KITS_KHITANSMALLSCRIPT='Kits - Khitan small script' + KNDA_KANNADA='Knda - Kannada' + KORE_KOREANALIASFOR_HANGUL_HAN='Kore - Korean (alias for Hangul + Han)' + KPEL_KPELLE='Kpel - Kpelle' + KTHI_KAITHI='Kthi - Kaithi' + LANA_TAI_THAM_LANNA='Lana - Tai Tham (Lanna)' + LAOO_LAO='Laoo - Lao' + LATF_LATIN_FRAKTURVARIANT='Latf - Latin (Fraktur variant)' + LATG_LATIN_GAELICVARIANT='Latg - Latin (Gaelic variant)' + LATN_LATIN='Latn - Latin' + LEKE_LEKE='Leke - Leke' + LEPC_LEPCHARÓNG='Lepc - Lepcha (Róng)' + LIMB_LIMBU='Limb - Limbu' + LINA_LINEARA='Lina - Linear A' + LINB_LINEARB='Linb - Linear B' + LISU_LISU_FRASER='Lisu - Lisu (Fraser)' + LOMA_LOMA='Loma - Loma' + LYCI_LYCIAN='Lyci - Lycian' + LYDI_LYDIAN='Lydi - Lydian' + MAHJ_MAHAJANI='Mahj - Mahajani' + MAND_MANDAIC_MANDAEAN='Mand - Mandaic, Mandaean' + MANI_MANICHAEAN='Mani - Manichaean' + MARC_MARCHEN='Marc - Marchen' + MAYA_MAYANHIEROGLYPHS='Maya - Mayan hieroglyphs' + MEND_MENDE_KIKAKUI='Mend - Mende Kikakui' + MERC_MEROITIC_CURSIVE='Merc - Meroitic Cursive' + MERO_MEROITIC_HIEROGLYPHS='Mero - Meroitic Hieroglyphs' + MLYM_MALAYALAM='Mlym - Malayalam' + MODI_MODI_MOḌĪ='Modi - Modi, Moḍī' + MONG_MONGOLIAN='Mong - Mongolian' + MOON_MOON_MOONCODE_MOONSCRIPT_MOONTYPE='Moon - Moon (Moon code, Moon script, Moon type)' + MROO_MRO_MRU='Mroo - Mro, Mru' + MTEI_MEITEI_MAYEK_MEITHEI_MEETEI='Mtei - Meitei Mayek (Meithei, Meetei)' + MULT_MULTANI='Mult - Multani' + MYMR_MYANMAR_BURMESE='Mymr - Myanmar (Burmese)' + NARB_OLD_NORTH_ARABIAN_ANCIENT_NORTH_ARABIAN='Narb - Old North Arabian (Ancient North Arabian)' + NBAT_NABATAEAN='Nbat - Nabataean' + NEWA_NEWA_NEWAR_NEWARI='Newa - Newa, Newar, Newari' + NKGB_NAKHI_GEBA='Nkgb - Nakhi Geba' + NKOON_KO='Nkoo - N’Ko' + NSHUNÜSHU='Nshu - Nüshu' + OGAM_OGHAM='Ogam - Ogham' + OLCK_OL_CHIKI_OL_CEMET_OL_SANTALI='Olck - Ol Chiki (Ol Cemet’, Ol, Santali)' + ORKH_OLD_TURKIC_ORKHON_RUNIC='Orkh - Old Turkic, Orkhon Runic' + ORYA_ORIYA='Orya - Oriya' + OSGE_OSAGE='Osge - Osage' + OSMA_OSMANYA='Osma - Osmanya' + PALM_PALMYRENE='Palm - Palmyrene' + PAUC_PAU_CIN_HAU='Pauc - Pau Cin Hau' + PERM_OLD_PERMIC='Perm - Old Permic' + PHAG_PHAGSPA='Phag - Phags-pa' + PHLI_INSCRIPTIONAL_PAHLAVI='Phli - Inscriptional Pahlavi' + PHLP_PSALTER_PAHLAVI='Phlp - Psalter Pahlavi' + PHLV_BOOK_PAHLAVI='Phlv - Book Pahlavi' + PHNX_PHOENICIAN='Phnx - Phoenician' + PIQD_KLINGONKLIP_IQA_D='Piqd - Klingon (KLI pIqaD)' + PLRD_MIAO_POLLARD='Plrd - Miao (Pollard)' + PRTI_INSCRIPTIONAL_PARTHIAN='Prti - Inscriptional Parthian' + RJNG_REJANG_REDJANG_KAGANGA='Rjng - Rejang (Redjang, Kaganga)' + RORO_RONGORONGO='Roro - Rongorongo' + RUNR_RUNIC='Runr - Runic' + SAMR_SAMARITAN='Samr - Samaritan' + SARA_SARATI='Sara - Sarati' + SARB_OLD_SOUTH_ARABIAN='Sarb - Old South Arabian' + SAUR_SAURASHTRA='Saur - Saurashtra' + SGNW_SIGN_WRITING='Sgnw - SignWriting' + SHAW_SHAVIAN_SHAW='Shaw - Shavian (Shaw)' + SHRD_SHARADAŚĀRADĀ='Shrd - Sharada, Śāradā' + SIDD_SIDDHAM='Sidd - 
Siddham' + SIND_KHUDAWADI_SINDHI='Sind - Khudawadi, Sindhi' + SINH_SINHALA='Sinh - Sinhala' + SORA_SORA_SOMPENG='Sora - Sora Sompeng' + SUND_SUNDANESE='Sund - Sundanese' + SYLO_SYLOTI_NAGRI='Sylo - Syloti Nagri' + SYRC_SYRIAC='Syrc - Syriac' + SYRE_SYRIAC_ESTRANGELOVARIANT='Syre - Syriac (Estrangelo variant)' + SYRJ_SYRIAC_WESTERNVARIANT='Syrj - Syriac (Western variant)' + SYRN_SYRIAC_EASTERNVARIANT='Syrn - Syriac (Eastern variant)' + TAGB_TAGBANWA='Tagb - Tagbanwa' + TAKR_TAKRI='Takr - Takri' + TALE_TAI_LE='Tale - Tai Le' + TALU_NEW_TAI_LUE='Talu - New Tai Lue' + TAML_TAMIL='Taml - Tamil' + TANG_TANGUT='Tang - Tangut' + TAVT_TAI_VIET='Tavt - Tai Viet' + TELU_TELUGU='Telu - Telugu' + TENG_TENGWAR='Teng - Tengwar' + TFNG_TIFINAGH_BERBER='Tfng - Tifinagh (Berber)' + TGLG_TAGALOG_BAYBAYIN_ALIBATA='Tglg - Tagalog (Baybayin, Alibata)' + THAA_THAANA='Thaa - Thaana' + THAI_THAI='Thai - Thai' + TIBT_TIBETAN='Tibt - Tibetan' + TIRH_TIRHUTA='Tirh - Tirhuta' + UGAR_UGARITIC='Ugar - Ugaritic' + VAII_VAI='Vaii - Vai' + VISP_VISIBLE_SPEECH='Visp - Visible Speech' + WARA_WARANG_CITI_VARANG_KSHITI='Wara - Warang Citi (Varang Kshiti)' + WOLE_WOLEAI='Wole - Woleai' + XPEO_OLD_PERSIAN='Xpeo - Old Persian' + XSUX_CUNEIFORM_SUMERO_AKKADIAN='Xsux - Cuneiform, Sumero-Akkadian' + YIII_YI='Yiii - Yi' + ZINH_CODEFORINHERITEDSCRIPT='Zinh - Code for inherited script' + ZMTH_MATHEMATICALNOTATION='Zmth - Mathematical notation' + ZSYE_SYMBOLS_EMOJIVARIANT='Zsye - Symbols (Emoji variant)' + ZSYM_SYMBOLS='Zsym - Symbols' + ZXXX_CODEFORUNWRITTENDOCUMENTS='Zxxx - Code for unwritten documents' + ZYYY_CODEFORUNDETERMINEDSCRIPT='Zyyy - Code for undetermined script' + ZZZZ_CODEFORUNCODEDSCRIPT='Zzzz - Code for uncoded script' + OTHER='other' + + class TextDataTypeSimpleType(str, Enum): XSDDECIMAL='xsd:decimal' # Examples: "123.456", "+1234.456", "-1234.456", "-.456", "-456" XSDFLOAT='xsd:float' # Examples: "123.456", "+1234.456", "-1.2344e56", "-.45E-6", "INF", "-INF", "NaN" @@ -1087,10 +1524,58 @@ class UnderlineStyleSimpleType(str, Enum): OTHER='other' +class charTypeType(str, Enum): + """charTypeType -- + Type of character represented by the + grapheme, group, or non-printing character element. + + """ + BASE='base' + COMBINING='combining' + + +class imageResolutionUnitType(str, Enum): + """imageResolutionUnitType -- + Specifies the unit of the resolution information + referring to a standardised unit of measurement + (pixels per inch, pixels per centimeter or other). + + """ + PPI='PPI' + PPCM='PPCM' + OTHER='other' + + +class typeType(str, Enum): + """typeType -- + Type of metadata (e.g. 
author) + + """ + AUTHOR='author' + IMAGE_PROPERTIES='imageProperties' + PROCESSING_STEP='processingStep' + OTHER='other' + + +class typeType1(str, Enum): + LINK='link' + JOIN='join' + + +class typeType3(str, Enum): + XSDSTRING='xsd:string' + XSDINTEGER='xsd:integer' + XSDBOOLEAN='xsd:boolean' + XSDFLOAT='xsd:float' + + +# +# Start data representation classes +# class PcGtsType(GeneratedsSuper): __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('pcGtsId', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('pcGtsId', 'string', 0, 1, {'use': 'optional', 'name': 'pcGtsId'}), MemberSpec_('Metadata', 'MetadataType', 0, 0, {'name': 'Metadata', 'type': 'MetadataType'}, None), MemberSpec_('Page', 'PageType', 0, 0, {'name': 'Page', 'type': 'PageType'}, None), ] @@ -1101,7 +1586,7 @@ def __init__(self, pcGtsId=None, Metadata=None, Page=None, gds_collector_=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.pcGtsId = _cast(None, pcGtsId) self.pcGtsId_nsprefix_ = "pc" self.Metadata = Metadata @@ -1135,7 +1620,7 @@ def get_pcGtsId(self): return self.pcGtsId def set_pcGtsId(self, pcGtsId): self.pcGtsId = pcGtsId - def hasContent_(self): + def has__content(self): if ( self.Metadata is not None or self.Page is not None @@ -1158,19 +1643,19 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PcGtsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PcGtsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PcGtsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PcGtsType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PcGtsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PcGtsType'): if self.pcGtsId is not None and 'pcGtsId' not in already_processed: already_processed.add('pcGtsId') outfile.write(' pcGtsId=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.pcGtsId), input_name='pcGtsId')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PcGtsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PcGtsType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1181,7 +1666,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Page is not None: namespaceprefix_ = self.Page_nsprefix_ + ':' if (UseCapturedNS_ and self.Page_nsprefix_) else '' self.Page.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Page', pretty_print=pretty_print) - def 
to_etree(self, parent_element=None, name_='PcGtsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='PcGtsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1190,12 +1675,14 @@ def to_etree(self, parent_element=None, name_='PcGtsType', mapping_=None, nsmap_ element.set('pcGtsId', self.gds_format_string(self.pcGtsId)) if self.Metadata is not None: Metadata_ = self.Metadata - Metadata_.to_etree(element, name_='Metadata', mapping_=mapping_, nsmap_=nsmap_) + Metadata_.to_etree(element, name_='Metadata', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Page is not None: Page_ = self.Page - Page_.to_etree(element, name_='Page', mapping_=mapping_, nsmap_=nsmap_) + Page_.to_etree(element, name_='Page', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1203,17 +1690,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('pcGtsId', node) if value is not None and 'pcGtsId' not in already_processed: already_processed.add('pcGtsId') self.pcGtsId = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Metadata': obj_ = MetadataType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -1303,10 +1790,20 @@ def prune_ReadingOrder(self): class MetadataType(GeneratedsSuper): - """External reference of any kind""" + """externalRef -- External reference of any kind + Created -- + The timestamp has to be in UTC (Coordinated + Universal Time) and not local time. + + * LastChange -- + The timestamp has to be in UTC + (Coordinated Universal Time) + and not local time. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('externalRef', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('externalRef', 'string', 0, 1, {'use': 'optional', 'name': 'externalRef'}), MemberSpec_('Creator', 'string', 0, 0, {'name': 'Creator', 'type': 'string'}, None), MemberSpec_('Created', 'dateTime', 0, 0, {'name': 'Created', 'type': 'dateTime'}, None), MemberSpec_('LastChange', 'dateTime', 0, 0, {'name': 'LastChange', 'type': 'dateTime'}, None), @@ -1396,7 +1893,7 @@ def get_externalRef(self): return self.externalRef def set_externalRef(self, externalRef): self.externalRef = externalRef - def hasContent_(self): + def has__content(self): if ( self.Creator is not None or self.Created is not None or @@ -1423,19 +1920,19 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataType'): if self.externalRef is not None and 'externalRef' not in already_processed: already_processed.add('externalRef') outfile.write(' externalRef=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.externalRef), input_name='externalRef')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='MetadataType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='MetadataType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1462,7 +1959,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for MetadataItem_ in self.MetadataItem: namespaceprefix_ = self.MetadataItem_nsprefix_ + ':' if (UseCapturedNS_ and self.MetadataItem_nsprefix_) else '' MetadataItem_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='MetadataItem', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MetadataType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='MetadataType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1483,11 +1980,13 @@ def to_etree(self, parent_element=None, name_='MetadataType', 
mapping_=None, nsm etree_.SubElement(element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}Comments').text = self.gds_format_string(Comments_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MetadataItem_ in self.MetadataItem: - MetadataItem_.to_etree(element, name_='MetadataItem', mapping_=mapping_, nsmap_=nsmap_) + MetadataItem_.to_etree(element, name_='MetadataItem', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1495,17 +1994,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('externalRef', node) if value is not None and 'externalRef' not in already_processed: already_processed.add('externalRef') self.externalRef = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Creator': value_ = child_.text value_ = self.gds_parse_string(value_, node, 'Creator') @@ -1544,15 +2043,22 @@ def __hash__(self): class MetadataItemType(GeneratedsSuper): - """Type of metadata (e.g. author) - E.g. imagePhotometricInterpretation - E.g. RGB""" + """type -- + Type of metadata (e.g. author) + + * name -- + E.g. imagePhotometricInterpretation + + * value -- E.g. 
RGB + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('name', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('value', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('date', 'dateTime', 0, 1, {'use': 'optional'}), + MemberSpec_('type_', 'typeType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('name', 'string', 0, 1, {'use': 'optional', 'name': 'name'}), + MemberSpec_('value', 'string', 0, 0, {'use': 'required', 'name': 'value'}), + MemberSpec_('date', 'dateTime', 0, 1, {'use': 'optional', 'name': 'date'}), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), ] subclass = None @@ -1562,7 +2068,7 @@ def __init__(self, type_=None, name=None, value=None, date=None, Labels=None, gd self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.type_ = _cast(None, type_) self.type__nsprefix_ = "pc" self.name = _cast(None, name) @@ -1620,7 +2126,20 @@ def get_date(self): return self.date def set_date(self, date): self.date = date - def hasContent_(self): + def validate_typeType(self, value): + # Validate type typeType, a restriction on string. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['author', 'imageProperties', 'processingStep', 'other'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on typeType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( self.Labels ): @@ -1642,15 +2161,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataItemType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MetadataItemType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataItemType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MetadataItemType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataItemType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MetadataItemType'): if self.type_ is not None and 'type_' not in already_processed: already_processed.add('type_') outfile.write(' type=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.type_), input_name='type')), )) @@ -1663,7 +2182,7 @@ def exportAttributes(self, 
outfile, level, already_processed, namespaceprefix_=' if self.date is not None and 'date' not in already_processed: already_processed.add('date') outfile.write(' date="%s"' % self.gds_format_datetime(self.date, input_name='date')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MetadataItemType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MetadataItemType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1671,7 +2190,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MetadataItemType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='MetadataItemType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1685,9 +2204,11 @@ def to_etree(self, parent_element=None, name_='MetadataItemType', mapping_=None, if self.date is not None: element.set('date', self.gds_format_datetime(self.date)) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1695,16 +2216,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('type', node) if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value + self.validate_typeType(self.type_) # validate type typeType value = find_attr_value_('name', node) if value is not None and 'name' not in already_processed: already_processed.add('name') @@ -1720,7 +2242,7 @@ def buildAttributes(self, node, attrs, already_processed): self.date = self.gds_parse_datetime(value) except ValueError as exp: raise ValueError('Bad date-time attribute (date): %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Labels': obj_ = LabelsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ 
-1732,16 +2254,26 @@ def __hash__(self): class LabelsType(GeneratedsSuper): - """Reference to external model / ontology / schema - E.g. an RDF resource identifier - (to be used as subject or object of an RDF triple) - Prefix for all labels (e.g. first part of an URI)""" + """externalModel -- + Reference to external model / ontology / schema + + * externalId -- + E.g. an RDF resource identifier + (to be used as subject or object of an RDF triple) + + * prefix -- + Prefix for all labels (e.g. first part of an URI) + + * Label -- + A semantic label / tag + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('externalModel', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('externalId', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('prefix', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('externalModel', 'string', 0, 1, {'use': 'optional', 'name': 'externalModel'}), + MemberSpec_('externalId', 'string', 0, 1, {'use': 'optional', 'name': 'externalId'}), + MemberSpec_('prefix', 'string', 0, 1, {'use': 'optional', 'name': 'prefix'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('Label', 'LabelType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Label', 'type': 'LabelType'}, None), ] subclass = None @@ -1751,7 +2283,7 @@ def __init__(self, externalModel=None, externalId=None, prefix=None, comments=No self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.externalModel = _cast(None, externalModel) self.externalModel_nsprefix_ = "pc" self.externalId = _cast(None, externalId) @@ -1806,7 +2338,7 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments - def hasContent_(self): + def has__content(self): if ( self.Label ): @@ -1828,15 +2360,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelsType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelsType'): if self.externalModel is not None and 'externalModel' not in already_processed: already_processed.add('externalModel') outfile.write(' externalModel=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.externalModel), input_name='externalModel')), )) @@ -1849,7 +2381,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: 
already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelsType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -1857,7 +2389,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Label_ in self.Label: namespaceprefix_ = self.Label_nsprefix_ + ':' if (UseCapturedNS_ and self.Label_nsprefix_) else '' Label_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Label', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='LabelsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LabelsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -1871,9 +2403,11 @@ def to_etree(self, parent_element=None, name_='LabelsType', mapping_=None, nsmap if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for Label_ in self.Label: - Label_.to_etree(element, name_='Label', mapping_=mapping_, nsmap_=nsmap_) + Label_.to_etree(element, name_='Label', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -1881,12 +2415,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('externalModel', node) if value is not None and 'externalModel' not in already_processed: already_processed.add('externalModel') @@ -1903,7 +2437,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Label': obj_ = LabelType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -1915,18 +2449,23 @@ def __hash__(self): class LabelType(GeneratedsSuper): - """Semantic label + """LabelType -- Semantic label + value -- The label / tag (e.g. 'person'). Can be an RDF resource identifier (e.g. object of an RDF triple). - Additional information on the label - (e.g. 
'YYYY-mm-dd' for a date label). - Can be used as predicate of an RDF triple.""" + + * type -- + Additional information on the label + (e.g. 'YYYY-mm-dd' for a date label). + Can be used as predicate of an RDF triple. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('value', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('value', 'string', 0, 0, {'use': 'required', 'name': 'value'}), + MemberSpec_('type_', 'string', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), ] subclass = None superclass = None @@ -1935,7 +2474,7 @@ def __init__(self, value=None, type_=None, comments=None, gds_collector_=None, * self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.value = _cast(None, value) self.value_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -1969,7 +2508,7 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments - def hasContent_(self): + def has__content(self): if ( ): @@ -1991,14 +2530,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LabelType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LabelType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LabelType'): if self.value is not None and 'value' not in already_processed: already_processed.add('value') outfile.write(' value=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.value), input_name='value')), )) @@ -2008,9 +2547,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LabelType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='LabelType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LabelType', mapping_=None, 
reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -2023,6 +2562,8 @@ def to_etree(self, parent_element=None, name_='LabelType', mapping_=None, nsmap_ element.set('comments', self.gds_format_string(self.comments)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -2030,12 +2571,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('value', node) if value is not None and 'value' not in already_processed: already_processed.add('value') @@ -2048,7 +2589,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -2056,57 +2597,102 @@ def __hash__(self): class PageType(GeneratedsSuper): - """Contains the image file name including the file extension. - Specifies the width of the image.Specifies the height of the - image.Specifies the image resolution in width.Specifies the image - resolution in height. - Specifies the unit of the resolution information - referring to a standardised unit of measurement - (pixels per inch, pixels per centimeter or other). - For generic use - The angle the rectangle encapsulating the page - (or its Border) has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - (The rotated image can be further referenced - via “AlternativeImage”.) - Range: -179.999,180 - The type of the page within the document - (e.g. cover page). - The primary language used in the page - (lower-level definitions override the page-level definition). - The secondary language used in the page - (lower-level definitions override the page-level definition). - The primary script used in the page - (lower-level definitions override the page-level definition). - The secondary script used in the page - (lower-level definitions override the page-level definition). - The direction in which text within lines - should be read (order of words and characters), - in addition to “textLineOrder” - (lower-level definitions override the page-level definition). - The order of text lines within a block, - in addition to “readingDirection” - (lower-level definitions override the page-level definition). - Confidence value for whole page (between 0 and 1)""" + """imageFilename -- + Contains the image file name including the file extension. + + * imageWidth -- Specifies the width of the image. 
+ * imageHeight -- Specifies the height of the image. + * imageXResolution -- Specifies the image resolution in width. + * imageYResolution -- Specifies the image resolution in height. + * imageResolutionUnit -- + Specifies the unit of the resolution information + referring to a standardised unit of measurement + (pixels per inch, pixels per centimeter or other). + + * custom -- For generic use + * orientation -- + The angle the rectangle encapsulating the page + (or its Border) has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + (The rotated image can be further referenced + via + “ + AlternativeImage + ” + .) + Range: -179.999,180 + + * type -- + The type of the page within the document + (e.g. cover page). + + * primaryLanguage -- + The primary language used in the page + (lower-level definitions override the page-level definition). + + * secondaryLanguage -- + The secondary language used in the page + (lower-level definitions override the page-level definition). + + * primaryScript -- + The primary script used in the page + (lower-level definitions override the page-level definition). + + * secondaryScript -- + The secondary script used in the page + (lower-level definitions override the page-level definition). + + * readingDirection -- + The direction in which text within lines + should be read (order of words and characters), + in addition to + “ + textLineOrder + ” + (lower-level definitions override the page-level definition). + + * textLineOrder -- + The order of text lines within a block, + in addition to + “ + readingDirection + ” + (lower-level definitions override the page-level definition). + + * conf -- Confidence value for whole page (between 0 and 1) + * AlternativeImage -- + Alternative document page images + (e.g. black-and-white). + + * ReadingOrder -- Order of blocks within the page. + * Layers -- + Unassigned regions are considered to be in the + (virtual) default layer which is to be treated + as below any other layers. 
+ + * TextStyle -- Default text style + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('imageFilename', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('imageWidth', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('imageHeight', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('imageXResolution', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('imageYResolution', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('imageResolutionUnit', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:PageTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('imageFilename', 'string', 0, 0, {'use': 'required', 'name': 'imageFilename'}), + MemberSpec_('imageWidth', 'int', 0, 0, {'use': 'required', 'name': 'imageWidth'}), + MemberSpec_('imageHeight', 'int', 0, 0, {'use': 'required', 'name': 'imageHeight'}), + MemberSpec_('imageXResolution', 'float', 0, 1, {'use': 'optional', 'name': 'imageXResolution'}), + MemberSpec_('imageYResolution', 'float', 0, 1, {'use': 'optional', 'name': 'imageYResolution'}), + MemberSpec_('imageResolutionUnit', 'imageResolutionUnitType', 0, 1, {'use': 'optional', 'name': 'imageResolutionUnit'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:PageTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryLanguage'}), + MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryLanguage'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional', 'name': 'textLineOrder'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Border', 'BorderType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'Border', 'type': 'BorderType'}, None), MemberSpec_('PrintSpace', 'PrintSpaceType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'PrintSpace', 'type': 'PrintSpaceType'}, None), @@ -2139,7 +2725,7 @@ def __init__(self, imageFilename=None, imageWidth=None, imageHeight=None, imageX self.gds_elementtree_node_ = 
None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.imageFilename = _cast(None, imageFilename) self.imageFilename_nsprefix_ = "pc" self.imageWidth = _cast(int, imageWidth) @@ -2548,6 +3134,19 @@ def get_conf(self): return self.conf def set_conf(self, conf): self.conf = conf + def validate_imageResolutionUnitType(self, value): + # Validate type imageResolutionUnitType, a restriction on string. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['PPI', 'PPCM', 'other'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on imageResolutionUnitType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False def validate_PageTypeSimpleType(self, value): # Validate type pc:PageTypeSimpleType, a restriction on string. if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: @@ -2628,7 +3227,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Border is not None or @@ -2673,15 +3272,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PageType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PageType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PageType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PageType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PageType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PageType'): if self.imageFilename is not None and 'imageFilename' not in already_processed: already_processed.add('imageFilename') outfile.write(' imageFilename=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.imageFilename), input_name='imageFilename')), )) @@ -2730,7 +3329,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', 
name_='PageType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PageType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -2807,7 +3406,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for CustomRegion_ in self.CustomRegion: namespaceprefix_ = self.CustomRegion_nsprefix_ + ':' if (UseCapturedNS_ and self.CustomRegion_nsprefix_) else '' CustomRegion_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='CustomRegion', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='PageType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='PageType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -2845,62 +3444,64 @@ def to_etree(self, parent_element=None, name_='PageType', mapping_=None, nsmap_= if self.conf is not None: element.set('conf', self.gds_format_float(self.conf)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Border is not None: Border_ = self.Border - Border_.to_etree(element, name_='Border', mapping_=mapping_, nsmap_=nsmap_) + Border_.to_etree(element, name_='Border', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.PrintSpace is not None: PrintSpace_ = self.PrintSpace - PrintSpace_.to_etree(element, name_='PrintSpace', mapping_=mapping_, nsmap_=nsmap_) + PrintSpace_.to_etree(element, name_='PrintSpace', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.ReadingOrder is not None: ReadingOrder_ = self.ReadingOrder - ReadingOrder_.to_etree(element, name_='ReadingOrder', mapping_=mapping_, nsmap_=nsmap_) + ReadingOrder_.to_etree(element, name_='ReadingOrder', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Layers is not None: Layers_ = self.Layers - Layers_.to_etree(element, name_='Layers', mapping_=mapping_, nsmap_=nsmap_) + Layers_.to_etree(element, name_='Layers', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Relations is not None: Relations_ = self.Relations - Relations_.to_etree(element, name_='Relations', mapping_=mapping_, nsmap_=nsmap_) + Relations_.to_etree(element, name_='Relations', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) 
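# Note on the renames applied throughout this file: the generateDS hook methods
# change spelling consistently across every generated class --
# hasContent_ -> has__content, exportAttributes/exportChildren ->
# _exportAttributes/_exportChildren, buildAttributes/buildChildren ->
# _buildAttributes/_buildChildren -- so downstream code that calls or overrides
# the old camelCase names has to follow the rename. A minimal sketch of the new
# spelling, assuming the generated module is importable as
# ocrd_models.ocrd_page_generateds (the target file path is not visible in this hunk):
from ocrd_models.ocrd_page_generateds import PageType

page = PageType(imageFilename='OCR-D-IMG_0001.png', imageWidth=100, imageHeight=100)
# No child elements were attached, so has__content() is false and export()
# would write a self-closing <Page/> element.
assert not page.has__content()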
for TextRegion_ in self.TextRegion: - TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, nsmap_=nsmap_) + TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ImageRegion_ in self.ImageRegion: - ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, nsmap_=nsmap_) + ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for LineDrawingRegion_ in self.LineDrawingRegion: - LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, nsmap_=nsmap_) + LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for GraphicRegion_ in self.GraphicRegion: - GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, nsmap_=nsmap_) + GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TableRegion_ in self.TableRegion: - TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, nsmap_=nsmap_) + TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChartRegion_ in self.ChartRegion: - ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, nsmap_=nsmap_) + ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MapRegion_ in self.MapRegion: - MapRegion_.to_etree(element, name_='MapRegion', mapping_=mapping_, nsmap_=nsmap_) + MapRegion_.to_etree(element, name_='MapRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for SeparatorRegion_ in self.SeparatorRegion: - SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, nsmap_=nsmap_) + SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MathsRegion_ in self.MathsRegion: - MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, nsmap_=nsmap_) + MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChemRegion_ in self.ChemRegion: - ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, nsmap_=nsmap_) + ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MusicRegion_ in self.MusicRegion: - MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, nsmap_=nsmap_) + MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for AdvertRegion_ in self.AdvertRegion: - AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, nsmap_=nsmap_) + AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NoiseRegion_ in self.NoiseRegion: - NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, nsmap_=nsmap_) + NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnknownRegion_ in self.UnknownRegion: - UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, nsmap_=nsmap_) + UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) 
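# to_etree() now threads an optional reverse_mapping_ dict through every child
# export in addition to mapping_: mapping_ maps id(generated object) -> lxml
# element, while reverse_mapping_ maps the created element back to the
# generated object, so callers can navigate both directions after
# serialisation. A small sketch under the same import assumption as above:
from lxml import etree as ET
from ocrd_models.ocrd_page_generateds import PageType

page = PageType(imageFilename='OCR-D-IMG_0001.png', imageWidth=100, imageHeight=100)
mapping, reverse_mapping = {}, {}
el = page.to_etree(mapping_=mapping, reverse_mapping_=reverse_mapping)
assert mapping[id(page)] is el        # object -> element (as before)
assert reverse_mapping[el] is page    # element -> object (new in this patch)
print(ET.tostring(el, pretty_print=True).decode())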
for CustomRegion_ in self.CustomRegion: - CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, nsmap_=nsmap_) + CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -2908,12 +3509,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('imageFilename', node) if value is not None and 'imageFilename' not in already_processed: already_processed.add('imageFilename') @@ -2940,6 +3541,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'imageResolutionUnit' not in already_processed: already_processed.add('imageResolutionUnit') self.imageResolutionUnit = value + self.validate_imageResolutionUnitType(self.imageResolutionUnit) # validate type imageResolutionUnitType value = find_attr_value_('custom', node) if value is not None and 'custom' not in already_processed: already_processed.add('custom') @@ -2990,7 +3592,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -3362,18 +3964,22 @@ def set_orientation(self, orientation): class CoordsType(GeneratedsSuper): - """Polygon outline of the element as a path of points. + """points -- + Polygon outline of the element as a path of points. No points may lie outside the outline of its parent, which in the case of Border is the bounding rectangle of the root image. Paths are closed by convention, i.e. the last point logically connects with the first (and at least 3 points are required to span an area). Paths must be planar (i.e. must not self-intersect). 
- Confidence value (between 0 and 1)""" + + * conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required', 'name': 'points'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), ] subclass = None superclass = None @@ -3382,7 +3988,7 @@ def __init__(self, points=None, conf=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.points = _cast(None, points) self.points_nsprefix_ = "pc" self.conf = _cast(float, conf) @@ -3436,7 +4042,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -3458,23 +4064,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CoordsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CoordsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CoordsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CoordsType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CoordsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CoordsType'): if self.points is not None and 'points' not in already_processed: already_processed.add('points') outfile.write(' points=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.points), input_name='points')), )) if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CoordsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CoordsType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='CoordsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='CoordsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -3485,6 +4091,8 @@ def to_etree(self, parent_element=None, 
name_='CoordsType', mapping_=None, nsmap element.set('conf', self.gds_format_float(self.conf)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -3492,12 +4100,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('points', node) if value is not None and 'points' not in already_processed: already_processed.add('points') @@ -3509,7 +4117,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -3533,28 +4141,51 @@ def set_points(self, points): class TextLineType(GeneratedsSuper): - """Overrides primaryLanguage attribute of parent text + """primaryLanguage -- + Overrides primaryLanguage attribute of parent text region - The primary script used in the text line - The secondary script used in the text line - The direction in which text within the line - should be read (order of words and characters). - Overrides the production attribute of the parent - text region - For generic use - Position (order number) of this text line within the - parent text region.""" + + * primaryScript -- + The primary script used in the text line + + * secondaryScript -- + The secondary script used in the text line + + * readingDirection -- + The direction in which text within the line + should be read (order of words and characters). + + * production -- + Overrides the production attribute of the parent + text region + + * custom -- For generic use + * index -- + Position (order number) of this text line within the + parent text region. + + * AlternativeImage -- + Alternative text line images (e.g. 
+ black-and-white) + + * Baseline -- + Multiple connected points that mark the baseline + of the glyphs + + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('index', 'int', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryLanguage'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), + MemberSpec_('index', 'int', 0, 1, {'use': 'optional', 'name': 'index'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('Baseline', 'BaselineType', 0, 1, {'minOccurs': '0', 'name': 'Baseline', 'type': 'BaselineType'}, None), @@ -3571,7 +4202,7 @@ def __init__(self, id=None, primaryLanguage=None, primaryScript=None, secondaryS self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.primaryLanguage = _cast(None, primaryLanguage) @@ -3777,7 +4408,7 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -3806,15 +4437,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextLineType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextLineType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, 
name_='TextLineType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextLineType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextLineType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextLineType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -3842,7 +4473,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextLineType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextLineType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -3871,7 +4502,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='TextLineType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='TextLineType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -3895,27 +4526,29 @@ def to_etree(self, parent_element=None, name_='TextLineType', mapping_=None, nsm if self.index is not None: element.set('index', self.gds_format_integer(self.index)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Baseline is not None: Baseline_ = self.Baseline - Baseline_.to_etree(element, name_='Baseline', mapping_=mapping_, nsmap_=nsmap_) + Baseline_.to_etree(element, name_='Baseline', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Word_ in self.Word: - Word_.to_etree(element, name_='Word', mapping_=mapping_, nsmap_=nsmap_) + Word_.to_etree(element, name_='Word', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, 
reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -3923,12 +4556,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -3970,7 +4603,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'index' not in already_processed: already_processed.add('index') self.index = self.gds_parse_integer(value, node, 'index') - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -4062,25 +4695,42 @@ def set_Coords(self, Coords): class WordType(GeneratedsSuper): - """Overrides primaryLanguage attribute of parent line + """language -- + Overrides primaryLanguage attribute of parent line and/or text region - The primary script used in the word - The secondary script used in the word - The direction in which text within the word - should be read (order of characters). - Overrides the production attribute of the parent - text line and/or text region. - For generic use""" + + * primaryScript -- + The primary script used in the word + + * secondaryScript -- + The secondary script used in the word + + * readingDirection -- + The direction in which text within the word + should be read (order of characters). + + * production -- + Overrides the production attribute of the parent + text line and/or text region. + + * custom -- For generic use + * AlternativeImage -- + Alternative word images (e.g. 
+ black-and-white) + + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('language', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('language', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'language'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('Glyph', 'GlyphType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Glyph', 'type': 'GlyphType'}, None), @@ -4096,7 +4746,7 @@ def __init__(self, id=None, language=None, primaryScript=None, secondaryScript=N self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.language = _cast(None, language) @@ -4290,7 +4940,7 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -4318,15 +4968,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='WordType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='WordType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='WordType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='WordType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, 
name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='WordType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='WordType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -4351,7 +5001,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='WordType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='WordType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -4377,7 +5027,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='WordType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='WordType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -4399,24 +5049,26 @@ def to_etree(self, parent_element=None, name_='WordType', mapping_=None, nsmap_= if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Glyph_ in self.Glyph: - Glyph_.to_etree(element, name_='Glyph', mapping_=mapping_, nsmap_=nsmap_) + Glyph_.to_etree(element, name_='Glyph', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + 
UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -4424,12 +5076,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -4467,7 +5119,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -4554,19 +5206,34 @@ def set_Coords(self, Coords): class GlyphType(GeneratedsSuper): - """The script used for the glyph - Overrides the production attribute of the parent - word / text line / text region. - For generic use""" + """script -- + The script used for the glyph + + * production -- + Overrides the production attribute of the parent + word / text line / text region. + + * custom -- For generic use + * AlternativeImage -- + Alternative glyph images (e.g. 
+ black-and-white) + + * Graphemes -- + Container for graphemes, grapheme groups and + non-printing characters + + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('symbol', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('script', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional', 'name': 'ligature'}), + MemberSpec_('symbol', 'boolean', 0, 1, {'use': 'optional', 'name': 'symbol'}), + MemberSpec_('script', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'script'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('Graphemes', 'GraphemesType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'Graphemes', 'type': 'GraphemesType'}, None), @@ -4582,7 +5249,7 @@ def __init__(self, id=None, ligature=None, symbol=None, script=None, production= self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.ligature = _cast(bool, ligature) @@ -4735,7 +5402,7 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -4763,15 +5430,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GlyphType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GlyphType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GlyphType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GlyphType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GlyphType'): + def _exportAttributes(self, outfile, level, 
already_processed, namespaceprefix_='', name_='GlyphType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -4793,7 +5460,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GlyphType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GlyphType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -4819,7 +5486,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Labels_ in self.Labels: namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GlyphType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GlyphType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -4839,25 +5506,27 @@ def to_etree(self, parent_element=None, name_='GlyphType', mapping_=None, nsmap_ if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Graphemes is not None: Graphemes_ = self.Graphemes - Graphemes_.to_etree(element, name_='Graphemes', mapping_=mapping_, nsmap_=nsmap_) + Graphemes_.to_etree(element, name_='Graphemes', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - 
Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -4865,12 +5534,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -4911,7 +5580,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -4998,22 +5667,40 @@ def set_Coords(self, Coords): class TextEquivType(GeneratedsSuper): - """Used for sort order in case multiple TextEquivs are defined. + """index -- + Used for sort order in case multiple TextEquivs are defined. The text content with the lowest index should be interpreted as the main text content. - OCR confidence value (between 0 and 1) - Type of text content (is it free text or a number, for instance). - This is only a descriptive attribute, the text type - is not checked during XML validation. - Refinement for dataType attribute. Can be a regular expression, for - instance.""" + + * conf -- OCR confidence value (between 0 and 1) + * dataType -- + Type of text content (is it free text or a number, for instance). + This is only a descriptive attribute, the text type + is not checked during XML validation. + + * dataTypeDetails -- + Refinement for dataType attribute. Can be a regular expression, for instance. + + * PlainText -- + Text in a "simple" form (ASCII or extended ASCII + as mostly used for typing). I.e. no use of + special characters for ligatures (should be + stored as two separate characters) etc. + + * Unicode -- + Correct encoding of the original, always using + the corresponding Unicode code point. I.e. + ligatures have to be represented as one + character etc. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('index', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('dataType', 'pc:TextDataTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('dataTypeDetails', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('index', 'indexType', 0, 1, {'use': 'optional', 'name': 'index'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), + MemberSpec_('dataType', 'pc:TextDataTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'dataType'}), + MemberSpec_('dataTypeDetails', 'string', 0, 1, {'use': 'optional', 'name': 'dataTypeDetails'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('PlainText', 'string', 0, 1, {'minOccurs': '0', 'name': 'PlainText', 'type': 'string'}, None), MemberSpec_('Unicode', 'string', 0, 0, {'name': 'Unicode', 'type': 'string'}, None), ] @@ -5082,6 +5769,17 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments + def validate_indexType(self, value): + # Validate type indexType, a restriction on integer. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, int): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (int)' % {"value": value, "lineno": lineno, }) + return False + if value < 0: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd minInclusive restriction on indexType' % {"value": value, "lineno": lineno} ) + result = False def validate_ConfSimpleType(self, value): # Validate type pc:ConfSimpleType, a restriction on float. 
if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: @@ -5110,7 +5808,7 @@ def validate_TextDataTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on TextDataTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.PlainText is not None or self.Unicode is not None @@ -5133,15 +5831,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextEquivType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextEquivType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextEquivType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextEquivType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextEquivType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextEquivType'): if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) @@ -5157,7 +5855,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='TextEquivType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:None="http://www.w3.org/2001/XMLSchema" ', name_='TextEquivType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5170,7 +5868,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml namespaceprefix_ = self.Unicode_nsprefix_ + ':' if (UseCapturedNS_ and self.Unicode_nsprefix_) else '' showIndent(outfile, level, pretty_print) outfile.write('<%sUnicode>%s%s' % (namespaceprefix_ , self.gds_encode(self.gds_format_string(quote_xml(self.Unicode), input_name='Unicode')), namespaceprefix_ , eol_)) - def to_etree(self, parent_element=None, name_='TextEquivType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='TextEquivType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5193,6 +5891,8 @@ def 
to_etree(self, parent_element=None, name_='TextEquivType', mapping_=None, ns etree_.SubElement(element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}Unicode').text = self.gds_format_string(Unicode_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5200,16 +5900,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('index', node) if value is not None and 'index' not in already_processed: already_processed.add('index') self.index = self.gds_parse_integer(value, node, 'index') + self.validate_indexType(self.index) # validate type indexType value = find_attr_value_('conf', node) if value is not None and 'conf' not in already_processed: already_processed.add('conf') @@ -5229,7 +5930,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'PlainText': value_ = child_.text value_ = self.gds_parse_string(value_, node, 'PlainText') @@ -5248,7 +5949,15 @@ def __hash__(self): class GridType(GeneratedsSuper): - """Matrix of grid points defining the table grid on the page.""" + """GridType -- + Matrix of grid points defining the table grid on the page. + + * GridPoints -- + One row in the grid point matrix. + Points with x,y coordinates. 
+ (note: for a table with n table rows there should be n+1 grid rows) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('GridPoints', 'GridPointsType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '2', 'name': 'GridPoints', 'type': 'GridPointsType'}, None), @@ -5260,7 +5969,7 @@ def __init__(self, GridPoints=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if GridPoints is None: self.GridPoints = [] else: @@ -5291,7 +6000,7 @@ def insert_GridPoints_at(self, index, value): self.GridPoints.insert(index, value) def replace_GridPoints_at(self, index, value): self.GridPoints[index] = value - def hasContent_(self): + def has__content(self): if ( self.GridPoints ): @@ -5313,17 +6022,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GridType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GridType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5331,15 +6040,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for GridPoints_ in self.GridPoints: namespaceprefix_ = self.GridPoints_nsprefix_ + ':' if (UseCapturedNS_ and self.GridPoints_nsprefix_) else '' GridPoints_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='GridPoints', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GridType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GridType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for GridPoints_ in self.GridPoints: - GridPoints_.to_etree(element, name_='GridPoints', mapping_=mapping_, nsmap_=nsmap_) + GridPoints_.to_etree(element, name_='GridPoints', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if 
mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5347,14 +6058,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'GridPoints': obj_ = GridPointsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -5366,12 +6077,15 @@ def __hash__(self): class GridPointsType(GeneratedsSuper): - """Points with x,y coordinates. - The grid row index""" + """GridPointsType -- Points with x,y coordinates. + index -- + The grid row index + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required', 'name': 'points'}), ] subclass = None superclass = None @@ -5380,7 +6094,7 @@ def __init__(self, index=None, points=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.index = _cast(int, index) self.index_nsprefix_ = "pc" self.points = _cast(None, points) @@ -5419,7 +6133,7 @@ def validate_PointsType(self, value): self.validate_PointsType_patterns_, value): self.gds_collector_.add_message('Value "%s" does not match xsd pattern restrictions: %s' % (encode_str_2_3(value), self.validate_PointsType_patterns_, )) validate_PointsType_patterns_ = [['^(([0-9]+,[0-9]+ )+([0-9]+,[0-9]+))$']] - def hasContent_(self): + def has__content(self): if ( ): @@ -5441,23 +6155,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridPointsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GridPointsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridPointsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GridPointsType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', 
name_='GridPointsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GridPointsType'): if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) if self.points is not None and 'points' not in already_processed: already_processed.add('points') outfile.write(' points=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.points), input_name='points')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridPointsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GridPointsType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='GridPointsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GridPointsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5468,6 +6182,8 @@ def to_etree(self, parent_element=None, name_='GridPointsType', mapping_=None, n element.set('points', self.gds_format_string(self.points)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5475,12 +6191,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('index', node) if value is not None and 'index' not in already_processed: already_processed.add('index') @@ -5490,7 +6206,7 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('points') self.points = value self.validate_PointsType(self.points) # validate type PointsType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -5498,13 +6214,16 @@ def __hash__(self): class PrintSpaceType(GeneratedsSuper): - """Determines the effective area on the paper of a printed page. + """PrintSpaceType -- + Determines the effective area on the paper of a printed page. Its size is equal for all pages of a book (exceptions: titlepage, multipage pictures). It contains all living elements (except marginals) like body type, footnotes, headings, running titles. It does not contain pagenumber (if not part of running title), - marginals, signature mark, preview words.""" + marginals, signature mark, preview words. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), @@ -5516,7 +6235,7 @@ def __init__(self, Coords=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.Coords = Coords self.Coords_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -5538,7 +6257,7 @@ def get_Coords(self): return self.Coords def set_Coords(self, Coords): self.Coords = Coords - def hasContent_(self): + def has__content(self): if ( self.Coords is not None ): @@ -5560,17 +6279,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PrintSpaceType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='PrintSpaceType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PrintSpaceType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='PrintSpaceType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PrintSpaceType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='PrintSpaceType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PrintSpaceType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='PrintSpaceType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5578,16 +6297,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Coords is not None: namespaceprefix_ = self.Coords_nsprefix_ + ':' if (UseCapturedNS_ and self.Coords_nsprefix_) else '' self.Coords.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Coords', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='PrintSpaceType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='PrintSpaceType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self 
return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5595,14 +6316,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Coords': obj_ = CoordsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -5614,14 +6335,18 @@ def __hash__(self): class ReadingOrderType(GeneratedsSuper): - """Definition of the reading order within the page. + """ReadingOrderType -- + Definition of the reading order within the page. To express a reading order between elements they have to be included in an OrderedGroup. Groups may contain further groups. - Confidence value (between 0 and 1)""" + + * conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), MemberSpec_('OrderedGroup', 'OrderedGroupType', 0, 0, {'name': 'OrderedGroup', 'type': 'OrderedGroupType'}, 2), MemberSpec_('UnorderedGroup', 'UnorderedGroupType', 0, 0, {'name': 'UnorderedGroup', 'type': 'UnorderedGroupType'}, 2), ] @@ -5632,7 +6357,7 @@ def __init__(self, conf=None, OrderedGroup=None, UnorderedGroup=None, gds_collec self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.conf = _cast(float, conf) self.conf_nsprefix_ = "pc" self.OrderedGroup = OrderedGroup @@ -5681,7 +6406,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.OrderedGroup is not None or self.UnorderedGroup is not None @@ -5704,19 +6429,19 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ReadingOrderType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ReadingOrderType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ReadingOrderType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ReadingOrderType', pretty_print=pretty_print) showIndent(outfile, level, 
pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ReadingOrderType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ReadingOrderType'): if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ReadingOrderType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ReadingOrderType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -5727,7 +6452,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.UnorderedGroup is not None: namespaceprefix_ = self.UnorderedGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroup_nsprefix_) else '' self.UnorderedGroup.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ReadingOrderType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='ReadingOrderType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5736,12 +6461,14 @@ def to_etree(self, parent_element=None, name_='ReadingOrderType', mapping_=None, element.set('conf', self.gds_format_float(self.conf)) if self.OrderedGroup is not None: OrderedGroup_ = self.OrderedGroup - OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UnorderedGroup is not None: UnorderedGroup_ = self.UnorderedGroup - UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5749,19 +6476,19 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('conf', node) if value is not None and 'conf' not in already_processed: already_processed.add('conf') value = self.gds_parse_float(value, node, 'conf') self.conf = value 
self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'OrderedGroup': obj_ = OrderedGroupType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -5778,12 +6505,14 @@ def __hash__(self): class RegionRefIndexedType(GeneratedsSuper): - """Numbered regionPosition (order number) of this item within the current - hierarchy level.""" + """RegionRefIndexedType -- Numbered region + index -- Position (order number) of this item within the current hierarchy level. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required', 'name': 'regionRef'}), ] subclass = None superclass = None @@ -5792,7 +6521,7 @@ def __init__(self, index=None, regionRef=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.index = _cast(int, index) self.index_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -5820,7 +6549,7 @@ def get_regionRef(self): return self.regionRef def set_regionRef(self, regionRef): self.regionRef = regionRef - def hasContent_(self): + def has__content(self): if ( ): @@ -5842,23 +6571,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefIndexedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefIndexedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefIndexedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefIndexedType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefIndexedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefIndexedType'): if self.index is not None and 'index' not in already_processed: already_processed.add('index') outfile.write(' index="%s"' % self.gds_format_integer(self.index, input_name='index')) if self.regionRef is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') outfile.write(' regionRef=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.regionRef), input_name='regionRef')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefIndexedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefIndexedType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='RegionRefIndexedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RegionRefIndexedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -5869,6 +6598,8 @@ def to_etree(self, parent_element=None, name_='RegionRefIndexedType', mapping_=N element.set('regionRef', self.gds_format_string(self.regionRef)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -5876,12 +6607,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('index', node) if value is not None and 'index' not in already_processed: already_processed.add('index') @@ -5890,7 +6621,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') self.regionRef = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -5898,25 +6629,36 @@ def __hash__(self): class OrderedGroupIndexedType(GeneratedsSuper): - """Indexed group containing ordered elements - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - Position (order number) of this item within the - current hierarchy level. - Is this group a continuation of another group (from - previous column or page, for example)? - For generic use""" + """OrderedGroupIndexedType -- + Indexed group containing ordered elements + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * index -- + Position (order number) of this item within the + current hierarchy level. + + * continuation -- + Is this group a continuation of another group (from + previous column or page, for example)? 
+ + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRefIndexed', 'RegionRefIndexedType', 1, 0, {'name': 'RegionRefIndexed', 'type': 'RegionRefIndexedType'}, 3), @@ -5930,7 +6672,7 @@ def __init__(self, id=None, regionRef=None, index=None, caption=None, type_=None self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -6073,7 +6815,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -6099,15 +6841,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupIndexedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupIndexedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupIndexedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupIndexedType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, 
namespaceprefix_='', name_='OrderedGroupIndexedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='OrderedGroupIndexedType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -6132,7 +6874,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupIndexedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupIndexedType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -6152,7 +6894,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='OrderedGroupIndexedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='OrderedGroupIndexedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -6175,17 +6917,19 @@ def to_etree(self, parent_element=None, name_='OrderedGroupIndexedType', mapping element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRefIndexed_ in self.RegionRefIndexed: - RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, nsmap_=nsmap_) + RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroupIndexed_ in self.OrderedGroupIndexed: - OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: - UnorderedGroupIndexed_.to_etree(element, name_='UnorderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroupIndexed_.to_etree(element, name_='UnorderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, 
nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -6193,12 +6937,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -6237,7 +6981,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -6341,11 +7085,16 @@ def sort_AllIndexed(self, validate_uniqueness=True): return self.get_AllIndexed() # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments - namespaceprefix_ = 'pc:' + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments + if pretty_print: + eol_ = '\n' + else: + eol_ = '' if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] def replaceWithRRI(group): @@ -6363,30 +7112,41 @@ def replaceWithRRI(group): else: cleaned.append(entry) for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + entry.export(outfile, level, entry.ns_prefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) # end class OrderedGroupIndexedType class UnorderedGroupIndexedType(GeneratedsSuper): - """Indexed group containing unordered elements - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. 
- Only the nested regions should be allowed as group members. - Position (order number) of this item within the - current hierarchy level. - Is this group a continuation of another group - (from previous column or page, for example)? - For generic use""" + """UnorderedGroupIndexedType -- + Indexed group containing unordered elements + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * index -- + Position (order number) of this item within the + current hierarchy level. + + * continuation -- + Is this group a continuation of another group + (from previous column or page, for example)? + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('index', 'int', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRef', 'RegionRefType', 1, 0, {'name': 'RegionRef', 'type': 'RegionRefType'}, 4), @@ -6400,7 +7160,7 @@ def __init__(self, id=None, regionRef=None, index=None, caption=None, type_=None self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -6543,7 +7303,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -6569,15 +7329,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, 
name_='UnorderedGroupIndexedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnorderedGroupIndexedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupIndexedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupIndexedType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupIndexedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupIndexedType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -6602,7 +7362,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupIndexedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupIndexedType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -6622,7 +7382,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroup_ in self.UnorderedGroup: namespaceprefix_ = self.UnorderedGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroup_nsprefix_) else '' UnorderedGroup_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UnorderedGroupIndexedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UnorderedGroupIndexedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -6645,17 +7405,19 @@ def to_etree(self, parent_element=None, name_='UnorderedGroupIndexedType', mappi element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRef_ in self.RegionRef: - RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, nsmap_=nsmap_) + RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, 
reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroup_ in self.OrderedGroup: - OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroup_ in self.UnorderedGroup: - UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -6663,12 +7425,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -6707,7 +7469,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -6748,7 +7510,7 @@ def get_UnorderedGroupChildren(self): class RegionRefType(GeneratedsSuper): __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required'}), + MemberSpec_('regionRef', 'string', 0, 0, {'use': 'required', 'name': 'regionRef'}), ] subclass = None superclass = None @@ -6757,7 +7519,7 @@ def __init__(self, regionRef=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.regionRef = _cast(None, regionRef) self.regionRef_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -6779,7 +7541,7 @@ def get_regionRef(self): return self.regionRef def set_regionRef(self, regionRef): self.regionRef = regionRef - def hasContent_(self): + def has__content(self): if ( ): @@ -6801,20 +7563,20 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionRefType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - 
self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionRefType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionRefType'): if self.regionRef is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') outfile.write(' regionRef=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.regionRef), input_name='regionRef')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionRefType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='RegionRefType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RegionRefType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -6823,6 +7585,8 @@ def to_etree(self, parent_element=None, name_='RegionRefType', mapping_=None, ns element.set('regionRef', self.gds_format_string(self.regionRef)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -6830,17 +7594,17 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('regionRef', node) if value is not None and 'regionRef' not in already_processed: already_processed.add('regionRef') self.regionRef = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -6848,22 +7612,31 @@ def __hash__(self): class OrderedGroupType(GeneratedsSuper): - """Numbered group (contains ordered elements) - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - Is this group a continuation of another group - (from previous column or page, for example)? 
- For generic use""" + """OrderedGroupType -- + Numbered group (contains ordered elements) + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * continuation -- + Is this group a continuation of another group + (from previous column or page, for example)? + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRefIndexed', 'RegionRefIndexedType', 1, 0, {'name': 'RegionRefIndexed', 'type': 'RegionRefIndexedType'}, 5), @@ -6877,7 +7650,7 @@ def __init__(self, id=None, regionRef=None, caption=None, type_=None, continuati self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -7014,7 +7787,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -7040,15 +7813,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='OrderedGroupType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='OrderedGroupType', 
pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='OrderedGroupType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='OrderedGroupType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -7070,7 +7843,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7090,7 +7863,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: namespaceprefix_ = self.UnorderedGroupIndexed_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroupIndexed_nsprefix_) else '' UnorderedGroupIndexed_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroupIndexed', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='OrderedGroupType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='OrderedGroupType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -7111,17 +7884,19 @@ def to_etree(self, parent_element=None, name_='OrderedGroupType', mapping_=None, element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRefIndexed_ in self.RegionRefIndexed: - RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, nsmap_=nsmap_) + RegionRefIndexed_.to_etree(element, name_='RegionRefIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroupIndexed_ in self.OrderedGroupIndexed: - OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroupIndexed_.to_etree(element, name_='OrderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroupIndexed_ in self.UnorderedGroupIndexed: - UnorderedGroupIndexed_.to_etree(element, 
name_='UnorderedGroupIndexed', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroupIndexed_.to_etree(element, name_='UnorderedGroupIndexed', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7129,12 +7904,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -7169,7 +7944,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7273,11 +8048,16 @@ def sort_AllIndexed(self, validate_uniqueness=True): return self.get_AllIndexed() # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments - namespaceprefix_ = 'pc:' + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments + if pretty_print: + eol_ = '\n' + else: + eol_ = '' if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] def replaceWithRRI(group): @@ -7295,27 +8075,36 @@ def replaceWithRRI(group): else: cleaned.append(entry) for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + entry.export(outfile, level, entry.ns_prefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) # end class OrderedGroupType class UnorderedGroupType(GeneratedsSuper): - """Numbered group 
(contains unordered elements) - Optional link to a parent region of nested regions. - The parent region doubles as reading order group. - Only the nested regions should be allowed as group members. - Is this group a continuation of another group - (from previous column or page, for example)? - For generic use""" + """UnorderedGroupType -- + Numbered group (contains unordered elements) + + * regionRef -- + Optional link to a parent region of nested regions. + The parent region doubles as reading order group. + Only the nested regions should be allowed as group members. + + * continuation -- + Is this group a continuation of another group + (from previous column or page, for example)? + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('regionRef', 'string', 0, 1, {'use': 'optional', 'name': 'regionRef'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), + MemberSpec_('type_', 'pc:GroupTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('RegionRef', 'RegionRefType', 1, 0, {'name': 'RegionRef', 'type': 'RegionRefType'}, 6), @@ -7329,7 +8118,7 @@ def __init__(self, id=None, regionRef=None, caption=None, type_=None, continuati self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.regionRef = _cast(None, regionRef) @@ -7466,7 +8255,7 @@ def validate_GroupTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GroupTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.UserDefined is not None or self.Labels or @@ -7492,15 +8281,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnorderedGroupType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnorderedGroupType') + if 
self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnorderedGroupType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnorderedGroupType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -7522,7 +8311,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnorderedGroupType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7542,7 +8331,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UnorderedGroup_ in self.UnorderedGroup: namespaceprefix_ = self.UnorderedGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.UnorderedGroup_nsprefix_) else '' UnorderedGroup_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UnorderedGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UnorderedGroupType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UnorderedGroupType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -7563,17 +8352,19 @@ def to_etree(self, parent_element=None, name_='UnorderedGroupType', mapping_=Non element.set('comments', self.gds_format_string(self.comments)) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for RegionRef_ in self.RegionRef: - RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, nsmap_=nsmap_) + RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for OrderedGroup_ in self.OrderedGroup: - OrderedGroup_.to_etree(element, name_='OrderedGroup', mapping_=mapping_, nsmap_=nsmap_) + OrderedGroup_.to_etree(element, name_='OrderedGroup', 
mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnorderedGroup_ in self.UnorderedGroup: - UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, nsmap_=nsmap_) + UnorderedGroup_.to_etree(element, name_='UnorderedGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7581,12 +8372,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -7621,7 +8412,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserDefined': obj_ = UserDefinedType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7660,8 +8451,11 @@ def get_UnorderedGroupChildren(self): class BorderType(GeneratedsSuper): - """Border of the actual page (if the scanned image - contains parts not belonging to the page).""" + """BorderType -- + Border of the actual page (if the scanned image + contains parts not belonging to the page). 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), @@ -7673,7 +8467,7 @@ def __init__(self, Coords=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.Coords = Coords self.Coords_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -7695,7 +8489,7 @@ def get_Coords(self): return self.Coords def set_Coords(self, Coords): self.Coords = Coords - def hasContent_(self): + def has__content(self): if ( self.Coords is not None ): @@ -7717,17 +8511,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BorderType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BorderType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BorderType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BorderType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='BorderType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='BorderType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BorderType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BorderType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7735,16 +8529,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Coords is not None: namespaceprefix_ = self.Coords_nsprefix_ + ':' if (UseCapturedNS_ and self.Coords_nsprefix_) else '' self.Coords.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Coords', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='BorderType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='BorderType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, 
gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7752,14 +8548,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Coords': obj_ = CoordsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7785,9 +8581,12 @@ def set_Coords(self, Coords): class LayersType(GeneratedsSuper): - """Can be used to express the z-index of overlapping + """LayersType -- + Can be used to express the z-index of overlapping regions. An element with a greater z-index is always in - front of another element with lower z-index.""" + front of another element with lower z-index. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Layer', 'LayerType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'Layer', 'type': 'LayerType'}, None), @@ -7799,7 +8598,7 @@ def __init__(self, Layer=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if Layer is None: self.Layer = [] else: @@ -7830,7 +8629,7 @@ def insert_Layer_at(self, index, value): self.Layer.insert(index, value) def replace_Layer_at(self, index, value): self.Layer[index] = value - def hasContent_(self): + def has__content(self): if ( self.Layer ): @@ -7852,17 +8651,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayersType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayersType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayersType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayersType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayersType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayersType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayersType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayersType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -7870,15 +8669,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Layer_ in self.Layer: namespaceprefix_ = self.Layer_nsprefix_ + ':' if (UseCapturedNS_ and self.Layer_nsprefix_) else '' Layer_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Layer', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='LayersType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LayersType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for Layer_ in self.Layer: - Layer_.to_etree(element, name_='Layer', mapping_=mapping_, nsmap_=nsmap_) + Layer_.to_etree(element, name_='Layer', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -7886,14 +8687,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Layer': obj_ = LayerType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -7907,9 +8708,9 @@ def __hash__(self): class LayerType(GeneratedsSuper): __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('zIndex', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('caption', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('zIndex', 'int', 0, 0, {'use': 'required', 'name': 'zIndex'}), + MemberSpec_('caption', 'string', 0, 1, {'use': 'optional', 'name': 'caption'}), MemberSpec_('RegionRef', 'RegionRefType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'RegionRef', 'type': 'RegionRefType'}, None), ] subclass = None @@ -7919,7 +8720,7 @@ def __init__(self, id=None, zIndex=None, caption=None, RegionRef=None, gds_colle self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.zIndex = _cast(int, zIndex) @@ -7968,7 +8769,7 @@ def get_caption(self): return self.caption def 
set_caption(self, caption): self.caption = caption - def hasContent_(self): + def has__content(self): if ( self.RegionRef ): @@ -7990,15 +8791,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayerType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LayerType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayerType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LayerType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayerType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LayerType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -8008,7 +8809,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.caption is not None and 'caption' not in already_processed: already_processed.add('caption') outfile.write(' caption=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.caption), input_name='caption')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayerType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LayerType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -8016,7 +8817,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for RegionRef_ in self.RegionRef: namespaceprefix_ = self.RegionRef_nsprefix_ + ':' if (UseCapturedNS_ and self.RegionRef_nsprefix_) else '' RegionRef_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='RegionRef', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='LayerType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='LayerType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8028,9 +8829,11 @@ def to_etree(self, parent_element=None, name_='LayerType', mapping_=None, nsmap_ if self.caption is not None: element.set('caption', self.gds_format_string(self.caption)) for RegionRef_ in self.RegionRef: - RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, nsmap_=nsmap_) + RegionRef_.to_etree(element, name_='RegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, 
node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8038,12 +8841,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -8056,7 +8859,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'caption' not in already_processed: already_processed.add('caption') self.caption = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'RegionRef': obj_ = RegionRefType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -8068,11 +8871,13 @@ def __hash__(self): class BaselineType(GeneratedsSuper): - """Confidence value (between 0 and 1)""" + """conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('points', 'pc:PointsType', 0, 0, {'use': 'required', 'name': 'points'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), ] subclass = None superclass = None @@ -8081,7 +8886,7 @@ def __init__(self, points=None, conf=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.points = _cast(None, points) self.points_nsprefix_ = "pc" self.conf = _cast(float, conf) @@ -8135,7 +8940,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -8157,23 +8962,23 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BaselineType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='BaselineType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BaselineType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='BaselineType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, 
outfile, level, already_processed, namespaceprefix_='', name_='BaselineType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='BaselineType'): if self.points is not None and 'points' not in already_processed: already_processed.add('points') outfile.write(' points=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.points), input_name='points')), )) if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BaselineType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='BaselineType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='BaselineType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='BaselineType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8184,6 +8989,8 @@ def to_etree(self, parent_element=None, name_='BaselineType', mapping_=None, nsm element.set('conf', self.gds_format_float(self.conf)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8191,12 +8998,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('points', node) if value is not None and 'points' not in already_processed: already_processed.add('points') @@ -8208,7 +9015,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -8216,9 +9023,12 @@ def __hash__(self): class RelationsType(GeneratedsSuper): - """Container for one-to-one relations between layout + """RelationsType -- + Container for one-to-one relations between layout objects (for example: DropCap - paragraph, caption - - image).""" + image). 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Relation', 'RelationType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'Relation', 'type': 'RelationType'}, None), @@ -8230,7 +9040,7 @@ def __init__(self, Relation=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if Relation is None: self.Relation = [] else: @@ -8261,7 +9071,7 @@ def insert_Relation_at(self, index, value): self.Relation.insert(index, value) def replace_Relation_at(self, index, value): self.Relation[index] = value - def hasContent_(self): + def has__content(self): if ( self.Relation ): @@ -8283,17 +9093,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationsType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationsType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationsType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationsType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationsType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationsType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationsType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationsType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -8301,15 +9111,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for Relation_ in self.Relation: namespaceprefix_ = self.Relation_nsprefix_ + ':' if (UseCapturedNS_ and self.Relation_nsprefix_) else '' Relation_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Relation', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RelationsType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RelationsType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for Relation_ in self.Relation: - Relation_.to_etree(element, name_='Relation', mapping_=mapping_, nsmap_=nsmap_) + Relation_.to_etree(element, name_='Relation', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if 
reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8317,14 +9129,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Relation': obj_ = RelationType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -8336,7 +9148,8 @@ def __hash__(self): class RelationType(GeneratedsSuper): - """One-to-one relation between to layout object. Use 'link' + """RelationType -- + One-to-one relation between to layout object. Use 'link' for loose relations and 'join' for strong relations (where something is fragmented for instance). Examples for 'link': caption - image floating - @@ -8350,13 +9163,17 @@ class RelationType(GeneratedsSuper): pragraph is split across columns and the last word of the first paragraph DOES continue in the second paragraph) - For generic use""" + + * custom -- For generic use + * Labels -- Semantic labels / tags + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('type_', 'typeType1', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('Labels', 'LabelsType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'Labels', 'type': 'LabelsType'}, None), MemberSpec_('SourceRegionRef', 'RegionRefType', 0, 0, {'maxOccurs': '1', 'minOccurs': '1', 'name': 'SourceRegionRef', 'type': 'RegionRefType'}, None), MemberSpec_('TargetRegionRef', 'RegionRefType', 0, 0, {'maxOccurs': '1', 'minOccurs': '1', 'name': 'TargetRegionRef', 'type': 'RegionRefType'}, None), @@ -8368,7 +9185,7 @@ def __init__(self, id=None, type_=None, custom=None, comments=None, Labels=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -8435,7 +9252,20 @@ def get_comments(self): return self.comments def set_comments(self, comments): self.comments = comments - def hasContent_(self): + def validate_typeType1(self, value): + # Validate type typeType1, a restriction on string. 
+ if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['link', 'join'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on typeType1' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( self.Labels or self.SourceRegionRef is not None or @@ -8459,15 +9289,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RelationType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RelationType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RelationType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -8480,7 +9310,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.comments is not None and 'comments' not in already_processed: already_processed.add('comments') outfile.write(' comments=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.comments), input_name='comments')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RelationType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -8494,7 +9324,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.TargetRegionRef is not None: namespaceprefix_ = self.TargetRegionRef_nsprefix_ + ':' if (UseCapturedNS_ and self.TargetRegionRef_nsprefix_) else '' self.TargetRegionRef.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TargetRegionRef', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RelationType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RelationType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = 
etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8508,15 +9338,17 @@ def to_etree(self, parent_element=None, name_='RelationType', mapping_=None, nsm if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.SourceRegionRef is not None: SourceRegionRef_ = self.SourceRegionRef - SourceRegionRef_.to_etree(element, name_='SourceRegionRef', mapping_=mapping_, nsmap_=nsmap_) + SourceRegionRef_.to_etree(element, name_='SourceRegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TargetRegionRef is not None: TargetRegionRef_ = self.TargetRegionRef - TargetRegionRef_.to_etree(element, name_='TargetRegionRef', mapping_=mapping_, nsmap_=nsmap_) + TargetRegionRef_.to_etree(element, name_='TargetRegionRef', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8524,12 +9356,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -8538,6 +9370,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value + self.validate_typeType1(self.type_) # validate type typeType1 value = find_attr_value_('custom', node) if value is not None and 'custom' not in already_processed: already_processed.add('custom') @@ -8546,7 +9379,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'comments' not in already_processed: already_processed.add('comments') self.comments = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Labels': obj_ = LabelsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -8568,49 +9401,69 @@ def __hash__(self): class TextStyleType(GeneratedsSuper): - """Monospace (fixed-pitch, non-proportional) or + """TextStyleType -- + Monospace (fixed-pitch, non-proportional) or proportional font. - For instance: Arial, Times New Roman. - Add more information if necessary - (e.g. blackletter, antiqua). - Serif or sans-serif typeface. - The size of the characters in points. - The x-height or corpus size refers to the distance - between the baseline and the mean line of - lower-case letters in a typeface. 
- The unit is assumed to be pixels. - The degree of space (in points) between - the characters in a string of text. - Text colour in RGB encoded format - (red value) + (256 x green value) + (65536 x blue value). - Background colour - Background colour in RGB encoded format - (red value) + (256 x green value) + (65536 x blue value). - Specifies whether the colour of the text appears - reversed against a background colour. - Line style details if "underlined" is TRUE""" + + * fontFamily -- + For instance: Arial, Times New Roman. + Add more information if necessary + (e.g. blackletter, antiqua). + + * serif -- + Serif or sans-serif typeface. + + * fontSize -- + The size of the characters in points. + + * xHeight -- + The x-height or corpus size refers to the distance + between the baseline and the mean line of + lower-case letters in a typeface. + The unit is assumed to be pixels. + + * kerning -- + The degree of space (in points) between + the characters in a string of text. + + * textColourRgb -- + Text colour in RGB encoded format + (red value) + (256 x green value) + (65536 x blue value). + + * bgColour -- Background colour + * bgColourRgb -- + Background colour in RGB encoded format + (red value) + (256 x green value) + (65536 x blue value). + + * reverseVideo -- + Specifies whether the colour of the text appears + reversed against a background colour. + + * underlineStyle -- Line style details if "underlined" is TRUE + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('fontFamily', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('serif', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('monospace', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('fontSize', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('xHeight', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('kerning', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('textColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('textColourRgb', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColourRgb', 'integer', 0, 1, {'use': 'optional'}), - MemberSpec_('reverseVideo', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('bold', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('italic', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('underlined', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('underlineStyle', 'pc:UnderlineStyleSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('subscript', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('superscript', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('strikethrough', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('smallCaps', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('letterSpaced', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('fontFamily', 'string', 0, 1, {'use': 'optional', 'name': 'fontFamily'}), + MemberSpec_('serif', 'boolean', 0, 1, {'use': 'optional', 'name': 'serif'}), + MemberSpec_('monospace', 'boolean', 0, 1, {'use': 'optional', 'name': 'monospace'}), + MemberSpec_('fontSize', 'float', 0, 1, {'use': 'optional', 'name': 'fontSize'}), + MemberSpec_('xHeight', 'integer', 0, 1, {'use': 'optional', 'name': 'xHeight'}), + MemberSpec_('kerning', 'int', 0, 1, {'use': 'optional', 'name': 'kerning'}), + MemberSpec_('textColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'textColour'}), + MemberSpec_('textColourRgb', 'integer', 0, 1, {'use': 'optional', 'name': 
'textColourRgb'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('bgColourRgb', 'integer', 0, 1, {'use': 'optional', 'name': 'bgColourRgb'}), + MemberSpec_('reverseVideo', 'boolean', 0, 1, {'use': 'optional', 'name': 'reverseVideo'}), + MemberSpec_('bold', 'boolean', 0, 1, {'use': 'optional', 'name': 'bold'}), + MemberSpec_('italic', 'boolean', 0, 1, {'use': 'optional', 'name': 'italic'}), + MemberSpec_('underlined', 'boolean', 0, 1, {'use': 'optional', 'name': 'underlined'}), + MemberSpec_('underlineStyle', 'pc:UnderlineStyleSimpleType', 0, 1, {'use': 'optional', 'name': 'underlineStyle'}), + MemberSpec_('subscript', 'boolean', 0, 1, {'use': 'optional', 'name': 'subscript'}), + MemberSpec_('superscript', 'boolean', 0, 1, {'use': 'optional', 'name': 'superscript'}), + MemberSpec_('strikethrough', 'boolean', 0, 1, {'use': 'optional', 'name': 'strikethrough'}), + MemberSpec_('smallCaps', 'boolean', 0, 1, {'use': 'optional', 'name': 'smallCaps'}), + MemberSpec_('letterSpaced', 'boolean', 0, 1, {'use': 'optional', 'name': 'letterSpaced'}), ] subclass = None superclass = None @@ -8619,7 +9472,7 @@ def __init__(self, fontFamily=None, serif=None, monospace=None, fontSize=None, x self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.fontFamily = _cast(None, fontFamily) self.fontFamily_nsprefix_ = "pc" self.serif = _cast(bool, serif) @@ -8781,7 +9634,7 @@ def validate_UnderlineStyleSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on UnderlineStyleSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -8803,14 +9656,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextStyleType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextStyleType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextStyleType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextStyleType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextStyleType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextStyleType'): if self.fontFamily is not None and 'fontFamily' not in already_processed: already_processed.add('fontFamily') outfile.write(' fontFamily=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.fontFamily), input_name='fontFamily')), )) @@ -8871,9 +9724,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.letterSpaced is not None and 'letterSpaced' not in already_processed: already_processed.add('letterSpaced') outfile.write(' letterSpaced="%s"' % 
self.gds_format_boolean(self.letterSpaced, input_name='letterSpaced')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextStyleType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextStyleType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='TextStyleType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='TextStyleType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -8920,6 +9773,8 @@ def to_etree(self, parent_element=None, name_='TextStyleType', mapping_=None, ns element.set('letterSpaced', self.gds_format_boolean(self.letterSpaced)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -8927,12 +9782,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('fontFamily', node) if value is not None and 'fontFamily' not in already_processed: already_processed.add('fontFamily') @@ -9072,7 +9927,7 @@ def buildAttributes(self, node, attrs, already_processed): self.letterSpaced = False else: raise_parse_error(node, 'Bad boolean attribute') - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -9080,15 +9935,27 @@ def __hash__(self): class RegionType(GeneratedsSuper): - """For generic use + """custom -- For generic use + continuation -- Is this region a continuation of another region - (in previous column or page, for example)?""" + (in previous column or page, for example)? + + * AlternativeImage -- + Alternative region images + (e.g. black-and-white). + + * Labels -- Semantic labels / tags + * Roles -- + Roles the region takes + (e.g. in context of a parent region). 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), + MemberSpec_('continuation', 'boolean', 0, 1, {'use': 'optional', 'name': 'continuation'}), MemberSpec_('AlternativeImage', 'AlternativeImageType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'AlternativeImage', 'type': 'AlternativeImageType'}, None), MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), MemberSpec_('UserDefined', 'UserDefinedType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'UserDefined', 'type': 'UserDefinedType'}, None), @@ -9116,7 +9983,7 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.custom = _cast(None, custom) @@ -9417,7 +10284,7 @@ def set_continuation(self, continuation): self.continuation = continuation def get_extensiontype_(self): return self.extensiontype_ def set_extensiontype_(self, extensiontype_): self.extensiontype_ = extensiontype_ - def hasContent_(self): + def has__content(self): if ( self.AlternativeImage or self.Coords is not None or @@ -9457,15 +10324,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RegionType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -9486,7 +10353,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' outfile.write(' xsi:type="%s%s"' % (imported_ns_type_prefix_, self.extensiontype_)) else: outfile.write(' xsi:type="%s"' % self.extensiontype_) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionType', fromsubclass_=False, 
pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RegionType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -9548,7 +10415,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for CustomRegion_ in self.CustomRegion: namespaceprefix_ = self.CustomRegion_nsprefix_ + ':' if (UseCapturedNS_ and self.CustomRegion_nsprefix_) else '' CustomRegion_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='CustomRegion', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RegionType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -9564,48 +10431,50 @@ def to_etree(self, parent_element=None, name_='RegionType', mapping_=None, nsmap if self.continuation is not None: element.set('continuation', self.gds_format_boolean(self.continuation)) for AlternativeImage_ in self.AlternativeImage: - AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, nsmap_=nsmap_) + AlternativeImage_.to_etree(element, name_='AlternativeImage', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.UserDefined is not None: UserDefined_ = self.UserDefined - UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, nsmap_=nsmap_) + UserDefined_.to_etree(element, name_='UserDefined', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for Labels_ in self.Labels: - Labels_.to_etree(element, name_='Labels', mapping_=mapping_, nsmap_=nsmap_) + Labels_.to_etree(element, name_='Labels', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.Roles is not None: Roles_ = self.Roles - Roles_.to_etree(element, name_='Roles', mapping_=mapping_, nsmap_=nsmap_) + Roles_.to_etree(element, name_='Roles', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextRegion_ in self.TextRegion: - TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, nsmap_=nsmap_) + TextRegion_.to_etree(element, name_='TextRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ImageRegion_ in self.ImageRegion: - ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, nsmap_=nsmap_) + ImageRegion_.to_etree(element, name_='ImageRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for LineDrawingRegion_ in self.LineDrawingRegion: - LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, nsmap_=nsmap_) + LineDrawingRegion_.to_etree(element, name_='LineDrawingRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for GraphicRegion_ in self.GraphicRegion: - GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, nsmap_=nsmap_) + GraphicRegion_.to_etree(element, name_='GraphicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TableRegion_ in self.TableRegion: - 
TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, nsmap_=nsmap_) + TableRegion_.to_etree(element, name_='TableRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChartRegion_ in self.ChartRegion: - ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, nsmap_=nsmap_) + ChartRegion_.to_etree(element, name_='ChartRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for SeparatorRegion_ in self.SeparatorRegion: - SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, nsmap_=nsmap_) + SeparatorRegion_.to_etree(element, name_='SeparatorRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MathsRegion_ in self.MathsRegion: - MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, nsmap_=nsmap_) + MathsRegion_.to_etree(element, name_='MathsRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for ChemRegion_ in self.ChemRegion: - ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, nsmap_=nsmap_) + ChemRegion_.to_etree(element, name_='ChemRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for MusicRegion_ in self.MusicRegion: - MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, nsmap_=nsmap_) + MusicRegion_.to_etree(element, name_='MusicRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for AdvertRegion_ in self.AdvertRegion: - AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, nsmap_=nsmap_) + AdvertRegion_.to_etree(element, name_='AdvertRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NoiseRegion_ in self.NoiseRegion: - NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, nsmap_=nsmap_) + NoiseRegion_.to_etree(element, name_='NoiseRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for UnknownRegion_ in self.UnknownRegion: - UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, nsmap_=nsmap_) + UnknownRegion_.to_etree(element, name_='UnknownRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for CustomRegion_ in self.CustomRegion: - CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, nsmap_=nsmap_) + CustomRegion_.to_etree(element, name_='CustomRegion', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -9613,12 +10482,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -9644,7 +10513,7 @@ def buildAttributes(self, node, attrs, 
already_processed): if value is not None and 'xsi:type' not in already_processed: already_processed.add('xsi:type') self.extensiontype_ = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'AlternativeImage': obj_ = AlternativeImageType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -9791,12 +10660,14 @@ def set_Coords(self, Coords): class AlternativeImageType(GeneratedsSuper): - """Confidence value (between 0 and 1)""" + """conf -- Confidence value (between 0 and 1) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('filename', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('filename', 'string', 0, 0, {'use': 'required', 'name': 'filename'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), + MemberSpec_('conf', 'pc:ConfSimpleType', 0, 1, {'use': 'optional', 'name': 'conf'}), ] subclass = None superclass = None @@ -9805,7 +10676,7 @@ def __init__(self, filename=None, comments=None, conf=None, gds_collector_=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.filename = _cast(None, filename) self.filename_nsprefix_ = "pc" self.comments = _cast(None, comments) @@ -9854,7 +10725,7 @@ def validate_ConfSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd maxInclusive restriction on ConfSimpleType' % {"value": value, "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( ): @@ -9876,14 +10747,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AlternativeImageType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AlternativeImageType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AlternativeImageType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AlternativeImageType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AlternativeImageType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AlternativeImageType'): if self.filename is not None and 'filename' not in already_processed: already_processed.add('filename') outfile.write(' filename=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.filename), input_name='filename')), )) @@ -9893,9 +10764,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.conf is not None and 'conf' not in already_processed: already_processed.add('conf') outfile.write(' 
conf="%s"' % self.gds_format_float(self.conf, input_name='conf')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AlternativeImageType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AlternativeImageType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='AlternativeImageType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='AlternativeImageType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -9908,6 +10779,8 @@ def to_etree(self, parent_element=None, name_='AlternativeImageType', mapping_=N element.set('conf', self.gds_format_float(self.conf)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -9915,12 +10788,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('filename', node) if value is not None and 'filename' not in already_processed: already_processed.add('filename') @@ -9935,7 +10808,7 @@ def buildAttributes(self, node, attrs, already_processed): value = self.gds_parse_float(value, node, 'conf') self.conf = value self.validate_ConfSimpleType(self.conf) # validate type ConfSimpleType - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -9943,8 +10816,11 @@ def __hash__(self): class GraphemesType(GeneratedsSuper): - """Container for graphemes, grapheme groups and - non-printing characters.""" + """GraphemesType -- + Container for graphemes, grapheme groups and + non-printing characters. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Grapheme', 'GraphemeType', 1, 0, {'name': 'Grapheme', 'type': 'GraphemeType'}, 8), @@ -9958,7 +10834,7 @@ def __init__(self, Grapheme=None, NonPrintingChar=None, GraphemeGroup=None, gds_ self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if Grapheme is None: self.Grapheme = [] else: @@ -10019,7 +10895,7 @@ def insert_GraphemeGroup_at(self, index, value): self.GraphemeGroup.insert(index, value) def replace_GraphemeGroup_at(self, index, value): self.GraphemeGroup[index] = value - def hasContent_(self): + def has__content(self): if ( self.Grapheme or self.NonPrintingChar or @@ -10043,17 +10919,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemesType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemesType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemesType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemesType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemesType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemesType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemesType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemesType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -10067,19 +10943,21 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for GraphemeGroup_ in self.GraphemeGroup: namespaceprefix_ = self.GraphemeGroup_nsprefix_ + ':' if (UseCapturedNS_ and self.GraphemeGroup_nsprefix_) else '' GraphemeGroup_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='GraphemeGroup', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphemesType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GraphemesType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for Grapheme_ in self.Grapheme: - Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, nsmap_=nsmap_) + Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NonPrintingChar_ in 
self.NonPrintingChar: - NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, nsmap_=nsmap_) + NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for GraphemeGroup_ in self.GraphemeGroup: - GraphemeGroup_.to_etree(element, name_='GraphemeGroup', mapping_=mapping_, nsmap_=nsmap_) + GraphemeGroup_.to_etree(element, name_='GraphemeGroup', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10087,14 +10965,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Grapheme': obj_ = GraphemeType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10116,20 +10994,29 @@ def __hash__(self): class GraphemeBaseType(GeneratedsSuper): - """Base type for graphemes, grapheme groups and non-printing characters. - Order index of grapheme, group, or non-printing character - within the parent container (graphemes or glyph or grapheme group). - Type of character represented by the - grapheme, group, or non-printing character element. - For generic useFor generic use""" + """GraphemeBaseType -- + Base type for graphemes, grapheme groups and non-printing characters. + + * index -- + Order index of grapheme, group, or non-printing character + within the parent container (graphemes or glyph or grapheme group). + + * charType -- + Type of character represented by the + grapheme, group, or non-printing character element. 
+ + * custom -- For generic use + * comments -- For generic use + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('id', 'string', 0, 0, {'use': 'required'}), - MemberSpec_('index', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('charType', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('custom', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('comments', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('id', 'string', 0, 0, {'use': 'required', 'name': 'id'}), + MemberSpec_('index', 'indexType2', 0, 0, {'use': 'required', 'name': 'index'}), + MemberSpec_('ligature', 'boolean', 0, 1, {'use': 'optional', 'name': 'ligature'}), + MemberSpec_('charType', 'charTypeType', 0, 1, {'use': 'optional', 'name': 'charType'}), + MemberSpec_('custom', 'string', 0, 1, {'use': 'optional', 'name': 'custom'}), + MemberSpec_('comments', 'string', 0, 1, {'use': 'optional', 'name': 'comments'}), MemberSpec_('TextEquiv', 'TextEquivType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'TextEquiv', 'type': 'TextEquivType'}, None), ] subclass = None @@ -10139,7 +11026,7 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.id = _cast(None, id) self.id_nsprefix_ = "pc" self.index = _cast(int, index) @@ -10209,7 +11096,31 @@ def set_comments(self, comments): self.comments = comments def get_extensiontype_(self): return self.extensiontype_ def set_extensiontype_(self, extensiontype_): self.extensiontype_ = extensiontype_ - def hasContent_(self): + def validate_indexType2(self, value): + # Validate type indexType2, a restriction on int. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, int): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (int)' % {"value": value, "lineno": lineno, }) + return False + if value < 0: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd minInclusive restriction on indexType2' % {"value": value, "lineno": lineno} ) + result = False + def validate_charTypeType(self, value): + # Validate type charTypeType, a restriction on string. 
+ if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['base', 'combining'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on charTypeType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( self.TextEquiv ): @@ -10231,15 +11142,15 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeBaseType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeBaseType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeBaseType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeBaseType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeBaseType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeBaseType'): if self.id is not None and 'id' not in already_processed: already_processed.add('id') outfile.write(' id=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.id), input_name='id')), )) @@ -10266,7 +11177,7 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' outfile.write(' xsi:type="%s%s"' % (imported_ns_type_prefix_, self.extensiontype_)) else: outfile.write(' xsi:type="%s"' % self.extensiontype_) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeBaseType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeBaseType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -10274,7 +11185,7 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for TextEquiv_ in self.TextEquiv: namespaceprefix_ = self.TextEquiv_nsprefix_ + ':' if (UseCapturedNS_ and self.TextEquiv_nsprefix_) else '' TextEquiv_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TextEquiv', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphemeBaseType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='GraphemeBaseType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -10294,9 +11205,11 @@ 
def to_etree(self, parent_element=None, name_='GraphemeBaseType', mapping_=None, if self.comments is not None: element.set('comments', self.gds_format_string(self.comments)) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10304,12 +11217,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('id', node) if value is not None and 'id' not in already_processed: already_processed.add('id') @@ -10318,6 +11231,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'index' not in already_processed: already_processed.add('index') self.index = self.gds_parse_integer(value, node, 'index') + self.validate_indexType2(self.index) # validate type indexType2 value = find_attr_value_('ligature', node) if value is not None and 'ligature' not in already_processed: already_processed.add('ligature') @@ -10331,6 +11245,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'charType' not in already_processed: already_processed.add('charType') self.charType = value + self.validate_charTypeType(self.charType) # validate type charTypeType value = find_attr_value_('custom', node) if value is not None and 'custom' not in already_processed: already_processed.add('custom') @@ -10343,7 +11258,7 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'xsi:type' not in already_processed: already_processed.add('xsi:type') self.extensiontype_ = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'TextEquiv': obj_ = TextEquivType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10355,9 +11270,12 @@ def __hash__(self): class GraphemeType(GraphemeBaseType): - """Represents a sub-element of a glyph. + """GraphemeType -- + Represents a sub-element of a glyph. Smallest graphical unit that can be - assigned a Unicode code point.""" + assigned a Unicode code point. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('Coords', 'CoordsType', 0, 0, {'name': 'Coords', 'type': 'CoordsType'}, None), @@ -10369,8 +11287,8 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(GraphemeType, self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("GraphemeType"), self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) self.Coords = Coords self.Coords_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -10392,10 +11310,10 @@ def get_Coords(self): return self.Coords def set_Coords(self, Coords): self.Coords = Coords - def hasContent_(self): + def has__content(self): if ( self.Coords is not None or - super(GraphemeType, self).hasContent_() + super(GraphemeType, self).has__content() ): return True else: @@ -10415,18 +11333,18 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeType'): - super(GraphemeType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeType', fromsubclass_=False, pretty_print=True): - super(GraphemeType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeType'): + super(GraphemeType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeType', fromsubclass_=False, pretty_print=True): + super(GraphemeType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -10434,13 +11352,15 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.Coords is not None: namespaceprefix_ = self.Coords_nsprefix_ + ':' if (UseCapturedNS_ and self.Coords_nsprefix_) else '' self.Coords.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Coords', pretty_print=pretty_print) - def 
to_etree(self, parent_element=None, name_='GraphemeType', mapping_=None, nsmap_=None): - element = super(GraphemeType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='GraphemeType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(GraphemeType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.Coords is not None: Coords_ = self.Coords - Coords_.to_etree(element, name_='Coords', mapping_=mapping_, nsmap_=nsmap_) + Coords_.to_etree(element, name_='Coords', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10448,30 +11368,33 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(GraphemeType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildAttributes(self, node, attrs, already_processed): + super(GraphemeType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Coords': obj_ = CoordsType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) self.Coords = obj_ obj_.original_tagname_ = 'Coords' - super(GraphemeType, self).buildChildren(child_, node, nodeName_, True) + super(GraphemeType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) # end class GraphemeType class NonPrintingCharType(GraphemeBaseType): - """A glyph component without visual representation + """NonPrintingCharType -- + A glyph component without visual representation but with Unicode code point. Non-visual / non-printing / control character. - Part of grapheme container (of glyph) or grapheme sub group.""" + Part of grapheme container (of glyph) or grapheme sub group. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ ] @@ -10482,8 +11405,8 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(NonPrintingCharType, self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("NonPrintingCharType"), self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) def factory(*args_, **kwargs_): if CurrentSubclassModule_ is not None: subclass = getSubclassFromModule_( @@ -10499,9 +11422,9 @@ def get_ns_prefix_(self): return self.ns_prefix_ def set_ns_prefix_(self, ns_prefix): self.ns_prefix_ = ns_prefix - def hasContent_(self): + def has__content(self): if ( - super(NonPrintingCharType, self).hasContent_() + super(NonPrintingCharType, self).has__content() ): return True else: @@ -10521,22 +11444,24 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NonPrintingCharType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NonPrintingCharType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NonPrintingCharType'): - super(NonPrintingCharType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NonPrintingCharType', fromsubclass_=False, pretty_print=True): - super(NonPrintingCharType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='NonPrintingCharType', mapping_=None, nsmap_=None): - element = super(NonPrintingCharType, self).to_etree(parent_element, name_, mapping_) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NonPrintingCharType'): + super(NonPrintingCharType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NonPrintingCharType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NonPrintingCharType', fromsubclass_=False, pretty_print=True): + super(NonPrintingCharType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='NonPrintingCharType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(NonPrintingCharType, 
self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10544,15 +11469,15 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(NonPrintingCharType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(NonPrintingCharType, self).buildChildren(child_, node, nodeName_, True) + def _buildAttributes(self, node, attrs, already_processed): + super(NonPrintingCharType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(NonPrintingCharType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -10572,8 +11497,8 @@ def __init__(self, id=None, index=None, ligature=None, charType=None, custom=Non self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(GraphemeGroupType, self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("GraphemeGroupType"), self).__init__(id, index, ligature, charType, custom, comments, TextEquiv, **kwargs_) if Grapheme is None: self.Grapheme = [] else: @@ -10619,11 +11544,11 @@ def insert_NonPrintingChar_at(self, index, value): self.NonPrintingChar.insert(index, value) def replace_NonPrintingChar_at(self, index, value): self.NonPrintingChar[index] = value - def hasContent_(self): + def has__content(self): if ( self.Grapheme or self.NonPrintingChar or - super(GraphemeGroupType, self).hasContent_() + super(GraphemeGroupType, self).has__content() ): return True else: @@ -10643,18 +11568,18 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeGroupType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphemeGroupType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeGroupType'): 
- super(GraphemeGroupType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeGroupType', fromsubclass_=False, pretty_print=True): - super(GraphemeGroupType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphemeGroupType'): + super(GraphemeGroupType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphemeGroupType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphemeGroupType', fromsubclass_=False, pretty_print=True): + super(GraphemeGroupType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -10665,14 +11590,16 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for NonPrintingChar_ in self.NonPrintingChar: namespaceprefix_ = self.NonPrintingChar_nsprefix_ + ':' if (UseCapturedNS_ and self.NonPrintingChar_nsprefix_) else '' NonPrintingChar_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='NonPrintingChar', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphemeGroupType', mapping_=None, nsmap_=None): - element = super(GraphemeGroupType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='GraphemeGroupType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(GraphemeGroupType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) for Grapheme_ in self.Grapheme: - Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, nsmap_=nsmap_) + Grapheme_.to_etree(element, name_='Grapheme', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for NonPrintingChar_ in self.NonPrintingChar: - NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, nsmap_=nsmap_) + NonPrintingChar_.to_etree(element, name_='NonPrintingChar', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10680,14 +11607,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(GraphemeGroupType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildAttributes(self, node, attrs, already_processed): + super(GraphemeGroupType, self)._buildAttributes(node, 
attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Grapheme': obj_ = GraphemeType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10698,14 +11625,16 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.build(child_, gds_collector_=gds_collector_) self.NonPrintingChar.append(obj_) obj_.original_tagname_ = 'NonPrintingChar' - super(GraphemeGroupType, self).buildChildren(child_, node, nodeName_, True) + super(GraphemeGroupType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) # end class GraphemeGroupType class UserDefinedType(GeneratedsSuper): - """Container for user-defined attributes""" + """UserDefinedType -- Container for user-defined attributes + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('UserAttribute', 'UserAttributeType', 1, 0, {'maxOccurs': 'unbounded', 'minOccurs': '1', 'name': 'UserAttribute', 'type': 'UserAttributeType'}, None), @@ -10717,7 +11646,7 @@ def __init__(self, UserAttribute=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" if UserAttribute is None: self.UserAttribute = [] else: @@ -10748,7 +11677,7 @@ def insert_UserAttribute_at(self, index, value): self.UserAttribute.insert(index, value) def replace_UserAttribute_at(self, index, value): self.UserAttribute[index] = value - def hasContent_(self): + def has__content(self): if ( self.UserAttribute ): @@ -10770,17 +11699,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserDefinedType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserDefinedType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserDefinedType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserDefinedType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserDefinedType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserDefinedType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserDefinedType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserDefinedType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -10788,15 +11717,17 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml for UserAttribute_ in self.UserAttribute: namespaceprefix_ = self.UserAttribute_nsprefix_ + ':' if 
(UseCapturedNS_ and self.UserAttribute_nsprefix_) else '' UserAttribute_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserAttribute', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UserDefinedType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UserDefinedType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) for UserAttribute_ in self.UserAttribute: - UserAttribute_.to_etree(element, name_='UserAttribute', mapping_=mapping_, nsmap_=nsmap_) + UserAttribute_.to_etree(element, name_='UserAttribute', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10804,14 +11735,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'UserAttribute': obj_ = UserAttributeType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -10823,13 +11754,15 @@ def __hash__(self): class UserAttributeType(GeneratedsSuper): - """Structured custom data defined by name, type and value.""" + """UserAttributeType -- Structured custom data defined by name, type and value. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('name', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('description', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), - MemberSpec_('value', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('name', 'string', 0, 1, {'use': 'optional', 'name': 'name'}), + MemberSpec_('description', 'string', 0, 1, {'use': 'optional', 'name': 'description'}), + MemberSpec_('type_', 'typeType3', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('value', 'string', 0, 1, {'use': 'optional', 'name': 'value'}), ] subclass = None superclass = None @@ -10838,7 +11771,7 @@ def __init__(self, name=None, description=None, type_=None, value=None, gds_coll self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.name = _cast(None, name) self.name_nsprefix_ = "pc" self.description = _cast(None, description) @@ -10878,7 +11811,20 @@ def get_value(self): return self.value def set_value(self, value): self.value = value - def hasContent_(self): + def validate_typeType3(self, value): + # Validate type typeType3, a restriction on string. + if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: + if not isinstance(value, str): + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) + return False + value = value + enumerations = ['xsd:string', 'xsd:integer', 'xsd:boolean', 'xsd:float'] + if value not in enumerations: + lineno = self.gds_get_node_lineno_() + self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on typeType3' % {"value" : encode_str_2_3(value), "lineno": lineno} ) + result = False + def has__content(self): if ( ): @@ -10900,14 +11846,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserAttributeType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UserAttributeType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserAttributeType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UserAttributeType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserAttributeType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UserAttributeType'): if self.name is not None and 'name' not in already_processed: already_processed.add('name') outfile.write(' name=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.name), input_name='name')), )) @@ -10920,9 +11866,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.value is not None and 'value' not in already_processed: already_processed.add('value') 
outfile.write(' value=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.value), input_name='value')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserAttributeType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UserAttributeType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='UserAttributeType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='UserAttributeType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -10937,6 +11883,8 @@ def to_etree(self, parent_element=None, name_='UserAttributeType', mapping_=None element.set('value', self.gds_format_string(self.value)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -10944,12 +11892,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('name', node) if value is not None and 'name' not in already_processed: already_processed.add('name') @@ -10962,11 +11910,12 @@ def buildAttributes(self, node, attrs, already_processed): if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value + self.validate_typeType3(self.type_) # validate type typeType3 value = find_attr_value_('value', node) if value is not None and 'value' not in already_processed: already_processed.add('value') self.value = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -10974,17 +11923,21 @@ def __hash__(self): class TableCellRoleType(GeneratedsSuper): - """Cell position in table starting with row 0Cell position in table - starting with column 0Number of rows the cell spans (optional; default - is 1)Number of columns the cell spans (optional; default is 1) - Is the cell a column or row header?""" + """rowIndex -- Cell position in table starting with row 0 + columnIndex -- Cell position in table starting with column 0 + rowSpan -- Number of rows the cell spans (optional; default is 1) + colSpan -- Number of columns the cell spans (optional; default is 1) + header -- + Is the cell a column or row header? 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('rowIndex', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('columnIndex', 'int', 0, 0, {'use': 'required'}), - MemberSpec_('rowSpan', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('colSpan', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('header', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('rowIndex', 'int', 0, 0, {'use': 'required', 'name': 'rowIndex'}), + MemberSpec_('columnIndex', 'int', 0, 0, {'use': 'required', 'name': 'columnIndex'}), + MemberSpec_('rowSpan', 'int', 0, 1, {'use': 'optional', 'name': 'rowSpan'}), + MemberSpec_('colSpan', 'int', 0, 1, {'use': 'optional', 'name': 'colSpan'}), + MemberSpec_('header', 'boolean', 0, 1, {'use': 'optional', 'name': 'header'}), ] subclass = None superclass = None @@ -10993,7 +11946,7 @@ def __init__(self, rowIndex=None, columnIndex=None, rowSpan=None, colSpan=None, self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.rowIndex = _cast(int, rowIndex) self.rowIndex_nsprefix_ = "pc" self.columnIndex = _cast(int, columnIndex) @@ -11039,7 +11992,7 @@ def get_header(self): return self.header def set_header(self, header): self.header = header - def hasContent_(self): + def has__content(self): if ( ): @@ -11061,14 +12014,14 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableCellRoleType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableCellRoleType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableCellRoleType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableCellRoleType', pretty_print=pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableCellRoleType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableCellRoleType'): if self.rowIndex is not None and 'rowIndex' not in already_processed: already_processed.add('rowIndex') outfile.write(' rowIndex="%s"' % self.gds_format_integer(self.rowIndex, input_name='rowIndex')) @@ -11084,9 +12037,9 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.header is not None and 'header' not in already_processed: already_processed.add('header') outfile.write(' header="%s"' % self.gds_format_boolean(self.header, input_name='header')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableCellRoleType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableCellRoleType', fromsubclass_=False, pretty_print=True): pass - def to_etree(self, parent_element=None, name_='TableCellRoleType', mapping_=None, 
nsmap_=None): + def to_etree(self, parent_element=None, name_='TableCellRoleType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: @@ -11103,6 +12056,8 @@ def to_etree(self, parent_element=None, name_='TableCellRoleType', mapping_=None element.set('header', self.gds_format_boolean(self.header)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11110,12 +12065,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('rowIndex', node) if value is not None and 'rowIndex' not in already_processed: already_processed.add('rowIndex') @@ -11141,7 +12096,7 @@ def buildAttributes(self, node, attrs, already_processed): self.header = False else: raise_parse_error(node, 'Bad boolean attribute') - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): pass def __hash__(self): return hash(self.id) @@ -11149,6 +12104,11 @@ def __hash__(self): class RolesType(GeneratedsSuper): + """TableCellRole -- + Data for a region that takes on the role + of a table cell within a parent table region. 
+ + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ MemberSpec_('TableCellRole', 'TableCellRoleType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'TableCellRole', 'type': 'TableCellRoleType'}, None), @@ -11160,7 +12120,7 @@ def __init__(self, TableCellRole=None, gds_collector_=None, **kwargs_): self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None + self.ns_prefix_ = "pc" self.TableCellRole = TableCellRole self.TableCellRole_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -11182,7 +12142,7 @@ def get_TableCellRole(self): return self.TableCellRole def set_TableCellRole(self, TableCellRole): self.TableCellRole = TableCellRole - def hasContent_(self): + def has__content(self): if ( self.TableCellRole is not None ): @@ -11204,17 +12164,17 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="h showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RolesType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='RolesType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RolesType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='RolesType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RolesType'): + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='RolesType'): pass - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RolesType', fromsubclass_=False, pretty_print=True): + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='RolesType', fromsubclass_=False, pretty_print=True): if pretty_print: eol_ = '\n' else: @@ -11222,16 +12182,18 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xml if self.TableCellRole is not None: namespaceprefix_ = self.TableCellRole_nsprefix_ + ':' if (UseCapturedNS_ and self.TableCellRole_nsprefix_) else '' self.TableCellRole.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TableCellRole', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='RolesType', mapping_=None, nsmap_=None): + def to_etree(self, parent_element=None, name_='RolesType', mapping_=None, reverse_mapping_=None, nsmap_=None): if parent_element is None: element = etree_.Element('{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) else: element = etree_.SubElement(parent_element, '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}' + name_, nsmap=nsmap_) if self.TableCellRole is not None: TableCellRole_ = self.TableCellRole - TableCellRole_.to_etree(element, name_='TableCellRole', mapping_=mapping_, nsmap_=nsmap_) + TableCellRole_.to_etree(element, name_='TableCellRole', 
mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11239,14 +12201,14 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'TableCellRole': obj_ = TableCellRoleType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -11258,14 +12220,19 @@ def __hash__(self): class CustomRegionType(RegionType): - """Regions containing content that is not covered + """CustomRegionType -- + Regions containing content that is not covered by the default types (text, graphic, image, line drawing, chart, table, separator, maths, map, music, chem, advert, noise, unknown). - Information on the type of content represented by this region""" + + * type -- + Information on the type of content represented by this region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('type_', 'string', 0, 1, {'use': 'optional'}), + MemberSpec_('type_', 'string', 0, 1, {'use': 'optional', 'name': 'type_'}), ] subclass = None superclass = RegionType @@ -11274,8 +12241,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(CustomRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("CustomRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.type_ = _cast(None, type_) self.type__nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -11297,14 +12264,14 @@ def get_type(self): return self.type_ def set_type(self, type_): self.type_ = type_ - def hasContent_(self): + def has__content(self): if ( - super(CustomRegionType, self).hasContent_() + super(CustomRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='CustomRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CustomRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('CustomRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11319,27 +12286,29 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='C showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CustomRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='CustomRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CustomRegionType'): - super(CustomRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='CustomRegionType'): + super(CustomRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='CustomRegionType') if self.type_ is not None and 'type_' not in already_processed: already_processed.add('type_') outfile.write(' type=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.type_), input_name='type')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='CustomRegionType', fromsubclass_=False, pretty_print=True): - super(CustomRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='CustomRegionType', mapping_=None, nsmap_=None): - element = super(CustomRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='CustomRegionType', fromsubclass_=False, pretty_print=True): + super(CustomRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='CustomRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(CustomRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.type_ is not None: element.set('type', self.gds_format_string(self.type_)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11347,19 +12316,19 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, 
already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('type', node) if value is not None and 'type' not in already_processed: already_processed.add('type') self.type_ = value - super(CustomRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(CustomRegionType, self).buildChildren(child_, node, nodeName_, True) + super(CustomRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(CustomRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11367,7 +12336,10 @@ def __hash__(self): class UnknownRegionType(RegionType): - """To be used if the region type cannot be ascertained.""" + """UnknownRegionType -- + To be used if the region type cannot be ascertained. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ ] @@ -11378,8 +12350,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(UnknownRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("UnknownRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) def factory(*args_, **kwargs_): if CurrentSubclassModule_ is not None: subclass = getSubclassFromModule_( @@ -11395,14 +12367,14 @@ def get_ns_prefix_(self): return self.ns_prefix_ def set_ns_prefix_(self, ns_prefix): self.ns_prefix_ = ns_prefix - def hasContent_(self): + def has__content(self): if ( - super(UnknownRegionType, self).hasContent_() + super(UnknownRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='UnknownRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnknownRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('UnknownRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11417,22 +12389,24 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='U showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, 
namespaceprefix_, name_='UnknownRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnknownRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='UnknownRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnknownRegionType'): - super(UnknownRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='UnknownRegionType', fromsubclass_=False, pretty_print=True): - super(UnknownRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='UnknownRegionType', mapping_=None, nsmap_=None): - element = super(UnknownRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='UnknownRegionType'): + super(UnknownRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='UnknownRegionType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='UnknownRegionType', fromsubclass_=False, pretty_print=True): + super(UnknownRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='UnknownRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(UnknownRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11440,15 +12414,15 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(UnknownRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(UnknownRegionType, self).buildChildren(child_, node, nodeName_, True) + def _buildAttributes(self, node, attrs, already_processed): + super(UnknownRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(UnknownRegionType, self)._buildChildren(child_, node, nodeName_, True) 
pass def __hash__(self): return hash(self.id) @@ -11456,9 +12430,12 @@ def __hash__(self): class NoiseRegionType(RegionType): - """Noise regions are regions where no real data lies, only + """NoiseRegionType -- + Noise regions are regions where no real data lies, only false data created by artifacts on the document or - scanner noise.""" + scanner noise. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ ] @@ -11469,8 +12446,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(NoiseRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("NoiseRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) def factory(*args_, **kwargs_): if CurrentSubclassModule_ is not None: subclass = getSubclassFromModule_( @@ -11486,14 +12463,14 @@ def get_ns_prefix_(self): return self.ns_prefix_ def set_ns_prefix_(self, ns_prefix): self.ns_prefix_ = ns_prefix - def hasContent_(self): + def has__content(self): if ( - super(NoiseRegionType, self).hasContent_() + super(NoiseRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='NoiseRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NoiseRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('NoiseRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11508,22 +12485,24 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='N showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NoiseRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='NoiseRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NoiseRegionType'): - super(NoiseRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType') - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='NoiseRegionType', 
fromsubclass_=False, pretty_print=True): - super(NoiseRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='NoiseRegionType', mapping_=None, nsmap_=None): - element = super(NoiseRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='NoiseRegionType'): + super(NoiseRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='NoiseRegionType') + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='NoiseRegionType', fromsubclass_=False, pretty_print=True): + super(NoiseRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='NoiseRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(NoiseRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11531,15 +12510,15 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): - super(NoiseRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(NoiseRegionType, self).buildChildren(child_, node, nodeName_, True) + def _buildAttributes(self, node, attrs, already_processed): + super(NoiseRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(NoiseRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11547,17 +12526,24 @@ def __hash__(self): class AdvertRegionType(RegionType): - """Regions containing advertisements. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The background colour of the region""" + """AdvertRegionType -- + Regions containing advertisements. + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+ Range: -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -11566,8 +12552,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(AdvertRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("AdvertRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -11608,14 +12594,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(AdvertRegionType, self).hasContent_() + super(AdvertRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='AdvertRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AdvertRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('AdvertRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11630,32 +12616,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='A showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AdvertRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AdvertRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, 
namespaceprefix_='', name_='AdvertRegionType'): - super(AdvertRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AdvertRegionType'): + super(AdvertRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.bgColour is not None and 'bgColour' not in already_processed: already_processed.add('bgColour') outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='AdvertRegionType', fromsubclass_=False, pretty_print=True): - super(AdvertRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='AdvertRegionType', mapping_=None, nsmap_=None): - element = super(AdvertRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AdvertRegionType', fromsubclass_=False, pretty_print=True): + super(AdvertRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='AdvertRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(AdvertRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.bgColour is not None: element.set('bgColour', self.gds_format_string(self.bgColour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11663,12 +12651,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -11679,9 +12667,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('bgColour') self.bgColour = value self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType - super(AdvertRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(AdvertRegionType, 
self).buildChildren(child_, node, nodeName_, True) + super(AdvertRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(AdvertRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11700,17 +12688,24 @@ def set_orientation(self, orientation): class MusicRegionType(RegionType): - """Regions containing musical notations. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The background colour of the region""" + """MusicRegionType -- + Regions containing musical notations. + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -11719,8 +12714,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(MusicRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("MusicRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -11761,14 +12756,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(MusicRegionType, self).hasContent_() + super(MusicRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MusicRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MusicRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('MusicRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11783,32 
+12778,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='M showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MusicRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MusicRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MusicRegionType'): - super(MusicRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MusicRegionType'): + super(MusicRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.bgColour is not None and 'bgColour' not in already_processed: already_processed.add('bgColour') outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MusicRegionType', fromsubclass_=False, pretty_print=True): - super(MusicRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MusicRegionType', mapping_=None, nsmap_=None): - element = super(MusicRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MusicRegionType', fromsubclass_=False, pretty_print=True): + super(MusicRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='MusicRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(MusicRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.bgColour is not None: element.set('bgColour', self.gds_format_string(self.bgColour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11816,12 +12813,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + 
self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -11832,9 +12829,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('bgColour') self.bgColour = value self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType - super(MusicRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(MusicRegionType, self).buildChildren(child_, node, nodeName_, True) + super(MusicRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(MusicRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11853,16 +12850,21 @@ def set_orientation(self, orientation): class MapRegionType(RegionType): - """Regions containing maps. - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180""" + """MapRegionType -- + Regions containing maps. + + * orientation -- + The angle the rectangle encapsulating a + region has to be rotated in clockwise + direction in order to correct the present + skew (negative values indicate + anti-clockwise rotation). 
Range: + -179.999,180 + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), ] subclass = None superclass = RegionType @@ -11871,8 +12873,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(MapRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("MapRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" def factory(*args_, **kwargs_): @@ -11894,14 +12896,14 @@ def get_orientation(self): return self.orientation def set_orientation(self, orientation): self.orientation = orientation - def hasContent_(self): + def has__content(self): if ( - super(MapRegionType, self).hasContent_() + super(MapRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MapRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MapRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('MapRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -11916,27 +12918,29 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='M showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MapRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MapRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MapRegionType'): - super(MapRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MapRegionType'): + super(MapRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MapRegionType') if self.orientation is not 
None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MapRegionType', fromsubclass_=False, pretty_print=True): - super(MapRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MapRegionType', mapping_=None, nsmap_=None): - element = super(MapRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MapRegionType', fromsubclass_=False, pretty_print=True): + super(MapRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='MapRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(MapRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -11944,20 +12948,20 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') value = self.gds_parse_float(value, node, 'orientation') self.orientation = value - super(MapRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(MapRegionType, self).buildChildren(child_, node, nodeName_, True) + super(MapRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(MapRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -11976,18 +12980,25 @@ def set_orientation(self, orientation): class ChemRegionType(RegionType): - """Regions containing chemical formulas. - The angle the rectangle encapsulating a - region has to be rotated in clockwise - direction in order to correct the present - skew (negative values indicate - anti-clockwise rotation). Range: - -179.999,180 - The background colour of the region""" + """ChemRegionType -- + Regions containing chemical formulas. 
+ + * orientation -- + The angle the rectangle encapsulating a + region has to be rotated in clockwise + direction in order to correct the present + skew (negative values indicate + anti-clockwise rotation). Range: + -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -11996,8 +13007,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(ChemRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("ChemRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -12038,14 +13049,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(ChemRegionType, self).hasContent_() + super(ChemRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChemRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChemRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('ChemRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12060,32 +13071,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='C showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChemRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChemRegionType', pretty_print=pretty_print) 
showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChemRegionType'): - super(ChemRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChemRegionType'): + super(ChemRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.bgColour is not None and 'bgColour' not in already_processed: already_processed.add('bgColour') outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChemRegionType', fromsubclass_=False, pretty_print=True): - super(ChemRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ChemRegionType', mapping_=None, nsmap_=None): - element = super(ChemRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChemRegionType', fromsubclass_=False, pretty_print=True): + super(ChemRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='ChemRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(ChemRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.bgColour is not None: element.set('bgColour', self.gds_format_string(self.bgColour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12093,12 +13106,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12109,9 +13122,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('bgColour') self.bgColour = value self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType - super(ChemRegionType, 
self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(ChemRegionType, self).buildChildren(child_, node, nodeName_, True) + super(ChemRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(ChemRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12130,18 +13143,25 @@ def set_orientation(self, orientation): class MathsRegionType(RegionType): - """Regions containing equations and mathematical symbols + """MathsRegionType -- + Regions containing equations and mathematical symbols should be marked as maths regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The background colour of the region""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * bgColour -- + The background colour of the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), ] subclass = None superclass = RegionType @@ -12150,8 +13170,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(MathsRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("MathsRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.bgColour = _cast(None, bgColour) @@ -12192,14 +13212,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(MathsRegionType, self).hasContent_() + super(MathsRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MathsRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MathsRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('MathsRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12214,32 +13234,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='M showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MathsRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MathsRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MathsRegionType'): - super(MathsRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MathsRegionType'): + super(MathsRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.bgColour is not None and 'bgColour' not in already_processed: already_processed.add('bgColour') outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='MathsRegionType', fromsubclass_=False, pretty_print=True): - super(MathsRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='MathsRegionType', mapping_=None, nsmap_=None): - element = super(MathsRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MathsRegionType', fromsubclass_=False, pretty_print=True): + super(MathsRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='MathsRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(MathsRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.bgColour is not None: element.set('bgColour', self.gds_format_string(self.bgColour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, 
gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12247,12 +13269,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12263,9 +13285,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('bgColour') self.bgColour = value self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType - super(MathsRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(MathsRegionType, self).buildChildren(child_, node, nodeName_, True) + super(MathsRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(MathsRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12284,19 +13306,26 @@ def set_orientation(self, orientation): class SeparatorRegionType(RegionType): - """Separators are lines that lie between columns and + """SeparatorRegionType -- + Separators are lines that lie between columns and paragraphs and can be used to logically separate different articles from each other. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The colour of the separator""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+ Range: -179.999,180 + + * colour -- + The colour of the separator + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('colour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('colour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'colour'}), ] subclass = None superclass = RegionType @@ -12305,8 +13334,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(SeparatorRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("SeparatorRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.colour = _cast(None, colour) @@ -12347,14 +13376,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(SeparatorRegionType, self).hasContent_() + super(SeparatorRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='SeparatorRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='SeparatorRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('SeparatorRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12369,32 +13398,34 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='S showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='SeparatorRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='SeparatorRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, 
already_processed, namespaceprefix_='', name_='SeparatorRegionType'): - super(SeparatorRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='SeparatorRegionType'): + super(SeparatorRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) if self.colour is not None and 'colour' not in already_processed: already_processed.add('colour') outfile.write(' colour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.colour), input_name='colour')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='SeparatorRegionType', fromsubclass_=False, pretty_print=True): - super(SeparatorRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='SeparatorRegionType', mapping_=None, nsmap_=None): - element = super(SeparatorRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='SeparatorRegionType', fromsubclass_=False, pretty_print=True): + super(SeparatorRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='SeparatorRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(SeparatorRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.colour is not None: element.set('colour', self.gds_format_string(self.colour)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12402,12 +13433,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12418,9 +13449,9 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('colour') self.colour = value self.validate_ColourSimpleType(self.colour) # validate type ColourSimpleType - super(SeparatorRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, 
gds_collector_=None): - super(SeparatorRegionType, self).buildChildren(child_, node, nodeName_, True) + super(SeparatorRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(SeparatorRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12439,26 +13470,39 @@ def set_orientation(self, orientation): class ChartRegionType(RegionType): - """Regions containing charts or graphs of any type, should + """ChartRegionType -- + Regions containing charts or graphs of any type, should be marked as chart regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The type of chart in the region - An approximation of the number of colours - used in the region - The background colour of the region - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * type -- + The type of chart in the region + + * numColours -- + An approximation of the number of colours + used in the region + + * bgColour -- + The background colour of the region + + * embText -- + Specifies whether the region also contains + text + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:ChartTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:ChartTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional', 'name': 'numColours'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -12467,8 +13511,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(ChartRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("ChartRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -12540,14 
+13584,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(ChartRegionType, self).hasContent_() + super(ChartRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChartRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChartRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('ChartRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12562,16 +13606,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='C showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChartRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChartRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChartRegionType'): - super(ChartRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChartRegionType'): + super(ChartRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChartRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -12587,10 +13631,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ChartRegionType', fromsubclass_=False, pretty_print=True): - super(ChartRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ChartRegionType', mapping_=None, nsmap_=None): - element = super(ChartRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChartRegionType', fromsubclass_=False, pretty_print=True): + 
super(ChartRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='ChartRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(ChartRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.type_ is not None: @@ -12603,6 +13647,8 @@ def to_etree(self, parent_element=None, name_='ChartRegionType', mapping_=None, element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12610,12 +13656,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12644,9 +13690,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(ChartRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(ChartRegionType, self).buildChildren(child_, node, nodeName_, True) + super(ChartRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(ChartRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -12665,30 +13711,49 @@ def set_orientation(self, orientation): class TableRegionType(RegionType): - """Tabular data in any form is represented with a table + """TableRegionType -- + Tabular data in any form is represented with a table region. Rows and columns may or may not have separator lines; these lines are not separator regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The number of rows present in the table - The number of columns present in the table - The colour of the lines used in the region - The background colour of the region - Specifies the presence of line separators - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+ Range: -179.999,180 + + * rows -- + The number of rows present in the table + + * columns -- + The number of columns present in the table + + * lineColour -- + The colour of the lines used in the region + + * bgColour -- + The background colour of the region + + * lineSeparators -- + Specifies the presence of line separators + + * embText -- + Specifies whether the region also contains + text + + * Grid -- Table grid (visible or virtual grid lines) + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('rows', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('columns', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('lineColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('lineSeparators', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('rows', 'int', 0, 1, {'use': 'optional', 'name': 'rows'}), + MemberSpec_('columns', 'int', 0, 1, {'use': 'optional', 'name': 'columns'}), + MemberSpec_('lineColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'lineColour'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('lineSeparators', 'boolean', 0, 1, {'use': 'optional', 'name': 'lineSeparators'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), MemberSpec_('Grid', 'GridType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'Grid', 'type': 'GridType'}, None), ] subclass = None @@ -12698,8 +13763,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(TableRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("TableRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.rows = _cast(int, rows) @@ -12776,15 +13841,15 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.Grid is not None or - super(TableRegionType, self).hasContent_() + super(TableRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TableRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', 
namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('TableRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -12799,16 +13864,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='T showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TableRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableRegionType'): - super(TableRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TableRegionType'): + super(TableRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TableRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -12830,8 +13895,8 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TableRegionType', fromsubclass_=False, pretty_print=True): - super(TableRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TableRegionType', fromsubclass_=False, pretty_print=True): + super(TableRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -12839,8 +13904,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', if self.Grid is not None: namespaceprefix_ = self.Grid_nsprefix_ + ':' if (UseCapturedNS_ and self.Grid_nsprefix_) else '' self.Grid.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Grid', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='TableRegionType', mapping_=None, nsmap_=None): - element = super(TableRegionType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='TableRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = 
super(TableRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.rows is not None: @@ -12857,9 +13922,11 @@ def to_etree(self, parent_element=None, name_='TableRegionType', mapping_=None, element.set('embText', self.gds_format_boolean(self.embText)) if self.Grid is not None: Grid_ = self.Grid - Grid_.to_etree(element, name_='Grid', mapping_=mapping_, nsmap_=nsmap_) + Grid_.to_etree(element, name_='Grid', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -12867,12 +13934,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -12914,14 +13981,14 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(TableRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(TableRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'Grid': obj_ = GridType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) self.Grid = obj_ obj_.original_tagname_ = 'Grid' - super(TableRegionType, self).buildChildren(child_, node, nodeName_, True) + super(TableRegionType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) def set_orientation(self, orientation): @@ -12939,24 +14006,35 @@ def set_orientation(self, orientation): class GraphicRegionType(RegionType): - """Regions containing simple graphics, such as a company + """GraphicRegionType -- + Regions containing simple graphics, such as a company logo, should be marked as graphic regions. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The type of graphic in the region - An approximation of the number of colours - used in the region - Specifies whether the region also contains - text.""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). 
+ Range: -179.999,180 + + * type -- + The type of graphic in the region + + * numColours -- + An approximation of the number of colours + used in the region + + * embText -- + Specifies whether the region also contains + text. + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:GraphicsTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:GraphicsTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('numColours', 'int', 0, 1, {'use': 'optional', 'name': 'numColours'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -12965,8 +14043,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(GraphicRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("GraphicRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -13019,14 +14097,14 @@ def validate_GraphicsTypeSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on GraphicsTypeSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(GraphicRegionType, self).hasContent_() + super(GraphicRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='GraphicRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphicRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('GraphicRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13041,16 +14119,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='G showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, 
)) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphicRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='GraphicRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphicRegionType'): - super(GraphicRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='GraphicRegionType'): + super(GraphicRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='GraphicRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13063,10 +14141,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='GraphicRegionType', fromsubclass_=False, pretty_print=True): - super(GraphicRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='GraphicRegionType', mapping_=None, nsmap_=None): - element = super(GraphicRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='GraphicRegionType', fromsubclass_=False, pretty_print=True): + super(GraphicRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='GraphicRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(GraphicRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.type_ is not None: @@ -13077,6 +14155,8 @@ def to_etree(self, parent_element=None, name_='GraphicRegionType', mapping_=None element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13084,12 +14164,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, 
already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -13113,9 +14193,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(GraphicRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(GraphicRegionType, self).buildChildren(child_, node, nodeName_, True) + super(GraphicRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(GraphicRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -13134,23 +14214,34 @@ def set_orientation(self, orientation): class LineDrawingRegionType(RegionType): - """A line drawing is a single colour illustration without + """LineDrawingRegionType -- + A line drawing is a single colour illustration without solid areas. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - The pen (foreground) colour of the region - The background colour of the region - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * penColour -- + The pen (foreground) colour of the region + + * bgColour -- + The background colour of the region + + * embText -- + Specifies whether the region also contains + text + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('penColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('penColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'penColour'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -13159,8 +14250,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(LineDrawingRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("LineDrawingRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, 
GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.penColour = _cast(None, penColour) @@ -13213,14 +14304,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(LineDrawingRegionType, self).hasContent_() + super(LineDrawingRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='LineDrawingRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LineDrawingRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('LineDrawingRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13235,16 +14326,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='L showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LineDrawingRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='LineDrawingRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LineDrawingRegionType'): - super(LineDrawingRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='LineDrawingRegionType'): + super(LineDrawingRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='LineDrawingRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13257,10 +14348,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='LineDrawingRegionType', fromsubclass_=False, pretty_print=True): - super(LineDrawingRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def 
to_etree(self, parent_element=None, name_='LineDrawingRegionType', mapping_=None, nsmap_=None): - element = super(LineDrawingRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='LineDrawingRegionType', fromsubclass_=False, pretty_print=True): + super(LineDrawingRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='LineDrawingRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(LineDrawingRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.penColour is not None: @@ -13271,6 +14362,8 @@ def to_etree(self, parent_element=None, name_='LineDrawingRegionType', mapping_= element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13278,12 +14371,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -13308,9 +14401,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(LineDrawingRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(LineDrawingRegionType, self).buildChildren(child_, node, nodeName_, True) + super(LineDrawingRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(LineDrawingRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -13329,23 +14422,34 @@ def set_orientation(self, orientation): class ImageRegionType(RegionType): - """An image is considered to be more intricate and complex + """ImageRegionType -- + An image is considered to be more intricate and complex than a graphic. These can be photos or drawings. - The angle the rectangle encapsulating a region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). 
- Range: -179.999,180 - The colour bit depth required for the region - The background colour of the region - Specifies whether the region also contains - text""" + + * orientation -- + The angle the rectangle encapsulating a region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * colourDepth -- + The colour bit depth required for the region + + * bgColour -- + The background colour of the region + + * embText -- + Specifies whether the region also contains + text + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('colourDepth', 'pc:ColourDepthSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('colourDepth', 'pc:ColourDepthSimpleType', 0, 1, {'use': 'optional', 'name': 'colourDepth'}), + MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), + MemberSpec_('embText', 'boolean', 0, 1, {'use': 'optional', 'name': 'embText'}), ] subclass = None superclass = RegionType @@ -13354,8 +14458,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(ImageRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("ImageRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.colourDepth = _cast(None, colourDepth) @@ -13421,14 +14525,14 @@ def validate_ColourSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( - super(ImageRegionType, self).hasContent_() + super(ImageRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ImageRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ImageRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('ImageRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13443,16 +14547,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='I showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % 
(namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ImageRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ImageRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ImageRegionType'): - super(ImageRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ImageRegionType'): + super(ImageRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ImageRegionType') if self.orientation is not None and 'orientation' not in already_processed: already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13465,10 +14569,10 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.embText is not None and 'embText' not in already_processed: already_processed.add('embText') outfile.write(' embText="%s"' % self.gds_format_boolean(self.embText, input_name='embText')) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='ImageRegionType', fromsubclass_=False, pretty_print=True): - super(ImageRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='ImageRegionType', mapping_=None, nsmap_=None): - element = super(ImageRegionType, self).to_etree(parent_element, name_, mapping_) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ImageRegionType', fromsubclass_=False, pretty_print=True): + super(ImageRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def to_etree(self, parent_element=None, name_='ImageRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(ImageRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.colourDepth is not None: @@ -13479,6 +14583,8 @@ def to_etree(self, parent_element=None, name_='ImageRegionType', mapping_=None, element.set('embText', self.gds_format_boolean(self.embText)) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13486,12 +14592,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, 
already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -13516,9 +14622,9 @@ def buildAttributes(self, node, attrs, already_processed): self.embText = False else: raise_parse_error(node, 'Bad boolean attribute') - super(ImageRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): - super(ImageRegionType, self).buildChildren(child_, node, nodeName_, True) + super(ImageRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(ImageRegionType, self)._buildChildren(child_, node, nodeName_, True) pass def __hash__(self): return hash(self.id) @@ -13537,52 +14643,92 @@ def set_orientation(self, orientation): class TextRegionType(RegionType): - """Pure text is represented as a text region. This includes + """TextRegionType -- + Pure text is represented as a text region. This includes drop capitals, but practically ornate text may be considered as a graphic. - The angle the rectangle encapsulating the region - has to be rotated in clockwise direction - in order to correct the present skew - (negative values indicate anti-clockwise rotation). - (The rotated image can be further referenced - via “AlternativeImage”.) - Range: -179.999,180 - The nature of the text in the region - The degree of space in points between the lines of - text (line spacing) - The direction in which text within lines - should be read (order of words and characters), - in addition to “textLineOrder”. - The order of text lines within the block, - in addition to “readingDirection”. - The angle the baseline of text within the region - has to be rotated (relative to the rectangle - encapsulating the region) in clockwise direction - in order to correct the present skew, - in addition to “orientation” - (negative values indicate anti-clockwise rotation). - Range: -179.999,180 - Defines whether a region of text is indented or not - Text align - The primary language used in the region - The secondary language used in the region - The primary script used in the region - The secondary script used in the region""" + + * orientation -- + The angle the rectangle encapsulating the region + has to be rotated in clockwise direction + in order to correct the present skew + (negative values indicate anti-clockwise rotation). + (The rotated image can be further referenced + via + “ + AlternativeImage + ” + .) + Range: -179.999,180 + + * type -- + The nature of the text in the region + + * leading -- + The degree of space in points between the lines of + text (line spacing) + + * readingDirection -- + The direction in which text within lines + should be read (order of words and characters), + in addition to + “ + textLineOrder + ” + . + + * textLineOrder -- + The order of text lines within the block, + in addition to + “ + readingDirection + ” + . 
+ + * readingOrientation -- + The angle the baseline of text within the region + has to be rotated (relative to the rectangle + encapsulating the region) in clockwise direction + in order to correct the present skew, + in addition to + “ + orientation + ” + (negative values indicate anti-clockwise rotation). + Range: -179.999,180 + + * indented -- + Defines whether a region of text is indented or not + + * align -- Text align + * primaryLanguage -- + The primary language used in the region + + * secondaryLanguage -- + The secondary language used in the region + + * primaryScript -- + The primary script used in the region + + * secondaryScript -- + The secondary script used in the region + + """ __hash__ = GeneratedsSuper.__hash__ member_data_items_ = [ - MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('type_', 'pc:TextTypeSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('leading', 'int', 0, 1, {'use': 'optional'}), - MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('readingOrientation', 'float', 0, 1, {'use': 'optional'}), - MemberSpec_('indented', 'boolean', 0, 1, {'use': 'optional'}), - MemberSpec_('align', 'pc:AlignSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional'}), - MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional'}), + MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), + MemberSpec_('type_', 'pc:TextTypeSimpleType', 0, 1, {'use': 'optional', 'name': 'type_'}), + MemberSpec_('leading', 'int', 0, 1, {'use': 'optional', 'name': 'leading'}), + MemberSpec_('readingDirection', 'pc:ReadingDirectionSimpleType', 0, 1, {'use': 'optional', 'name': 'readingDirection'}), + MemberSpec_('textLineOrder', 'pc:TextLineOrderSimpleType', 0, 1, {'use': 'optional', 'name': 'textLineOrder'}), + MemberSpec_('readingOrientation', 'float', 0, 1, {'use': 'optional', 'name': 'readingOrientation'}), + MemberSpec_('indented', 'boolean', 0, 1, {'use': 'optional', 'name': 'indented'}), + MemberSpec_('align', 'pc:AlignSimpleType', 0, 1, {'use': 'optional', 'name': 'align'}), + MemberSpec_('primaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryLanguage'}), + MemberSpec_('secondaryLanguage', 'pc:LanguageSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryLanguage'}), + MemberSpec_('primaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'primaryScript'}), + MemberSpec_('secondaryScript', 'pc:ScriptSimpleType', 0, 1, {'use': 'optional', 'name': 'secondaryScript'}), + MemberSpec_('production', 'pc:ProductionSimpleType', 0, 1, {'use': 'optional', 'name': 'production'}), MemberSpec_('TextLine', 'TextLineType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'TextLine', 'type': 'TextLineType'}, None), MemberSpec_('TextEquiv', 'TextEquivType', 1, 1, {'maxOccurs': 'unbounded', 'minOccurs': '0', 'name': 'TextEquiv', 'type': 'TextEquivType'}, None), MemberSpec_('TextStyle', 'TextStyleType', 0, 1, {'maxOccurs': '1', 'minOccurs': '0', 'name': 'TextStyle', 'type': 'TextStyleType'}, None), @@ -13594,8 
+14740,8 @@ def __init__(self, id=None, custom=None, comments=None, continuation=None, Alter self.gds_elementtree_node_ = None self.original_tagname_ = None self.parent_object_ = kwargs_.get('parent_object_') - self.ns_prefix_ = None - super(TextRegionType, self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) + self.ns_prefix_ = "pc" + super(globals().get("TextRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) self.orientation = _cast(float, orientation) self.orientation_nsprefix_ = "pc" self.type_ = _cast(None, type_) @@ -13816,17 +14962,17 @@ def validate_ProductionSimpleType(self, value): lineno = self.gds_get_node_lineno_() self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ProductionSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) result = False - def hasContent_(self): + def has__content(self): if ( self.TextLine or self.TextEquiv or self.TextStyle is not None or - super(TextRegionType, self).hasContent_() + super(TextRegionType, self).has__content() ): return True else: return False - def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TextRegionType', pretty_print=True): + def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextRegionType', pretty_print=True): imported_ns_def_ = GenerateDSNamespaceDefs_.get('TextRegionType') if imported_ns_def_ is not None: namespacedef_ = imported_ns_def_ @@ -13841,16 +14987,16 @@ def export(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='T showIndent(outfile, level, pretty_print) outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) already_processed = set() - self.exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') - if self.hasContent_(): + self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') + if self.has__content(): outfile.write('>%s' % (eol_, )) - self.exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextRegionType', pretty_print=pretty_print) + self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='TextRegionType', pretty_print=pretty_print) showIndent(outfile, level, pretty_print) outfile.write('%s' % (namespaceprefix_, name_, eol_)) else: outfile.write('/>%s' % (eol_, )) - def exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextRegionType'): - super(TextRegionType, self).exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') + def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='TextRegionType'): + super(TextRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='TextRegionType') if self.orientation is not None and 'orientation' not in already_processed: 
already_processed.add('orientation') outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) @@ -13890,8 +15036,8 @@ def exportAttributes(self, outfile, level, already_processed, namespaceprefix_=' if self.production is not None and 'production' not in already_processed: already_processed.add('production') outfile.write(' production=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.production), input_name='production')), )) - def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', name_='TextRegionType', fromsubclass_=False, pretty_print=True): - super(TextRegionType, self).exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) + def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='TextRegionType', fromsubclass_=False, pretty_print=True): + super(TextRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) if pretty_print: eol_ = '\n' else: @@ -13905,8 +15051,8 @@ def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='', if self.TextStyle is not None: namespaceprefix_ = self.TextStyle_nsprefix_ + ':' if (UseCapturedNS_ and self.TextStyle_nsprefix_) else '' self.TextStyle.export(outfile, level, namespaceprefix_, namespacedef_='', name_='TextStyle', pretty_print=pretty_print) - def to_etree(self, parent_element=None, name_='TextRegionType', mapping_=None, nsmap_=None): - element = super(TextRegionType, self).to_etree(parent_element, name_, mapping_) + def to_etree(self, parent_element=None, name_='TextRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): + element = super(TextRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) if self.orientation is not None: element.set('orientation', self.gds_format_float(self.orientation)) if self.type_ is not None: @@ -13934,14 +15080,16 @@ def to_etree(self, parent_element=None, name_='TextRegionType', mapping_=None, n if self.production is not None: element.set('production', self.gds_format_string(self.production)) for TextLine_ in self.TextLine: - TextLine_.to_etree(element, name_='TextLine', mapping_=mapping_, nsmap_=nsmap_) + TextLine_.to_etree(element, name_='TextLine', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) for TextEquiv_ in self.TextEquiv: - TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, nsmap_=nsmap_) + TextEquiv_.to_etree(element, name_='TextEquiv', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if self.TextStyle is not None: TextStyle_ = self.TextStyle - TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, nsmap_=nsmap_) + TextStyle_.to_etree(element, name_='TextStyle', mapping_=mapping_, reverse_mapping_=reverse_mapping_, nsmap_=nsmap_) if mapping_ is not None: mapping_[id(self)] = element + if reverse_mapping_ is not None: + reverse_mapping_[element] = self return element def build(self, node, gds_collector_=None): self.gds_collector_ = gds_collector_ @@ -13949,12 +15097,12 @@ def build(self, node, gds_collector_=None): self.gds_elementtree_node_ = node already_processed = set() self.ns_prefix_ = node.prefix - self.buildAttributes(node, node.attrib, already_processed) + self._buildAttributes(node, node.attrib, already_processed) for child in node: nodeName_ = 
Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) + self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) return self - def buildAttributes(self, node, attrs, already_processed): + def _buildAttributes(self, node, attrs, already_processed): value = find_attr_value_('orientation', node) if value is not None and 'orientation' not in already_processed: already_processed.add('orientation') @@ -14023,8 +15171,8 @@ def buildAttributes(self, node, attrs, already_processed): already_processed.add('production') self.production = value self.validate_ProductionSimpleType(self.production) # validate type ProductionSimpleType - super(TextRegionType, self).buildAttributes(node, attrs, already_processed) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): + super(TextRegionType, self)._buildAttributes(node, attrs, already_processed) + def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): if nodeName_ == 'TextLine': obj_ = TextLineType.factory(parent_object_=self) obj_.build(child_, gds_collector_=gds_collector_) @@ -14040,7 +15188,7 @@ def buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collec obj_.build(child_, gds_collector_=gds_collector_) self.TextStyle = obj_ obj_.original_tagname_ = 'TextStyle' - super(TextRegionType, self).buildChildren(child_, node, nodeName_, True) + super(TextRegionType, self)._buildChildren(child_, node, nodeName_, True) def __hash__(self): return hash(self.id) def set_orientation(self, orientation): @@ -14057,6 +15205,11 @@ def set_orientation(self, orientation): # end class TextRegionType +# +# End data representation classes. +# + + GDSClassesMapping = { 'PcGts': PcGtsType, } @@ -14074,9 +15227,10 @@ def usage(): def get_root_tag(node): tag = Tag_pattern_.match(node.tag).groups()[-1] - rootClass = GDSClassesMapping.get(tag) + prefix_tag = TagNamePrefix + tag + rootClass = GDSClassesMapping.get(prefix_tag) if rootClass is None: - rootClass = globals().get(tag) + rootClass = globals().get(prefix_tag) return tag, rootClass @@ -14130,7 +15284,7 @@ def parse(inFileName, silence=False, print_warnings=True): def parseEtree(inFileName, silence=False, print_warnings=True, - mapping=None, nsmap=None): + mapping=None, reverse_mapping=None, nsmap=None): parser = None doc = parsexml_(inFileName, parser) gds_collector = GdsCollector_() @@ -14141,12 +15295,15 @@ def parseEtree(inFileName, silence=False, print_warnings=True, rootClass = PcGts rootObj = rootClass.factory() rootObj.build(rootNode, gds_collector_=gds_collector) - # Enable Python to collect the space used by the DOM. if mapping is None: mapping = {} + if reverse_mapping is None: + reverse_mapping = {} rootElement = rootObj.to_etree( - None, name_=rootTag, mapping_=mapping, nsmap_=nsmap) - reverse_mapping = rootObj.gds_reverse_node_mapping(mapping) + None, name_=rootTag, mapping_=mapping, + reverse_mapping_=reverse_mapping, nsmap_=nsmap) + reverse_node_mapping = rootObj.gds_reverse_node_mapping(mapping) + # Enable Python to collect the space used by the DOM. 
if not SaveElementTreeNode: doc = None rootNode = None @@ -14163,7 +15320,7 @@ def parseEtree(inFileName, silence=False, print_warnings=True, len(gds_collector.get_messages()), )) gds_collector.write_messages(sys.stderr) sys.stderr.write(separator) - return rootObj, rootElement, mapping, reverse_mapping + return rootObj, rootElement, mapping, reverse_node_mapping def parseString(inString, silence=False, print_warnings=True): @@ -14247,6 +15404,224 @@ def main(): RenameMappings_ = { } +# +# Mapping of namespaces to types defined in them +# and the file in which each is defined. +# simpleTypes are marked "ST" and complexTypes "CT". +NamespaceToDefMappings_ = {'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15': [('ColourSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ReadingDirectionSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('TextLineOrderSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('TextTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('PageTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ConfSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('LanguageSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ScriptSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ColourDepthSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('GraphicsTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ChartTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('PointsType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('ProductionSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('AlignSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('GroupTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('TextDataTypeSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('UnderlineStyleSimpleType', + 'src/ocrd_validators/page.xsd', + 'ST'), + ('PcGtsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MetadataType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MetadataItemType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LabelsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LabelType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('PageType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('CoordsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextLineType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('WordType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GlyphType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextEquivType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ImageRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LineDrawingRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphicRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TableRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GridType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GridPointsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ChartRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('SeparatorRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MathsRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ChemRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MapRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('MusicRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('AdvertRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('NoiseRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + 
('UnknownRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('CustomRegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('PrintSpaceType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('ReadingOrderType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RegionRefIndexedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('OrderedGroupIndexedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UnorderedGroupIndexedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RegionRefType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('OrderedGroupType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UnorderedGroupType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('BorderType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LayersType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('LayerType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('BaselineType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RelationsType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RelationType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TextStyleType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RegionType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('AlternativeImageType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemesType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemeBaseType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemeType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('NonPrintingCharType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('GraphemeGroupType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UserDefinedType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('UserAttributeType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('TableCellRoleType', + 'src/ocrd_validators/page.xsd', + 'CT'), + ('RolesType', + 'src/ocrd_validators/page.xsd', + 'CT')]} + __all__ = [ "AdvertRegionType", "AlternativeImageType", diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index fe22dd89ab..9cec0b30ad 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -104,7 +104,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'clear_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'extend_AllIndexed'), _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'sort_AllIndexed'), - _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren', 'exportChildren_GroupType'), + _add_method(r'^(OrderedGroupType|OrderedGroupIndexedType)$', 'exportChildren', '_exportChildren_GroupType'), _add_method(r'^(UnorderedGroupType|UnorderedGroupIndexedType)$', 'get_UnorderedGroupChildren'), _add_method(r'^(PcGtsType|PageType)$', 'id'), _add_method(r'^(PageType)$', 'get_AllRegions'), diff --git a/src/ocrd_page_user_methods/exportChildren_GroupType.py b/src/ocrd_page_user_methods/_exportChildren_GroupType.py similarity index 65% rename from src/ocrd_page_user_methods/exportChildren_GroupType.py rename to src/ocrd_page_user_methods/_exportChildren_GroupType.py index 924ee63146..9dea9c422e 100644 --- a/src/ocrd_page_user_methods/exportChildren_GroupType.py +++ b/src/ocrd_page_user_methods/_exportChildren_GroupType.py @@ -1,9 +1,14 @@ # pylint: disable=line-too-long,invalid-name,missing-module-docstring,missing-function-docstring -def exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, 
pretty_print=True): # pylint: disable=unused-argument,too-many-arguments - namespaceprefix_ = 'pc:' +def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='OrderedGroupType', fromsubclass_=False, pretty_print=True): # pylint: disable=unused-argument,too-many-arguments + if pretty_print: + eol_ = '\n' + else: + eol_ = '' if self.UserDefined is not None: + namespaceprefix_ = self.UserDefined_nsprefix_ + ':' if (UseCapturedNS_ and self.UserDefined_nsprefix_) else '' self.UserDefined.export(outfile, level, namespaceprefix_, namespacedef_='', name_='UserDefined', pretty_print=pretty_print) for Labels_ in self.Labels: + namespaceprefix_ = self.Labels_nsprefix_ + ':' if (UseCapturedNS_ and self.Labels_nsprefix_) else '' Labels_.export(outfile, level, namespaceprefix_, namespacedef_='', name_='Labels', pretty_print=pretty_print) cleaned = [] def replaceWithRRI(group): @@ -21,4 +26,4 @@ def replaceWithRRI(group): else: cleaned.append(entry) for entry in cleaned: - entry.export(outfile, level, namespaceprefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) + entry.export(outfile, level, entry.ns_prefix_, namespacedef_='', name_=entry.__class__.__name__[:-4], pretty_print=pretty_print) From c1b92c8fbe30b86804f7012467e2fdde10e16b80 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 23:52:08 +0200 Subject: [PATCH 011/191] ocrd_modelfactory.page_from_file: set OcrdPage.revmap to actual reverse mapping --- src/ocrd_modelfactory/__init__.py | 8 +++++++- src/ocrd_models/ocrd_page.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 828949fe96..3f7d675f86 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -101,5 +101,11 @@ def page_from_file(input_file, **kwargs) -> OcrdPage: if input_file.mimetype.startswith('image'): return page_from_image(input_file) if input_file.mimetype == MIMETYPE_PAGE: - return OcrdPage(*parseEtree(input_file.local_filename, silence=True)) + revmap = {} + # the old/default gds.reverse_node_mapping is useless + # since 2.39.4, we can actually get the exact reverse mapping for perfect round-trip + # but awkwardly, we have to pass the dict in for that + page = OcrdPage(*parseEtree(input_file.local_filename, reverse_mapping=revmap, silence=True)) + page.revmap = revmap + return page raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 6a8ea4586f..b491d402a2 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -2,7 +2,7 @@ API to PAGE-XML, generated with generateDS from XML schema. 
""" from io import StringIO -from typing import Dict, Union +from typing import Dict, Union, Any from lxml import etree as ET from elementpath import XPath2Parser, XPathContext @@ -191,7 +191,7 @@ def __init__( pcgts : PcGtsType, etree : ET._Element, mapping : Dict[str, ET._Element], - revmap : Dict[ET._Element, str], + revmap : Dict[ET._Element, Any], ): self._pcgts = pcgts self.etree = etree From 9c36854513dd799b9d91d4691a2435ad91b7ad44 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 23:53:29 +0200 Subject: [PATCH 012/191] ocrd_page.to_xml: also allow non-root nodes --- src/ocrd_models/ocrd_page.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index b491d402a2..046606100f 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -223,11 +223,15 @@ def to_xml(el, skip_declaration=False) -> str: # XXX remove potential empty ReadingOrder if hasattr(el, 'prune_ReadingOrder'): el.prune_ReadingOrder() + if hasattr(el, 'original_tagname_'): + name = el.original_tagname_ or 'PcGts' + else: + name = 'PcGts' sio = StringIO() el.export( outfile=sio, level=0, - name_='PcGts', + name_=name, namespaceprefix_='pc:', namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % ( NAMESPACES['page'], From c70748153f62f64a27a70561a1a8232d34028a59 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 23:54:01 +0200 Subject: [PATCH 013/191] ocrd-filter: simplify further --- .../processor/builtin/filter_processor.py | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index 10b5572c3f..c81517b0e5 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -8,6 +8,27 @@ from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_models import OcrdPage +_SEGTYPES = [ + "NoiseRegion", + "LineDrawingRegion", + "AdvertRegion", + "ImageRegion", + "ChartRegion", + "MusicRegion", + "GraphicRegion", + "UnknownRegion", + "CustomRegion", + "SeparatorRegion", + "MathsRegion", + "TextRegion", + "MapRegion", + "ChemRegion", + "TableRegion", + "TextLine", + "Word", + "Glyph" +] + class FilterProcessor(Processor): def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ @@ -31,18 +52,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional """ pcgts = input_pcgts[0] result = OcrdPageResult(pcgts) - nodes = [node.attrib['id'] - for node in pcgts.xpath(self.parameter['select']) - if 'id' in node.attrib] + nodes = pcgts.xpath(self.parameter['select']) # get PAGE objects from matching etree nodes - # FIXME: this should be easier (OcrdPage should have id lookup mechanism) - regions = pcgts.get_Page().get_AllRegions() - textregions = [region for region in regions if region.original_tagname_ == 'TextRegion'] - lines = [line for region in textregions for line in region.get_TextLine() or []] - words = [word for line in lines for word in line.get_Word() or []] - glyphs = [glyph for word in words for glyph in word.get_Glyph() or []] - segments = [segment for segment in regions + lines + words + glyphs - if segment.id in nodes] + # but allow only hierarchy segments + segments = [segment for segment in map(pcgts.revmap.get, nodes) + if 
segment.__class__.__name__.replace('Type', '') in _SEGTYPES] if not(len(segments)): self.logger.info("no matches") return result @@ -50,8 +64,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if self.parameter['plot']: page_image, page_coords, _ = self.workspace.image_from_page(pcgts.get_Page(), page_id) for segment in segments: - node = pcgts.mapping[id(segment)] - assert isinstance(node, etree._Element) segtype = segment.original_tagname_ self.logger.info("matched %s segment %s", segtype, segment.id) parent = segment.parent_object_ From 687b06f90784fcf9eac510ecc3442ea8d8c08bb3 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 16 Sep 2024 13:29:26 +0200 Subject: [PATCH 014/191] :package: v3.0.0b5 --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ VERSION | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ec12c8934..bbb91c0782 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,35 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b5] - 2024-09-16 + +TODO + - update OcrdPage from generateds (HEAD -> new-processor-api, bertsky/new-processor-api) + - OcrdPage: add PageType.get_ReadingOrderGroups() + - tests: make sure ocrd_utils.config gets reset whenever changing it globally + - test processors: no need for 'force' kwarg anymore + - Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE + - lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) + - lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) + - lib.bash: fix errexit + - run_processor: be robust if ocrd_tool is missing steps + - Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) + - Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself + - ocrd.cli.validate tasks: pass on --mets-server-url, too + - ocrd.cli.bashlib input-files: pass on --mets-server-url, too + - ocrd.cli.workspace server: add 'reload' and 'save' + - ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) + - METS Server: also export+delegate physical_pages + - processor CLI: delegate --resolve-resource, too + - ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) + - PcGts.Page.id / make_xml_id: replace '/' with '_' + - Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType + - Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) + - typing, extend docs + - test_processor: add test for force (overwrite) w/ METS Server + - test_mets_server: add test for force (overwrite) + - OcrdMetsServer.add_file: pass on 'force' kwarg, too + ## [3.0.0b4] - 2024-09-02 Fixed: diff --git a/VERSION b/VERSION index 9414e12700..09fb39d267 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b4 +3.0.0b5 From a43098e9ee01a15a753ace19a8eddcdff4849352 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:27:50 +0200 Subject: [PATCH 015/191] :memo: improve b5 changelog --- CHANGELOG.md | 53 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbb91c0782..abbfd5a4d8 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,32 +7,31 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## [3.0.0b5] - 2024-09-16 -TODO - - update OcrdPage from generateds (HEAD -> new-processor-api, bertsky/new-processor-api) - - OcrdPage: add PageType.get_ReadingOrderGroups() - - tests: make sure ocrd_utils.config gets reset whenever changing it globally - - test processors: no need for 'force' kwarg anymore - - Processor / Workspace.add_file: always force if config.OCRD_EXISTING_OUTPUT==OVERWRITE - - lib.bash input-files: do not try to validate tasks here (impossible to get right with required parameters, and now covered by wrapped Processor.verify() already) - - lib.bash input-files: pass on --mets-server-url, --overwrite, and parameters (necessary for required params) - - lib.bash: fix errexit - - run_processor: be robust if ocrd_tool is missing steps - - Processor.verify: check output fileGrps as well (or OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP or disjoint --page-id) - - Processor.process_workspace(): do not show NotImplementedError context if fallback process() raises anything itself - - ocrd.cli.validate tasks: pass on --mets-server-url, too - - ocrd.cli.bashlib input-files: pass on --mets-server-url, too - - ocrd.cli.workspace server: add 'reload' and 'save' - - ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) - - METS Server: also export+delegate physical_pages - - processor CLI: delegate --resolve-resource, too - - ocrd.cli.ocrd-tool resolve-resource: fix (forgot to print result) - - PcGts.Page.id / make_xml_id: replace '/' with '_' - - Processor.process_page_file / OcrdPageResultImage: allow None instead of AlternativeImageType - - Processor.verify: revert 5819c816 (we still have no defaults in json loaded from v2) - - typing, extend docs - - test_processor: add test for force (overwrite) w/ METS Server - - test_mets_server: add test for force (overwrite) - - OcrdMetsServer.add_file: pass on 'force' kwarg, too +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` + - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters + - `lib.bash`: fix `errexit` handling + - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result + +Changed: + - :fire: `Processor` / `Workspace.add_file`: always `force` if `OCRD_EXISTING_OUTPUT==OVERWRITE` + - :fire: `Processor.verify`: revert 3.0.0b1 enforcing cardinality checks (stay backwards compatible) + - :fire: `Processor.verify`: check output fileGrps, too + (must not exist unless `OCRD_EXISTING_OUTPUT=OVERWRITE|SKIP` or disjoint `--page-id` range) + - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - METS Server: export and delegate `physical_pages` + - processor CLI: delegate `--resolve-resource`, too + - `Processor.process_page_file` / `OcrdPageResultImage`: allow `None` besides `AlternativeImageType` ## [3.0.0b4] - 
2024-09-02 @@ -2288,6 +2287,8 @@ Fixed Initial Release +[3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 +[3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 [3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 [3.0.0b2]: ../../compare/v3.0.0b2..v3.0.0b1 [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 From d2cb0fb663c15c6179bbcf05477051f3d7737149 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:55:41 +0200 Subject: [PATCH 016/191] ocrd.cli.workspace: assert non-server in cmds mutating METS --- src/ocrd/cli/workspace.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca4e8629db..05b37b6bcc 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -149,7 +149,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -185,7 +186,8 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, @@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) @@ -524,6 +528,8 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ + assert not ctx.mets_server_url, \ + f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) @@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" + assert not ctx.mets_server_url, \ + f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( @@ -762,6 +772,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + assert not ctx.mets_server_url, \ + f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() @@ -800,6 +812,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) + assert not ctx.mets_server_url, \ + f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( From 5663f4882834fd1430c5c1d55ca438a2406ce9ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:10:12 +0200 Subject: [PATCH 017/191] processor CLI: delegate --resolve-resource, too --- src/ocrd/decorators/__init__.py | 4 +++- src/ocrd/decorators/ocrd_cli_options.py | 1 + src/ocrd/processor/helpers.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 580a75b0c0..7c2dd9717c 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + resolve_resource=None, show_resource=None, list_resources=False, # ocrd_network params start # @@ -50,7 +51,7 @@ def ocrd_cli_wrap_processor( if not sys.argv[1:]: processorClass(None, show_help=True) sys.exit(1) - if dump_json or dump_module_dir or help or version or show_resource or list_resources: + if dump_json or dump_module_dir or help or version or resolve_resource or show_resource or list_resources: processorClass( None, dump_json=dump_json, @@ -58,6 +59,7 @@ def ocrd_cli_wrap_processor( show_help=help, subcommand=subcommand, show_version=version, + resolve_resource=resolve_resource, show_resource=show_resource, list_resources=list_resources ) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..9c87034ab4 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -41,6 +41,7 @@ def cli(mets_url): option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), + option('-R', '--resolve-resource'), option('-C', '--show-resource'), option('-L', '--list-resources', is_flag=True, default=False), option('-J', '--dump-json', is_flag=True, default=False), diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index f5b6010636..921cfeac80 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -290,6 +290,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) ''' information_options = '''\ + -R, --resolve-resource RESNAME Show the full path of processor resource RESNAME -C, --show-resource RESNAME Dump the content of processor resource 
RESNAME -L, --list-resources List names of processor resources -J, --dump-json Dump tool description as JSON From 853bdb570c861b98debf1c2af60e84f39db47fbf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:12:49 +0200 Subject: [PATCH 018/191] test_mets_server: fix arg vs kwarg --- tests/test_mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 58ff6e2a9b..a313ed5239 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,10 +55,10 @@ def add_file_server(x): mets_server_url, i = x workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) workspace_server.add_file( + 'FOO', local_filename=f'local_filename{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{i}', - file_grp='FOO', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' ) From 33c73866e5a289d83354c382b9cc34d7038027cd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:13:46 +0200 Subject: [PATCH 019/191] mets_server: ClientSideOcrdMets needs OcrdMets-like kwargs (without deprecation) --- src/ocrd/mets_server.py | 19 +++++++++---------- tests/test_mets_server.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0d4c0a0785..da6e873c06 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -247,11 +247,9 @@ def add_agent(self, *args, **kwargs): ).json() return OcrdAgentModel.create(**kwargs) - @deprecated_alias(ID="file_id") - @deprecated_alias(pageId="page_id") - @deprecated_alias(fileGrp="file_grp") def find_files(self, **kwargs): self.log.debug("find_files(%s)", kwargs) + # translate from native OcrdMets kwargs to OcrdMetsServer REST params if "pageId" in kwargs: kwargs["page_id"] = kwargs.pop("pageId") if "ID" in kwargs: @@ -277,14 +275,14 @@ def find_files(self, **kwargs): def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) - @deprecated_alias(pageId="page_id") - @deprecated_alias(ID="file_id") def add_file( - self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs + self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs ): data = OcrdFileModel.create( - file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url, - local_filename=local_filename + file_grp=file_grp, + # translate from native OcrdMets kwargs to OcrdMetsServer REST params + file_id=ID, page_id=pageId, + mimetype=mimetype, url=url, local_filename=local_filename ) if not self.multiplexing_mode: @@ -297,8 +295,9 @@ def add_file( raise RuntimeError(f"Add file failed: Msg: {r['error']}") return ClientSideOcrdFile( - None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype, - local_filename=local_filename + None, fileGrp=file_grp, + ID=ID, pageId=pageId, + url=url, mimetype=mimetype, local_filename=local_filename ) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index a313ed5239..1487617a71 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -236,7 +236,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]): assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total' - workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', 
pageId='foo') + workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo') assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total' From 37f7cda00f53c3f8f01a722c87c2f965dc7c7b68 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:09 +0200 Subject: [PATCH 020/191] use up-to-date kwargs (avoiding old deprecations) --- tests/data/__init__.py | 4 ++-- tests/processor/test_processor.py | 10 +++++----- tests/validator/test_page_validator.py | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 93a2ea49a9..c7fcfb021c 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -52,9 +52,9 @@ def process(self): file_id = make_file_id(input_file, self.output_file_grp) # print(input_file.ID, file_id) self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 784f68fc3d..3a47d2c23f 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -125,8 +125,8 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -135,10 +135,10 @@ def test_run_output0(self): def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') ws.overwrite_mode = True - ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001') + ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') ws.overwrite_mode = False with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py index 79e92d90fa..e6aaff1523 100644 --- a/tests/validator/test_page_validator.py +++ b/tests/validator/test_page_validator.py @@ -16,9 +16,10 @@ def test_validate_err(self): PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best') # test with deprecated name with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') + with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'): + 
PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first') + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', page_textequiv_strategy='first') def test_validate_filename(self): report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME) @@ -44,7 +45,7 @@ def test_validate_lax(self): report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict') - report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax') + report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax') self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax') def test_validate_multi_textequiv_first(self): @@ -89,7 +90,7 @@ def test_fix(self): ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True) report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors') - PageValidator.validate(ocrd_page=ocrd_page, strictness='fix') + PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix') report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors') From 44946baa17d1c44d9896ef35103a97e2f48a6d2a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:59 +0200 Subject: [PATCH 021/191] hide/test expected deprecation warnings --- tests/test_resolver.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 16dfd03d56..c2575b6086 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -292,20 +292,21 @@ def test_resolve_mets_arguments(): https://github.com/OCR-D/core/issues/517 """ resolver = Resolver() - assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) - assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) - assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) - with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): - resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) - with pytest.raises(ValueError, match="inconsistent with --directory"): - resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) - with pytest.warns(DeprecationWarning): - resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) - with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was 
given"): - resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning, match='--mets-basename'): + assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) + assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) + assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) + with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): + resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) + with pytest.raises(ValueError, match="inconsistent with --directory"): + resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning): + resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) + with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): + resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) if __name__ == '__main__': main(__file__) From d0962d67ee2e5da332ff0385e417925ab1581481 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:53:25 +0200 Subject: [PATCH 022/191] improve output in case of assertion failures --- tests/cli/test_validate.py | 22 ++++++++++----------- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 36ee3e5995..bf74a84c59 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, err) # relative path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, err) # default path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,11 +84,11 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, _ = 
self.invoke_cli(validate_cli, ['tasks', + code, _, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), @@ -96,7 +96,7 @@ def test_validate_tasks(self): "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) if __name__ == '__main__': diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 3ad40d8645..6d4616c2db 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 061f0231a148f09943d1c5ee35f456ad502f2755 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:34:43 +0200 Subject: [PATCH 023/191] allow "from ocrd_models import OcrdPage --- src/ocrd_models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..330fefe97d 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport From d2f92d1e4814d810d10b5d31a63f730568c11e29 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:13:58 +0200 Subject: [PATCH 024/191] ocrd_utils: forgot to export scale_coordinates at toplvl --- src/ocrd_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index b5bbcae121..836f01dce4 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -13,6 +13,7 @@ :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) 
* :py:func:`rotate_coordinates`, + :py:func:`scale_coordinates`, :py:func:`shift_coordinates`, :py:func:`transpose_coordinates`, :py:func:`transform_coordinates` @@ -148,6 +149,7 @@ polygon_mask, rotate_coordinates, rotate_image, + scale_coordinates, shift_coordinates, transform_coordinates, transpose_coordinates, From c6c5c42a1d37478a6c8a4c43b5fd61c69249f7b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:19 +0200 Subject: [PATCH 025/191] fix imports --- src/ocrd/decorators/parameter_option.py | 2 +- src/ocrd/workspace.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 0fbe3e0577..55abbc2a53 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -1,10 +1,10 @@ from click import option -#from ocrd_utils import parse_json_string_or_file __all__ = ['parameter_option', 'parameter_override_option'] def _handle_param_option(ctx, param, value): + from ocrd_utils import parse_json_string_or_file return parse_json_string_or_file(*list(value)) parameter_option = option('-p', '--parameter', diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index ff856011be..b4795f3e89 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -24,6 +24,7 @@ coordinates_of_segment, adjust_canvas_to_rotation, adjust_canvas_to_transposition, + scale_coordinates, shift_coordinates, rotate_coordinates, transform_coordinates, From 245778c74a373c07a007d5deb982197d0b22d569 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 026/191] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8303413933..5113faf3da 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -377,16 +377,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -431,13 +424,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? 
- LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From 1f7b57fc70fe26cb5399db54edb4a4748184327d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 027/191] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5113faf3da..9e5f5aead6 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -426,7 +426,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any files for selected pageId {self.page_id}") + LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\ncompare '{self.page_id}' with the output of 'orcd workspace list-page'.") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 35bdb39773dd26d238d00c00f9d3f7c2c711ac4a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:29 +0200 Subject: [PATCH 028/191] tests report.is_valid: improve output on failure --- tests/cli/test_validate.py | 23 +++++++++---------- tests/validator/test_json_validator.py | 6 ++--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- tests/validator/test_parameter_validator.py | 2 +- .../validator/test_resource_list_validator.py | 3 +-- tests/validator/test_xsd_validator.py | 8 +++---- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index bf74a84c59..cc58df6540 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, out + err) # relative path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, out + err) # default path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, out + err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, out + err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,19 +84,18 @@ def test_validate_page(self): def 
test_validate_tasks(self): # simple - code, _, err = self.invoke_cli(validate_cli, ['tasks', + code, out, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) - print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) if __name__ == '__main__': diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 8a8387d4b6..bd756879bc 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -20,18 +20,18 @@ def setUp(self): def test_validate_string(self): report = JsonValidator.validate('{}', {}) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_defaults_set(self): obj = {'bar': 2000} report = self.defaults_validator._validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'foo': 3000, 'bar': 2000}) def test_properr(self): obj = {'bar': 100, 'quux': {}} report = self.defaults_validator._validate(obj) - self.assertFalse(report.is_valid) + self.assertFalse(report.is_valid, str(report.to_xml())) self.assertEqual(len(report.errors), 1) diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 6d4616c2db..70d40c2f2a 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.to_xml())) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py index f0d9d41d2c..297a149064 100644 --- a/tests/validator/test_parameter_validator.py +++ b/tests/validator/test_parameter_validator.py @@ -42,7 +42,7 @@ def test_default_assignment(self): }) obj = {'baz': '23'} report = validator.validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'baz': '23', "num-param": 1}) def test_min_max(): diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index eb95d9b1ea..cc63c30ea7 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -22,8 +22,7 @@ def reslist(): def 
test_resource_list_validator(reslist): report = OcrdResourceListValidator.validate(reslist) - print(report.errors) - assert report.is_valid == True + assert report.is_valid, str(report.to_xml()) if __name__ == '__main__': main(__file__) diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py index d0150338dd..50b3851ffc 100644 --- a/tests/validator/test_xsd_validator.py +++ b/tests/validator/test_xsd_validator.py @@ -37,22 +37,22 @@ def test_mets_empty(self): def test_validate_simple_protected_str(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets.to_xml()) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_protected_doc(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_static_doc(self): report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) class TestXsdPageValidator(TestCase): def test_validate_page_simple_static_doc(self): report = XsdPageValidator.validate(simple_page) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) if __name__ == '__main__': main(__file__) From e595996d91ae05577cbd3bc133c2f2429d462ff2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:49:08 +0200 Subject: [PATCH 029/191] fix --log-filename (6fc606027a): apply in ocrd_cli_wrap_processor --- src/ocrd/decorators/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 7c2dd9717c..464bb67ed8 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ import sys +from contextlib import nullcontext from ocrd_utils import ( config, @@ -9,6 +10,7 @@ parse_json_string_with_comments, set_json_key_value_overrides, parse_json_string_or_file, + redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator from ocrd_network import ProcessingWorker, ProcessorServer, AgentType @@ -141,7 +143,7 @@ def resolve(name): print("Profiling...") pr = cProfile.Profile() pr.enable() - def exit(): + def goexit(): pr.disable() print("Profiling completed") if profile_file: @@ -150,8 +152,13 @@ def exit(): s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + atexit.register(goexit) + if log_filename: + log_ctx = redirect_stderr_and_stdout_to_file(log_filename) + else: + log_ctx = nullcontext() + with log_ctx: + run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): From f21b8d24eaa8320b2ff1c405355ce0b40f116256 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:54:07 +0200 Subject: [PATCH 030/191] fix exception --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 44bbd081bc..e63c5fd015 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -248,7 +248,7 @@ def _download_impl(url, filename, progress_cb=None, 
size=None): if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) + log.warning("Cannot unwrap Google Drive URL: %s", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() From 0cbd3ea906e8c93f940e012f3f7383a1a372c135 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:27:33 +0200 Subject: [PATCH 031/191] adapt to PIL.Image moved constants --- src/ocrd/workspace.py | 8 +++---- src/ocrd_utils/image.py | 50 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index b4795f3e89..8b8e89bfca 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1151,9 +1151,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) transposition = { - 90: Image.ROTATE_90, - 180: Image.ROTATE_180, - 270: Image.ROTATE_270 + 90: Image.Transpose.ROTATE_90, + 180: Image.Transpose.ROTATE_180, + 270: Image.Transpose.ROTATE_270 }.get(orientation) # no default segment_coords['transform'] = transpose_coordinates( segment_coords['transform'], transposition, @@ -1221,5 +1221,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_image = segment_image.resize((int(segment_image.width * factor), int(segment_image.height * factor)), # slowest, but highest quality: - Image.BICUBIC) + Image.Resampling.BICUBIC) return segment_image, segment_coords, segment_xywh diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 3bc14e6612..6f2524608c 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method): Return a numpy array of the enlarged width and height. 
""" - if method in [Image.ROTATE_90, - Image.ROTATE_270, - Image.TRANSPOSE, - Image.TRANSVERSE]: + if method in [Image.Transpose.ROTATE_90, + Image.Transpose.ROTATE_270, + Image.Transpose.TRANSPOSE, + Image.Transpose.TRANSVERSE]: size = size[::-1] return size @@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: entails translation to the center, followed by pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: entails translation to the center, followed by pure reflection about the origin, and subsequent translation back - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: entails translation to the center, followed by pure rotation by 90° counter-clockwise, and subsequent translation back - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: entails translation to the center, followed by pure rotation by 270° counter-clockwise, and subsequent translation back - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the y-axis, and subsequent translation back @@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): [0, 0, 1]]) transform = shift_coordinates(transform, -orig) operations = { - Image.FLIP_LEFT_RIGHT: [refly], - Image.FLIP_TOP_BOTTOM: [reflx], - Image.ROTATE_180: [reflx, refly], - Image.ROTATE_90: [rot90], - Image.ROTATE_270: [rot90, reflx, refly], - Image.TRANSPOSE: [rot90, reflx], - Image.TRANSVERSE: [rot90, refly] + Image.Transpose.FLIP_LEFT_RIGHT: [refly], + Image.Transpose.FLIP_TOP_BOTTOM: [reflx], + Image.Transpose.ROTATE_180: [reflx, refly], + Image.Transpose.ROTATE_90: [rot90], + Image.Transpose.ROTATE_270: [rot90, reflx, refly], + Image.Transpose.TRANSPOSE: [rot90, reflx], + Image.Transpose.TRANSVERSE: [rot90, refly] }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) @@ -411,29 +411,29 @@ def transpose_image(image, method): Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: all pixels get mirrored at half the height of the image - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: all pixels get mirrored at both, the width and half the height of the image, i.e. the image gets rotated by 180° counter-clockwise - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: rows become columns (but counted from the right) and columns become rows, i.e. 
the image gets rotated by 90° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: rows become columns and columns become rows (but counted from the bottom), i.e. the image gets rotated by 270° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: rows become columns and vice versa, i.e. all pixels get mirrored at the main diagonal; width becomes height and vice versa - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: rows become columns (but counted from the right) and columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; From 8f8912c14dcccdc485d03e94efe33d9097fcdb78 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:31:35 +0200 Subject: [PATCH 032/191] cli.workspace: pass fileGrp as well, improve description --- src/ocrd/cli/workspace.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a36..062a373608 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -118,7 +118,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency @workspace_cli.command('clone', cls=command_with_replaced_help( (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True) -@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning") +@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards") @click.argument('mets_url') @mets_find_options # XXX deprecated @@ -129,8 +129,10 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim Create a workspace from METS_URL and return the directory METS_URL can be a URL, an absolute path or a path relative to $PWD. - If METS_URL is not provided, use --mets accordingly. METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file. + + Additional options pertain to the selection of files / fileGrps / pages + to be downloaded, if --download is used. 
""" LOG = getLogger('ocrd.cli.workspace.clone') if workspace_dir: @@ -143,6 +145,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim mets_basename=ctx.mets_basename, clobber_mets=clobber_mets, download=download, + fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, @@ -407,7 +410,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: - workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) + workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg # save changes to disk workspace.save_mets() From 6dccfb388209a7e14b61a46e139ad07e72926c3f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:35:37 +0200 Subject: [PATCH 033/191] OcrdMets.add_agent: does not have positional args --- src/ocrd/mets_server.py | 2 +- src/ocrd_models/ocrd_mets.py | 4 ++-- tests/model/test_ocrd_mets.py | 2 +- tests/test_workspace.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index da6e873c06..7c22da278d 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -236,7 +236,7 @@ def agents(self): agent_dict["_type"] = agent_dict.pop("type") return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts] - def add_agent(self, *args, **kwargs): + def add_agent(self, **kwargs): if not self.multiplexing_mode: return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..66251a54dc 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -198,7 +198,7 @@ def agents(self) -> List[OcrdAgent]: """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - def add_agent(self, *args, **kwargs) -> OcrdAgent: + def add_agent(self, **kwargs) -> OcrdAgent: """ Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. 
""" @@ -213,7 +213,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - return OcrdAgent(el_agent, *args, **kwargs) + return OcrdAgent(el_agent, **kwargs) @property def file_groups(self) -> List[str]: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 739db7625a..89742a507e 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01): def test_agent(sbb_sample_01): beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL') assert len(sbb_sample_01.agents) == beforelen + 1 def test_metshdr(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index c8df9b444b..75e9b6886f 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -734,7 +734,7 @@ def _fixture_metsDocumentID(tmp_path): def test_agent_before_metsDocumentID(workspace_metsDocumentID): report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) assert report.is_valid - workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER') + workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER') workspace_metsDocumentID.save_mets() report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) print(report.errors) From 2d85f14d00bd112553e6ee4a0751436e8d1131f7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:15:13 +0200 Subject: [PATCH 034/191] update pylintrc --- .pylintrc | 18 ++++++++---------- src/ocrd/resource_manager.py | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index b2125d824c..a4106a1bb7 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,19 +1,21 @@ [MASTER] -extension-pkg-whitelist=lxml -ignored-modules=cv2,tesserocr,ocrd.model +extension-pkg-whitelist=lxml,pydantic +ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-patterns=.*generateds.* [MESSAGES CONTROL] -ignore-patterns='.*generateds.*' disable = fixme, - E501, + line-too-long, + consider-using-f-string, + logging-fstring-interpolation, trailing-whitespace, logging-not-lazy, inconsistent-return-statements, + disallowed-name, invalid-name, line-too-long, missing-docstring, - no-self-use, wrong-import-order, too-many-nested-blocks, superfluous-parens, @@ -25,13 +27,9 @@ disable = ungrouped-imports, useless-object-inheritance, useless-import-alias, - bad-continuation, no-else-return, logging-not-lazy -[FORMAT] -no-space-check=empty-line - [DESIGN] # Maximum number of arguments for function / method max-args=12 @@ -40,7 +38,7 @@ max-locals=30 # Maximum number of return / yield for function / method body max-returns=12 # Maximum number of branch for function / method body -max-branchs=30 +max-branches=30 # Maximum number of statements in function / method body max-statements=60 # Maximum number of parents for a class (see R0901). 
diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e63c5fd015..1fc0409250 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -13,12 +13,16 @@ from gdown.download import get_url_from_gdrive_confirmation from yaml import safe_load, safe_dump +# pylint: disable=wrong-import-position + # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \ yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str'] +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json From ea68370e223a7b8af2843ca16c0ebd8f223b6574 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:18:53 +0200 Subject: [PATCH 035/191] pylint: try ignoring generateds (again) --- .pylintrc | 1 + src/ocrd/cli/ocrd_tool.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index a4106a1bb7..2e3af4288b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,7 @@ [MASTER] extension-pkg-whitelist=lxml,pydantic ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-paths=ocrd_page_generateds.py ignore-patterns=.*generateds.* [MESSAGES CONTROL] diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 2a7fa99ec9..3c024ec668 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -29,6 +29,8 @@ def __init__(self, filename): self.filename = filename with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() + # perhaps the validator should _always_ run (for default expansion) + # so validate command only for the report? 
self.json = loads(self.content) pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx) From 18ac2c0ab954268811a2ed8654cafc44924e01a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:11:49 +0200 Subject: [PATCH 036/191] ClientSideOcrdMets: use same logger name prefix as server --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 7c22da278d..9b66871349 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -120,7 +120,7 @@ class ClientSideOcrdMets: def __init__(self, url, workspace_path: Optional[str] = None): self.protocol = "tcp" if url.startswith("http://") else "uds" - self.log = getLogger(f"ocrd.mets_client[{url}]") + self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}") self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}' self.ws_dir_path = workspace_path if workspace_path else None From da37967357f4d1bf9076498342319fddc35db070 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:15:03 +0200 Subject: [PATCH 037/191] test_mets_server: use tmpdir to avoid side effects between suites --- tests/test_mets_server.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 1487617a71..8f94b95645 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,13 +22,16 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel -WORKSPACE_DIR = '/tmp/ocrd-mets-server' TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] +initLogging() +setOverrideLogLevel(10) + @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) -def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: +def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: + tmpdir = str(tmpdir) def _start_mets_server(*args, **kwargs): mets_server = OcrdMetsServer(*args, **kwargs) mets_server.startup() @@ -39,21 +42,22 @@ def _start_mets_server(*args, **kwargs): if exists(mets_server_url): remove(mets_server_url) - if exists(WORKSPACE_DIR): - rmtree(WORKSPACE_DIR, ignore_errors=True) + if exists(tmpdir): + rmtree(tmpdir, ignore_errors=True) - copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR) - workspace = Workspace(Resolver(), WORKSPACE_DIR) + copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) + workspace = Workspace(Resolver(), tmpdir) p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) p.start() sleep(1) # sleep to start up server - yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) + yield mets_server_url, workspace_server p.terminate() - rmtree(WORKSPACE_DIR, ignore_errors=True) + rmtree(tmpdir, ignore_errors=True) def add_file_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( 'FOO', 
local_filename=f'local_filename{i}', @@ -64,8 +68,8 @@ def add_file_server(x): ) def add_agent_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.mets.add_agent( name=f'proc{i}', _type='baz', @@ -82,7 +86,10 @@ def test_mets_server_add_file(start_mets_server): # add NO_FILES files in parallel with Pool() as pool: - pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES))) + pool.map(add_file_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + range(NO_FILES))) assert set(workspace_server.mets.file_groups) == set( [ 'OCR-D-IMG', @@ -107,7 +114,7 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES # not yet synced - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0 # sync @@ -125,13 +132,16 @@ def test_mets_server_add_agents(start_mets_server): # add NO_AGENTS agents in parallel with Pool() as pool: - pool.map(add_agent_server, zip(repeat(mets_server_url), list(range(NO_AGENTS)))) + pool.map(add_agent_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + list(range(NO_AGENTS)))) assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before # XXX not a tuple assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'} - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.agents) == no_agents_before # sync @@ -142,7 +152,7 @@ def test_mets_server_add_agents(start_mets_server): def test_mets_server_str(start_mets_server): mets_server_url, workspace_server = start_mets_server - workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url) f = next(workspace_server.find_files()) assert str(f) == '' a = workspace_server.mets.agents[0] @@ -182,7 +192,7 @@ def test_mets_server_socket_stop(start_mets_server): assert True, 'No stop conditions to test for TCP server' else: assert Path(mets_server_url).exists() - assert workspace_server.mets.workspace_path == WORKSPACE_DIR + assert workspace_server.mets.workspace_path == workspace_server.directory workspace_server.mets.stop() with raises(ConnectionError): workspace_server.mets.file_groups From ccb416b13e7f91781568fda8e60ad8182bfea88c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:04:04 +0200 Subject: [PATCH 038/191] disableLogging: re-instate root logger, to --- src/ocrd_utils/logging.py | 4 +++- tests/test_decorators.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index bb771fc0ce..8f45f9c7fc 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -212,11 +212,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: + for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in 
logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: logging.getLogger(logger_name).setLevel(logging.NOTSET) + # Python default log level is WARNING + logging.root.setLevel(logging.WARNING) # Initializing stream handlers at module level # would cause message output in all runtime contexts, diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 5ab2880053..df8d6422be 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -64,6 +64,7 @@ def test_loglevel_override(self): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging disableLogging() + assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() assert logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO From 7e3cdf4ec014efe5b4cddb8d9554981f9181a6d5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:15:56 +0200 Subject: [PATCH 039/191] test-logging: also remove ocrd.log from tempdir --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4997066d1b..b5cd2f276e 100644 --- a/Makefile +++ b/Makefile @@ -273,7 +273,7 @@ test-logging: assets cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \ cd $$tempdir; \ $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \ - rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \ + rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \ rm -rf $$tempdir/.coverage; \ rmdir $$tempdir From 4f45b12027fb0d53301dbbf17e2dcfa5637a1497 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:34 +0200 Subject: [PATCH 040/191] bashlib: re-add --log-filename, implement as stderr redirect --- src/ocrd/lib.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 1e3ecfc6eb..febaf92ae6 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -141,6 +141,7 @@ ocrd__parse_argv () { while [[ "${1:-}" = -* ]];do case "$1" in -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; + --log-filename) exec 2> "$2" ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; From 7b70c90957bd8fe4ccfa78328ff860cff69cc87b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:01 +0200 Subject: [PATCH 041/191] ocrd_utils.config: add reset_defaults() --- src/ocrd_utils/config.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 063af930c8..4182456435 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -68,14 +68,26 @@ def has_default(self, name): raise ValueError(f"Unregistered env variable {name}") return self._variables[name].has_default + def reset_defaults(self): + for name in self._variables: + try: + # we cannot use hasattr, because that delegates to getattr, + # which we override and provide defaults for (which of course + # cannot be removed) + if self.__getattribute__(name): + delattr(self, name) + except AttributeError: + pass + def describe(self, name, *args, **kwargs): if not name in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, 
name): + # will be called if name is not accessible (has not been added directly yet) if not name in self._variables: - raise ValueError(f"Unregistered env variable {name}") + raise AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: raw_value = self.raw_value(name) From 48bb3c2316e6838ff235a2badc985da14ee8b1b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:31 +0200 Subject: [PATCH 042/191] add test for OcrdEnvConfig.reset_defaults() --- tests/utils/test_config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 99595a864c..a94eb5d3cc 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -57,3 +57,11 @@ def test_OCRD_PROFILE(): with temp_env_var('OCRD_PROFILE', 'some other value'): with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"): config.OCRD_PROFILE + +def test_defaults(): + default = config.OCRD_MAX_PROCESSOR_CACHE + print(type(default)) + config.OCRD_MAX_PROCESSOR_CACHE = 2 + assert config.OCRD_MAX_PROCESSOR_CACHE == 2 + config.reset_defaults() + assert config.OCRD_MAX_PROCESSOR_CACHE == default From ed924032cc959c15f5f6fdd5a2cb34efa4d925a6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 10:14:13 +0200 Subject: [PATCH 043/191] Workspace.reload_mets: fix for METS server case --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 8b8e89bfca..4ef59252a0 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -123,7 +123,10 @@ def reload_mets(self): """ Reload METS from the filesystem. """ - self.mets = OcrdMets(filename=self.mets_target) + if self.is_remote: + self.mets.reload() + else: + self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") From 9c3c3997b5039ca68192d7046808aa5d1cfb83cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 14:59:42 +0200 Subject: [PATCH 044/191] OcrdMetsServer.add_file: pass on 'force' kwarg, too --- src/ocrd/mets_server.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 9b66871349..8a18f01682 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -284,15 +284,17 @@ def add_file( file_id=ID, page_id=pageId, mimetype=mimetype, url=url, local_filename=local_filename ) + # add force+ignore + kwargs = {**kwargs, **data.dict()} if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) - if not r: - raise RuntimeError("Add file failed. 
Please check provided parameters") + r = self.session.request("POST", f"{self.url}/file", data=kwargs) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}") else: - r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) - if "error" in r: - raise RuntimeError(f"Add file failed: Msg: {r['error']}") + r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs)) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}") return ClientSideOcrdFile( None, fileGrp=file_grp, @@ -506,7 +508,8 @@ async def add_file( page_id: Optional[str] = Form(), mimetype: str = Form(), url: Optional[str] = Form(None), - local_filename: Optional[str] = Form(None) + local_filename: Optional[str] = Form(None), + force: bool = Form(False), ): """ Add a file @@ -518,7 +521,7 @@ async def add_file( ) # Add to workspace kwargs = file_resource.dict() - workspace.add_file(**kwargs) + workspace.add_file(**kwargs, force=force) return file_resource # ------------- # From c077e957f256c21ec46c2b18cf5881e815a55fac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 15:00:38 +0200 Subject: [PATCH 045/191] test_mets_server: add test for force (overwrite) --- tests/test_mets_server.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 8f94b95645..dc94d6c560 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,7 +55,7 @@ def _start_mets_server(*args, **kwargs): p.terminate() rmtree(tmpdir, ignore_errors=True) -def add_file_server(x): +def add_file_server(x, force=False): mets_server_url, directory, i = x workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( @@ -65,6 +65,7 @@ def add_file_server(x): page_id=f'page{i}', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' + force=force ) def add_agent_server(x): @@ -123,6 +124,19 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES +def test_mets_server_add_file_overwrite(start_mets_server): + mets_server_url, workspace_server = start_mets_server + + add_file_server((mets_server_url, workspace_server.directory, 5)) + + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + + with raises(RuntimeError, match="already exists"): + add_file_server((mets_server_url, workspace_server.directory, 5)) + + add_file_server((mets_server_url, workspace_server.directory, 5), force=True) + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + def test_mets_server_add_agents(start_mets_server): NO_AGENTS = 30 From 4492168ddabaf835b70c91602f905469c4ce6f3d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:59:51 +0200 Subject: [PATCH 046/191] PcGts.Page.id / make_xml_id: replace '/' with '_' --- src/ocrd_utils/str.py | 3 ++- tests/model/test_ocrd_page.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index dea3715bf4..b3d3ef496f 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -105,10 +105,11 @@ def make_xml_id(idstr: str) -> str: ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') + ret = ret.replace('/', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return 
the last /-delimited segment of a URL-like string diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7dc130809f..97335775d6 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -460,7 +460,7 @@ def test_id(): # TODO: is this *really* desired? # I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName - assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif' if __name__ == '__main__': From 83d52d888a4d403c3ce35a7db50c90db83253f7e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 16:32:55 +0200 Subject: [PATCH 047/191] METS Server: also export+delegate physical_pages --- src/ocrd/mets_server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 8a18f01682..c85368e305 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -88,6 +88,14 @@ def create(file_groups: List[str]): return OcrdFileGroupListModel(file_groups=file_groups) +class OcrdPageListModel(BaseModel): + physical_pages: List[str] = Field() + + @staticmethod + def create(physical_pages: List[str]): + return OcrdPageListModel(physical_pages=physical_pages) + + class OcrdAgentListModel(BaseModel): agents: List[OcrdAgentModel] = Field() @@ -210,6 +218,17 @@ def workspace_path(self): ).json()["text"] return self.ws_dir_path + @property + def physical_pages(self) -> List[str]: + if not self.multiplexing_mode: + return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"] + else: + return self.session.request( + "POST", + self.url, + json=MpxReq.physical_pages(self.ws_dir_path) + ).json()["physical_pages"] + @property def file_groups(self): if not self.multiplexing_mode: @@ -349,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={}) + @staticmethod + def physical_pages(ws_dir_path: str) -> Dict: + return MpxReq.__args_wrapper( + ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={}) + @staticmethod def file_groups(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( @@ -469,6 +493,10 @@ async def unique_identifier(): async def workspace_path(): return Response(content=workspace.directory, media_type="text/plain") + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + return {'physical_pages': workspace.mets.physical_pages} + @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): return {'file_groups': workspace.mets.file_groups} From 4eccefc43b39e26337d0542e633fda077097d079 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:08 +0200 Subject: [PATCH 048/191] ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) --- src/ocrd/cli/workspace.py | 87 ++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 062a373608..6add3f839f 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -37,6 +37,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url) 
self.automatic_backup = automatic_backup + def workspace(self): + return Workspace( + self.resolver, + directory=self.directory, + mets_basename=self.mets_basename, + automatic_backup=self.automatic_backup, + mets_server_url=self.mets_server_url, + ) + def backup_manager(self): + return WorkspaceBackupManager(self.workspace()) + pass_workspace = click.make_pass_decorator(WorkspaceCtx) @@ -139,6 +150,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -174,10 +186,11 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, - clobber_mets=clobber_mets + clobber_mets=clobber_mets, ) workspace.save_mets() print(workspace.directory) @@ -201,13 +214,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ Add a file or http(s) URL FNAME to METS in a workspace. If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace. """ - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() log = getLogger('ocrd.cli.workspace.add') if not mimetype: @@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() try: pat = re.compile(regex) @@ -454,13 +455,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"} output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False - ret = list() - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + ret = [] + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) workspace.save_mets() @@ -528,7 +524,7 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) workspace.save_mets() @@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob): If no PATH_GLOB are specified, then all files and directories may match. """ - log = getLogger('ocrd.cli.workspace.clean') - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)] allowed_files.append(relpath(workspace.mets_target, start=workspace.directory)) allowed_dirs = set(dirname(path) for path in allowed_files) @@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_files: continue if dry_run: - log.info('unlink(%s)' % path) + ctx.log.info('unlink(%s)' % path) else: unlink(path) if not directories: @@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_dirs: continue if dry_run: - log.info('rmdir(%s)' % path) + ctx.log.info('rmdir(%s)' % path) else: rmdir(path) @@ -651,7 +646,7 @@ def list_groups(ctx): """ List fileGrp USE attributes """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() print("\n".join(workspace.mets.file_groups)) # ---------------------------------------------------------------------- @@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() find_kwargs = {} if page_id_range and 'ID' in output_field: find_kwargs['pageId'] = page_id_range @@ -724,7 +719,7 @@ def get_id(ctx): """ Get METS id if any """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() ID = workspace.mets.unique_identifier if ID: print(ID) @@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin Otherwise will create a new {{ ID }}. 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.unique_identifier = id workspace.save_mets() @@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() except Exception as err: @@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( other_workspace, @@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa # ---------------------------------------------------------------------- @workspace_cli.group('backup') -@click.pass_context +@pass_workspace def workspace_backup_cli(ctx): # pylint: disable=unused-argument """ Backing and restoring workspaces - dev edition """ + assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server" @workspace_backup_cli.command('add') @pass_workspace @@ -841,7 +837,7 @@ def workspace_backup_add(ctx): """ Create a new backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.add() @workspace_backup_cli.command('list') @@ -850,7 +846,7 @@ def workspace_backup_list(ctx): """ List backups """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() for b in backup_manager.list(): print(b) @@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak): """ Restore backup BAK """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.restore(bak, choose_first) @workspace_backup_cli.command('undo') @@ -871,7 +867,7 @@ def workspace_backup_undo(ctx): """ Restore the last backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.undo() @@ -888,13 +884,8 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument @workspace_serve_cli.command('stop') @pass_workspace def workspace_serve_stop(ctx): # pylint: disable=unused-argument - """Stop the METS server""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) + """Stop the METS server (saving changes to disk)""" + workspace = ctx.workspace() workspace.mets.stop() 
@workspace_serve_cli.command('start') From 083df27664f4a40eb2d2baddcbb6bf0fd214df5d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:32 +0200 Subject: [PATCH 049/191] ocrd.cli.workspace server: add 'reload' and 'save' --- src/ocrd/cli/workspace.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 6add3f839f..ff4aeef7c5 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -888,6 +888,20 @@ def workspace_serve_stop(ctx): # pylint: disable=unused-argument workspace = ctx.workspace() workspace.mets.stop() +@workspace_serve_cli.command('reload') +@pass_workspace +def workspace_serve_reload(ctx): # pylint: disable=unused-argument + """Reload the METS server from disk""" + workspace = ctx.workspace() + workspace.mets.reload() + +@workspace_serve_cli.command('save') +@pass_workspace +def workspace_serve_save(ctx): # pylint: disable=unused-argument + """Save the METS changes to disk""" + workspace = ctx.workspace() + workspace.mets.save() + @workspace_serve_cli.command('start') @pass_workspace def workspace_serve_start(ctx): # pylint: disable=unused-argument From b2c01610bffd277ef7a3345427ff016280efc3a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:36:03 +0200 Subject: [PATCH 050/191] ocrd.cli.validate tasks: pass on --mets-server-url, too --- src/ocrd/cli/validate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d053..9d0cafd064 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -102,16 +102,19 @@ def validate_page(page, **kwargs): @validate_cli.command('tasks') @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. 
If omitted, only validate syntax') @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace') +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server') @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.') @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.argument('tasks', nargs=-1, required=True) -def validate_process(tasks, workspace, mets_basename, overwrite, page_id): +def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id): ''' Validate a sequence of tasks passable to 'ocrd process' ''' if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], - Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite)) + _inform_of_result(validate_tasks( + [ProcessorTask.parse(t) for t in tasks], + Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url), + page_id=page_id, overwrite=overwrite)) else: for t in [ProcessorTask.parse(t) for t in tasks]: _inform_of_result(t.validate()) From 203a06a2a36ac5a74a5ab73ba9c693902e89fc38 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:47:14 +0200 Subject: [PATCH 051/191] run_processor: be robust if ocrd_tool is missing steps --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 921cfeac80..fb5ca1bb0f 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -98,7 +98,7 @@ def run_processor( ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) - otherrole = ocrd_tool['steps'][0] + otherrole = ocrd_tool.get('steps', [''])[0] logProfile = getLogger('ocrd.process.profile') log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() From 4fbdd00439b9121dd5f01dd6b4ba2d5f24c251ae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:38:11 +0200 Subject: [PATCH 052/191] lib.bash: fix errexit --- src/ocrd/lib.bash | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index febaf92ae6..745bc52fe4 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,6 +27,7 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { + set -e local minversion="$1" local version=$(ocrd --version|sed 's/ocrd, version //') #echo "$minversion < $version?" 
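# Relating back to [PATCH 050/191] above: `ocrd validate tasks --mets-server-url ...` now
# checks a task chain against a workspace held by a METS Server. A rough Python sketch of
# that code path (illustration only; module path, workspace directory and socket path are
# assumptions):
from ocrd import Resolver, Workspace
from ocrd.task_sequence import ProcessorTask, validate_tasks

tasks = [ProcessorTask.parse(t) for t in (
    "cis-ocropy-binarize -I OCR-D-IMG -O OCR-D-BIN",
    "tesserocr-recognize -I OCR-D-BIN -O OCR-D-OCR",
)]
ws = Workspace(Resolver(), directory='/path/to/workspace', mets_server_url='/tmp/mets.sock')
report = validate_tasks(tasks, ws, page_id=None, overwrite=False)
print(report.is_valid, report.errors)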
@@ -108,6 +109,7 @@ ocrd__usage () { ## declare -A ocrd__argv=() ## ``` ocrd__parse_argv () { + set -e # if [[ -n "$ZSH_VERSION" ]];then # print -r -- ${+ocrd__argv} ${(t)ocrd__argv} @@ -250,6 +252,7 @@ $params_parsed" } ocrd__wrap () { + set -e declare -gx OCRD_TOOL_JSON="$1" declare -gx OCRD_TOOL_NAME="$2" From c86507951e85ab13412cb6264841272f809ba07e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:03:43 +0200 Subject: [PATCH 053/191] tests: make sure ocrd_utils.config gets reset whenever changing it globally --- tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++-- tests/test_decorators.py | 6 +++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 3a47d2c23f..f2261d0ffb 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -6,8 +6,9 @@ from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.test_mets_server import fixture_start_mets_server -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging, config from ocrd.resolver import Resolver from ocrd.processor.base import Processor, run_processor, run_cli @@ -28,6 +29,10 @@ def setUp(self): self.workspace = self.resolver.workspace_from_url('mets.xml') self.addCleanup(stack.pop_all().close) + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_incomplete_processor(self): proc = IncompleteProcessor(None) with self.assertRaises(NotImplementedError): @@ -242,7 +247,29 @@ class ZipTestProcessor(Processor): pass proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err + +def test_run_output_metsserver(start_mets_server): + mets_server_url, ws = start_mets_server + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + ws.overwrite_mode = True + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + ws.overwrite_mode = False + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert "already exists" in str(exc.value) + if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index df8d6422be..c36577020a 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import 
pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config @click.command() @ocrd_cli_options @@ -45,6 +45,10 @@ def setUp(self): super().setUp() disableLogging() + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_minimal(self): exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) print(out, err) From 1a13cd394fd7f8a0a12259f7aefc0c3e1b1c8acc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:55:41 +0200 Subject: [PATCH 054/191] ocrd.cli.workspace: assert non-server in cmds mutating METS --- src/ocrd/cli/workspace.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ff4aeef7c5..415b8e6e2f 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -150,7 +150,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -186,7 +187,8 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, @@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) @@ -524,6 +528,8 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ + assert not ctx.mets_server_url, \ + f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) @@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" + assert not ctx.mets_server_url, \ + f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( @@ -762,6 +772,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + assert not ctx.mets_server_url, \ + f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() @@ -800,6 +812,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) + assert not ctx.mets_server_url, \ + f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( From bba597e1d5d4fe72044fb1024de548906cd599d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:37 +0200 Subject: [PATCH 055/191] OcrdPage: add PageType.get_ReadingOrderGroups() --- src/ocrd_page_user_methods.py | 1 + .../get_ReadingOrderGroups.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/ocrd_page_user_methods/get_ReadingOrderGroups.py diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index 8a2332e6e5..fe22dd89ab 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), _add_method(r'^(PageType)$', 'get_AllTextLines'), + _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py new file mode 100644 index 0000000000..e7d6c02b77 --- /dev/null +++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py @@ -0,0 +1,33 @@ +def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) From fa0fadaa536c0daed62abb136dad9a0af15d2e5c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:58 +0200 Subject: [PATCH 056/191] update OcrdPage from generateds --- src/ocrd_models/ocrd_page_generateds.py | 55 ++++++++++++++++++++----- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index 6fef4c8635..f2b7c0551e 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,30 +2,28 @@ # -*- coding: utf-8 -*- # -# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20. -# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0] +# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. 
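# A minimal usage sketch for the new PageType.get_ReadingOrderGroups() from [PATCH 055/191]
# above (illustration only; the input file name is an assumption). The result maps each
# referenced region id to the ReadingOrder element (RegionRef[Indexed] or
# (Un)OrderedGroup[Indexed]) that refers to it.
from ocrd_models.ocrd_page import parse

pcgts = parse('OCR-D-SEG/OCR-D-SEG_0001.xml', silence=True)
groups = pcgts.get_Page().get_ReadingOrderGroups()
for region_id, ref in groups.items():
    print(region_id, ref.__class__.__name__, getattr(ref, 'index', None))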
+# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: # ('-f', '') # ('--root-element', 'PcGts') -# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py') +# ('-o', 'src/ocrd_models/ocrd_page_generateds.py') # ('--silence', '') # ('--export', 'write etree') # ('--disable-generatedssuper-lookup', '') -# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py') +# ('--user-methods', 'src/ocrd_page_user_methods.py') # # Command line arguments: -# ocrd_validators/ocrd_validators/page.xsd +# src/ocrd_validators/page.xsd # # Command line: -# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd +# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd # # Current working directory (os.getcwd()): # core # -# type: ignore - from itertools import zip_longest import os import sys @@ -223,7 +221,7 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - raise_parse_error(node, 'Requires sequence of integer values') + raise_parse_error(node, 'Requires sequence of integer valuess') return values def gds_format_float(self, input_data, input_name=''): return ('%.15f' % input_data).rstrip('0') @@ -1230,9 +1228,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True): """ Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document. @@ -3116,9 +3115,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') @@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True) ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret + def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) def set_orientation(self, orientation): """ Set deskewing angle to given `orientation` number. From 8c566d76fce9940626e358370a31abc7ca5322e6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 057/191] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index c3fb11f600..de068567e2 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -599,7 +599,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From dd62418e55fae345d0613d4813432cb0d25ec135 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 058/191] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ca4e8629db..4baab8f934 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -673,19 +673,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) 
""" workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 1cfa6e309ca4591f55864e13bdecc7806646262e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:44:45 +0200 Subject: [PATCH 059/191] Processor.process_page_file: avoid process_page_pcgts() if OCRD_EXISTING_OUTPUT!=OVERWRITE --- src/ocrd/processor/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 26ea532d16..28cbaf7269 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -618,6 +618,12 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' From f678dca0e42b66d5742209ffb692103fa7f15528 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 060/191] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index c3fb11f600..de068567e2 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -599,7 +599,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From 9064db01380cfca0327320cfcfa7c0fd02e2cb21 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 061/191] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git 
a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 05b37b6bcc..77797b3037 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -683,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) """ workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 9530fcd346357d23f6e914534f87436c206fa038 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:44:45 +0200 Subject: [PATCH 062/191] Processor.process_page_file: avoid process_page_pcgts() if OCRD_EXISTING_OUTPUT!=OVERWRITE --- src/ocrd/processor/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 26ea532d16..28cbaf7269 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -618,6 +618,12 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: # not PAGE and not an image to generate PAGE for self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) result = self.process_page_pcgts(*input_pcgts, page_id=page_id) for image_result in result.images: image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' From 9641d4abc5436fb2925bc288790984cd0239f80b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 063/191] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 66251a54dc..9eedf9fa34 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -598,7 +598,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From 19ce7d992f567129af74f858e9f0f1ccd8482fce Mon 
Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 064/191] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 415b8e6e2f..f66a1e3360 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -683,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) """ workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 372f7259cc7a53d211a4ac072d91f335eeb41bf0 Mon Sep 17 00:00:00 2001 From: Markus Barth Date: Fri, 27 Sep 2024 09:48:42 +0200 Subject: [PATCH 065/191] Added space after %U in imagemagick identify format prameter. --- src/ocrd_models/ocrd_exif.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 406e60a85a..b5701771a5 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -48,11 +48,11 @@ def run_identify(self, img): for prop in ['compression', 'photometric_interpretation']: setattr(self, prop, img.info[prop] if prop in img.info else None) if img.filename: - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', img.filename], check=False, stderr=PIPE, stdout=PIPE) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], check=False, stderr=PIPE, stdout=PIPE) else: with BytesIO() as bio: img.save(bio, format=img.format) - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) if ret.returncode: stderr = ret.stderr.decode('utf-8') if 'no decode delegate for this image format' in stderr: From 44deb80434dbcf40289f7ce451e416f5f021a54d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 20:11:36 +0200 Subject: [PATCH 066/191] ocrd_exif: add multi-frame TIFF example --- tests/model/test_exif.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/model/test_exif.py b/tests/model/test_exif.py index f6771fb8ee..077247521c 100644 --- a/tests/model/test_exif.py +++ b/tests/model/test_exif.py @@ -24,7 +24,10 @@ ('leptonica_samples/data/OCR-D-IMG/OCR-D-IMG_1555_007.jpg', 944, 1472, 1, 1, 1, 'inches', 'RGB', None), ('kant_aufklaerung_1784-jp2/data/OCR-D-IMG/INPUT_0020.jp2', - 1457, 2084, 1, 1, 1, 'inches', 'RGB', None) + 1457, 2084, 1, 1, 1, 'inches', 'RGB', None), + # 
tolerate multi-frame TIFF: + ('gutachten/data/IMG/IMG_1.tif', + 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw') ]) def test_ocrd_exif(path, width, height, xResolution, yResolution, resolution, resolutionUnit, photometricInterpretation, compression): """Check EXIF attributes for different input formats From 606915ba9e796b7e5642ac8f6cdf86ac8bcccbf3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:02:56 +0200 Subject: [PATCH 067/191] disableLogging: clearer comment Co-authored-by: Konstantin Baierer --- src/ocrd_utils/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 8f45f9c7fc..ac2b3416a4 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -211,7 +211,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): _initialized_flag = False # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger + # remove all handlers for the 'ocrd.' and root logger for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) From 3b908a678f524b37d406022bb05b76515d8303f6 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 17:02:44 +0200 Subject: [PATCH 068/191] :memo: changelog --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 351f5a56aa..0d759cb03d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,36 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.69.0] - 2024-09-30 + +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` + - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters + - `lib.bash`: fix `errexit` handling + - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `disableLogging`: also re-instate root logger to Python defaults + - actually apply CLI `--log-filename`, and show in `--help` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + - :fire: `OcrdMets.add_agent` without positional arguments + +Changed: + - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - METS Server: export and delegate `physical_pages` + - processor CLI: delegate `--resolve-resource`, too + * `OcrdConfig.reset_defaults` to reset config variables to their defaults + ## [2.68.0] - 2024-08-23 Changed: @@ -2164,6 +2194,7 @@ Fixed Initial Release +[2.69.0]: ../../compare/v2.69.0..v2.68.0 [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 From 343a66afcb259d0cafaffdff3e050547f9f8d314 
Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 17:16:54 +0200 Subject: [PATCH 069/191] :memo: changelog: remove spurious entries --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d759cb03d..88f6b6cadc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,6 @@ Fixed: - `OcrdMetsServer.add_file`: pass on `force` kwarg - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` - - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` - - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters - `lib.bash`: fix `errexit` handling - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result - `Workspace.reload_mets`: handle ClientSideOcrdMets as well @@ -24,7 +22,6 @@ Fixed: - :fire: `OcrdMets.add_agent` without positional arguments Changed: - - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) - `run_processor`: be robust if `ocrd_tool` is missing `steps` - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` From f808b726227d5502426b29dd7ab3a97af83a75e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 30 Sep 2024 17:46:34 +0200 Subject: [PATCH 070/191] :memo: update changelog again --- CHANGELOG.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f6b6cadc..d058ebce96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,28 +9,31 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - - `OcrdMetsServer.add_file`: pass on `force` kwarg - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.workspace`: make `list-page` work w/ METS Server - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` - `lib.bash`: fix `errexit` handling - - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result - - `Workspace.reload_mets`: handle ClientSideOcrdMets as well - - `disableLogging`: also re-instate root logger to Python defaults - actually apply CLI `--log-filename`, and show in `--help` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) - - :fire: `OcrdMets.add_agent` without positional arguments + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + - `disableLogging`: also re-instate root logger to Python defaults Changed: - `run_processor`: be robust if `ocrd_tool` is missing `steps` - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + - `ClientSideOcrdMets`: use same logger name prefix as METS Server + - `Processor.zip_input_files`: when `--page-id` yields empty list, just log instead of raise Added: - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict - - ocrd.cli.workspace `server`: add subcommands `reload` and `save` - METS Server: export and delegate `physical_pages` + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` - processor CLI: delegate `--resolve-resource`, too - * `OcrdConfig.reset_defaults` to reset config variables to their defaults + - `OcrdConfig.reset_defaults` to reset config variables to 
their defaults + - `ocrd_utils.scale_coordinates` for resizing images ## [2.68.0] - 2024-08-23 From 4d25fcfa63c98dec7a66fcf5fdf7c959e6bb2713 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:11:18 +0200 Subject: [PATCH 071/191] update assets --- repo/assets | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/assets b/repo/assets index 05568aaa2d..ca108faf0e 160000 --- a/repo/assets +++ b/repo/assets @@ -1 +1 @@ -Subproject commit 05568aaa2dc20678bf87ffec77f3baf2924d7c24 +Subproject commit ca108faf0e95cc823a9e84cd0a1602282ae006b1 From bdfb41080a8291f3f87280669c684f8a191cb7d5 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:17:03 +0200 Subject: [PATCH 072/191] test_exif: add example provided by @mexthecat --- tests/model/test_exif.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/model/test_exif.py b/tests/model/test_exif.py index 077247521c..18c5e4c467 100644 --- a/tests/model/test_exif.py +++ b/tests/model/test_exif.py @@ -27,7 +27,10 @@ 1457, 2084, 1, 1, 1, 'inches', 'RGB', None), # tolerate multi-frame TIFF: ('gutachten/data/IMG/IMG_1.tif', - 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw') + 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw'), + # multi-frame TIFF with metric pixel density (is actually YCBCR not RGB but Pillow thinks otherwise...) + ('indian-ferns/data/OCR-D-IMG/0004.tif', + 2626, 3620, 28, 28, 28, 'cm', 'RGB', 'jpeg'), ]) def test_ocrd_exif(path, width, height, xResolution, yResolution, resolution, resolutionUnit, photometricInterpretation, compression): """Check EXIF attributes for different input formats From e6d1f857d4e53e5d9658e90dc87e761f9a13bc63 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:32:14 +0200 Subject: [PATCH 073/191] :memo: changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d058ebce96..3b1036ab84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,6 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased -## [2.69.0] - 2024-09-30 - Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` @@ -20,6 +18,7 @@ Fixed: - `Workspace.reload_mets`: handle ClientSideOcrdMets as well - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` - `disableLogging`: also re-instate root logger to Python defaults + - `OcrdExif`: handle multi-frame TIFFs gracefully in `identify` callout, #1276 Changed: - `run_processor`: be robust if `ocrd_tool` is missing `steps` From ff81c6b571852ed44523d305eb4a566e461be386 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 18:32:30 +0200 Subject: [PATCH 074/191] :package: v2.69.0 --- CHANGELOG.md | 2 ++ VERSION | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b1036ab84..34ec973570 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [2.69.0] - 2024-09-30 + Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` diff --git a/VERSION b/VERSION index 0f1ddc8105..a740b92f5e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.68.0 +2.69.0 From f44e28b13328f8060f921a9686ebd47aef49cb1e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:32:47 +0200 Subject: [PATCH 075/191] introduce: OCRD_NETWORK_CLIENT_POLLING_PRINT --- src/ocrd_network/client.py | 10 +++++++--- src/ocrd_network/client_utils.py | 14 +++++++++----- src/ocrd_utils/config.py | 7 ++++++- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 8ec8e541ea..c45aa3ecf3 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -19,7 +19,8 @@ def __init__( self, server_addr_processing: Optional[str], timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT, - wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP + wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP, + print_output: bool = config.OCRD_NETWORK_CLIENT_POLLING_PRINT ): self.log = getLogger(f"ocrd_network.client") if not server_addr_processing: @@ -29,6 +30,7 @@ def __init__( self.polling_timeout = timeout self.polling_wait = wait self.polling_tries = int(timeout / wait) + self.polling_print_output = print_output def check_deployed_processors(self): return get_ps_deployed_processors(ps_server_host=self.server_addr_processing) @@ -48,11 +50,13 @@ def check_workflow_status(self, workflow_job_id: str): def poll_job_status(self, job_id: str) -> str: return poll_job_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_output=self.polling_print_output) def poll_workflow_status(self, job_id: str) -> str: return poll_wf_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_output=self.polling_print_output) def send_processing_job_request(self, processor_name: str, req_params: dict) -> str: return post_ps_processing_request( diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 9b924c16a4..3ebe8d3b87 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -3,7 +3,7 @@ from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int): +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool): if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -13,18 +13,22 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries job_state = get_ps_processing_job_status(ps_server_host, job_id) if job_type == "workflow": job_state = get_ps_workflow_job_status(ps_server_host, job_id) + if print_output: + print(f"State of the {job_type} job {job_id}: {job_state}") if job_state == JobState.success or job_state == JobState.failed: break tries -= 1 
return job_state -def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait) +def poll_job_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_output) -def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait) +def poll_wf_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_output) def get_ps_deployed_processors(ps_server_host: str): diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 4182456435..ab058c7830 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -160,13 +160,18 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP", description="How many seconds to sleep before trying again.", parser=int, - default=(True, 30)) + default=(True, 10)) config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT", description="Timeout for a blocking ocrd network client (in seconds).", parser=int, default=(True, 3600)) +config.add("OCRD_NETWORK_CLIENT_POLLING_PRINT", + description="Timeout for a blocking ocrd network client (in seconds).", + parser=bool, + default=(True, False)) + config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW", description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).", default=(True, '')) From 7177eb147f6234417e20dbeeba7c0f707375cd02 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:35:50 +0200 Subject: [PATCH 076/191] fix: config value description --- src/ocrd_utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index ab058c7830..03d654bc74 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -168,7 +168,7 @@ def _ocrd_download_timeout_parser(val): default=(True, 3600)) config.add("OCRD_NETWORK_CLIENT_POLLING_PRINT", - description="Timeout for a blocking ocrd network client (in seconds).", + description="Whether the blocking client commands should print status output each iteration.", parser=bool, default=(True, False)) From df8e8eede7548f74f195b884559a73b600de2f4a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:41:53 +0200 Subject: [PATCH 077/191] add default value param to preserver backwards compatibility --- src/ocrd_network/client_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 3ebe8d3b87..d3534b4b3f 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -3,7 +3,8 @@ from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool): +def _poll_endpoint_status( + ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool = False): if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") 
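# What the polling changes from [PATCH 075/191] above amount to for a caller (illustration
# only; server address and job id are assumptions): progress printing is toggled per client
# via OCRD_NETWORK_CLIENT_POLLING_PRINT or the print_output argument; a later commit below
# ("implement feedback") reworks this into a per-call print_state flag.
from ocrd_network.client import Client

client = Client(server_addr_processing="http://localhost:8000",
                timeout=3600, wait=10, print_output=True)
job_id = "..."  # as returned by client.send_processing_job_request(...)
final_state = client.poll_job_status(job_id)  # prints the state each iteration until success, failure or timeout
print(final_state)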
job_state = JobState.unset @@ -22,12 +23,12 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries def poll_job_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_output) def poll_wf_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool) -> JobState: + ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_output) From b183cfcb007d627399b3a18e527c8a3ed298010d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 15:56:25 +0200 Subject: [PATCH 078/191] make -b/--block as flags --- src/ocrd_network/cli/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 9c7f15c88f..39ef62c5fe 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -104,7 +104,7 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str): @click.option('--result-queue-name') @click.option('--callback-url') @click.option('--agent-type', default='worker') -@click.option('-b', '--block', default=False, +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') def send_processing_job_request( address: Optional[str], @@ -176,7 +176,7 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-b', '--block', default=False, +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') def send_workflow_job_request( address: Optional[str], From 342ef3a78f3620ff3e63200b2a9bc4c11639c581 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 16:00:12 +0200 Subject: [PATCH 079/191] implement feedback --- src/ocrd_network/cli/client.py | 8 ++++++-- src/ocrd_network/client.py | 12 +++++------- src/ocrd_network/client_utils.py | 12 ++++++------ src/ocrd_utils/config.py | 5 ----- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 39ef62c5fe..5dd7fd0f78 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -106,6 +106,8 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str): @click.option('--agent-type', default='worker') @click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_processing_job_request( address: Optional[str], processor_name: str, @@ -146,7 +148,7 @@ def send_processing_job_request( assert processing_job_id print(f"Processing job id: {processing_job_id}") if block: - client.poll_job_status(job_id=processing_job_id) + 
client.poll_job_status(job_id=processing_job_id, print_state=print_state) @client_cli.group('workflow') @@ -178,6 +180,8 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): @click.option('-w', '--path-to-workflow', required=True) @click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_workflow_job_request( address: Optional[str], path_to_mets: str, @@ -192,7 +196,7 @@ def send_workflow_job_request( assert workflow_job_id print(f"Workflow job id: {workflow_job_id}") if block: - client.poll_workflow_status(job_id=workflow_job_id) + client.poll_workflow_status(job_id=workflow_job_id, print_state=print_state) @client_cli.group('workspace') diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index c45aa3ecf3..5a6831bea7 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -19,8 +19,7 @@ def __init__( self, server_addr_processing: Optional[str], timeout: int = config.OCRD_NETWORK_CLIENT_POLLING_TIMEOUT, - wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP, - print_output: bool = config.OCRD_NETWORK_CLIENT_POLLING_PRINT + wait: int = config.OCRD_NETWORK_CLIENT_POLLING_SLEEP ): self.log = getLogger(f"ocrd_network.client") if not server_addr_processing: @@ -30,7 +29,6 @@ def __init__( self.polling_timeout = timeout self.polling_wait = wait self.polling_tries = int(timeout / wait) - self.polling_print_output = print_output def check_deployed_processors(self): return get_ps_deployed_processors(ps_server_host=self.server_addr_processing) @@ -48,15 +46,15 @@ def check_job_status(self, job_id: str): def check_workflow_status(self, workflow_job_id: str): return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id) - def poll_job_status(self, job_id: str) -> str: + def poll_job_status(self, job_id: str, print_state: bool) -> str: return poll_job_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, - print_output=self.polling_print_output) + print_state=print_state) - def poll_workflow_status(self, job_id: str) -> str: + def poll_workflow_status(self, job_id: str, print_state: bool) -> str: return poll_wf_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, - print_output=self.polling_print_output) + print_state=print_state) def send_processing_job_request(self, processor_name: str, req_params: dict) -> str: return post_ps_processing_request( diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index d3534b4b3f..87649d5ad4 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -4,7 +4,7 @@ def _poll_endpoint_status( - ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_output: bool = False): + ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_state: bool = False): if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -14,7 +14,7 @@ def _poll_endpoint_status( job_state = get_ps_processing_job_status(ps_server_host, job_id) if job_type == "workflow": job_state = 
get_ps_workflow_job_status(ps_server_host, job_id) - if print_output: + if print_state: print(f"State of the {job_type} job {job_id}: {job_state}") if job_state == JobState.success or job_state == JobState.failed: break @@ -23,13 +23,13 @@ def _poll_endpoint_status( def poll_job_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_output) + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_state) def poll_wf_status_till_timeout_fail_or_success( - ps_server_host: str, job_id: str, tries: int, wait: int, print_output: bool = False) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_output) + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_state) def get_ps_deployed_processors(ps_server_host: str): diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 03d654bc74..d2cc4efce1 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -167,11 +167,6 @@ def _ocrd_download_timeout_parser(val): parser=int, default=(True, 3600)) -config.add("OCRD_NETWORK_CLIENT_POLLING_PRINT", - description="Whether the blocking client commands should print status output each iteration.", - parser=bool, - default=(True, False)) - config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW", description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).", default=(True, '')) From 0e80a7cf84a5db1073ea5ba1363819ed40d16020 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 16:02:30 +0200 Subject: [PATCH 080/191] fix: missed params --- src/ocrd_network/cli/client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 5dd7fd0f78..fd28552866 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -122,7 +122,8 @@ def send_processing_job_request( # TODO: This is temporally available to toggle # between the ProcessingWorker/ProcessorServer agent_type: Optional[str], - block: Optional[bool] + block: Optional[bool], + print_state: Optional[bool] ): """ Submit a processing job to the processing server. @@ -186,7 +187,8 @@ def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, - block: Optional[bool] + block: Optional[bool], + print_state: Optional[bool] ): """ Submit a workflow job to the processing server. 
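A minimal usage sketch of the blocking client polling introduced in the patches above, assuming a reachable Processing Server; the server address, processor name and request parameters below are purely illustrative and not taken from the patches:

    from ocrd_network.client import Client

    client = Client(server_addr_processing="http://localhost:8000")
    # request parameters here are hypothetical placeholders
    job_id = client.send_processing_job_request(
        processor_name="ocrd-dummy",
        req_params={"path_to_mets": "/data/ws/mets.xml", "input_file_grps": "OCR-D-IMG"})
    # block until success, failure or timeout, printing the job state on each polling iteration
    final_state = client.poll_job_status(job_id=job_id, print_state=True)
    print(final_state)

On the command line, the same behaviour corresponds to the new `-b/--block` and `-p/--print-state` flags added to the client CLI above.
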
From d7df20049fe3175e001a1feb60ec42b17ee3a2f0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 1 Oct 2024 16:08:57 +0200 Subject: [PATCH 081/191] fix: integration client tests --- src/ocrd_network/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 5a6831bea7..c4315ded4d 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -46,12 +46,12 @@ def check_job_status(self, job_id: str): def check_workflow_status(self, workflow_job_id: str): return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id) - def poll_job_status(self, job_id: str, print_state: bool) -> str: + def poll_job_status(self, job_id: str, print_state: bool = False) -> str: return poll_job_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, print_state=print_state) - def poll_workflow_status(self, job_id: str, print_state: bool) -> str: + def poll_workflow_status(self, job_id: str, print_state: bool = False) -> str: return poll_wf_status_till_timeout_fail_or_success( ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, print_state=print_state) From 0bfef64ec694e6695f1c95a5fab343c268b25ec0 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 1 Oct 2024 16:25:43 +0200 Subject: [PATCH 082/191] post_ps_workflow_request: pagewise configurable --- src/ocrd_network/cli/client.py | 20 +++++++++++++++++--- src/ocrd_network/client_utils.py | 26 +++++++++++++++++--------- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 9c7f15c88f..a57cb88b82 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -2,6 +2,7 @@ from json import dumps from typing import List, Optional, Tuple from ocrd.decorators.parameter_option import parameter_option, parameter_override_option +from ocrd_network.constants import JobState from ocrd_utils import DEFAULT_METS_BASENAME from ocrd_utils.introspect import set_json_key_value_overrides from ocrd_utils.str import parse_json_string_or_file @@ -176,23 +177,36 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-b', '--block', default=False, +@click.option('-p/-P', '--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") +@click.option('-b', '--block', is_flag=True, default=False, help='If set, the client will block till job timeout, fail or success.') def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, + page_wise : bool, block: Optional[bool] ): """ Submit a workflow job to the processing server. 
""" client = Client(server_addr_processing=address) - workflow_job_id = client.send_workflow_job_request(path_to_wf=path_to_workflow, path_to_mets=path_to_mets) + workflow_job_id = client.send_workflow_job_request( + path_to_wf=path_to_workflow, + path_to_mets=path_to_mets, + page_wise=page_wise, + ) assert workflow_job_id print(f"Workflow job id: {workflow_job_id}") if block: - client.poll_workflow_status(job_id=workflow_job_id) + print(f"Polling state of workflow job {workflow_job_id}") + state = client.poll_workflow_status(job_id=workflow_job_id) + if state != JobState.success: + print(f"Workflow failed with {state}") + exit(1) + else: + print(f"Workflow succeeded") + exit(0) @client_cli.group('workspace') diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 9b924c16a4..24f3da105c 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -1,9 +1,10 @@ +import json from requests import get as request_get, post as request_post from time import sleep from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int): +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int) -> JobState: if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -47,22 +48,21 @@ def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str): return response -def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str: +def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> JobState: request_url = f"{ps_server_host}/processor/job/{processing_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state - + return getattr(JobState, job_state.lower()) -def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str: +def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> JobState: request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state + return getattr(JobState, job_state.lower()) def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str: @@ -79,8 +79,13 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d # TODO: Can be extended to include other parameters such as page_wise -def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str: - request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True" +def post_ps_workflow_request( + ps_server_host: str, + path_to_wf: str, + path_to_mets: str, + page_wise : bool, +) -> str: + request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( url=request_url, headers={"accept": "application/json; charset=utf-8"}, @@ -88,8 +93,11 @@ def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, 
path_to_mets: ) # print(response.json()) # print(response.__dict__) + json_resp_raw = response.text + # print(f'post_ps_workflow_request >> {response.status_code}') + # print(f'post_ps_workflow_request >> {json_resp_raw}') assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" - wf_job_id = response.json()["job_id"] + wf_job_id = json.loads(json_resp_raw)["job_id"] assert wf_job_id return wf_job_id From 1f5c4bbb756d05c55758968a610aa810111cbf48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:16:17 +0200 Subject: [PATCH 083/191] Dockerfile.cuda-torch: do NOT rm /build/core since we installed core in editable mode! --- Dockerfile.cuda-torch | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile.cuda-torch b/Dockerfile.cuda-torch index 8d6c3aa624..59ce1144be 100644 --- a/Dockerfile.cuda-torch +++ b/Dockerfile.cuda-torch @@ -9,7 +9,5 @@ RUN make deps-torch WORKDIR /data -RUN rm -fr /build - CMD ["/usr/local/bin/ocrd", "--help"] From 611b6b566e565873648c4a112adbb6d8bedc155d Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 1 Oct 2024 18:01:30 +0200 Subject: [PATCH 084/191] deployer: Remove any pre-existing socket file before starting the server (again) --- src/ocrd_network/runtime_data/deployer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index b956904d07..7b064961c5 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -146,6 +146,11 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: if is_mets_server_running(mets_server_url=str(mets_server_url)): self.log.debug(f"The UDS mets server for {ws_dir_path} is already started: {mets_server_url}") return mets_server_url + elif Path(mets_server_url).is_socket(): + self.log.warning( + f"The UDS mets server for {ws_dir_path} is not running but the socket file exists: {mets_server_url}." 
+ "Removing to avoid any weird behavior before starting the server.") + Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) self.mets_servers[mets_server_url] = pid From 9a71d048dd8ddc1dceba3fa24d34af719690eaf5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:11:07 +0200 Subject: [PATCH 085/191] remove UDS socket files --- src/ocrd/mets_server.py | 2 +- src/ocrd_network/runtime_data/deployer.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index c85368e305..a8f766289c 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -434,7 +434,7 @@ def kill_process(mets_server_pid: int): def shutdown(self): if self.is_uds: if Path(self.url).exists(): - self.log.debug(f'UDS socket {self.url} still exists, removing it') + self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") Path(self.url).unlink() # os._exit because uvicorn catches SystemExit raised by sys.exit _exit(0) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 7b064961c5..90f7c6d5c7 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -165,6 +165,9 @@ def stop_uds_mets_server(self, mets_server_url: str, stop_with_pid: bool = False raise Exception(message) mets_server_pid = self.mets_servers[Path(mets_server_url)] OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) + if Path(mets_server_url).exists(): + self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url}") + Path(mets_server_url).unlink() return # TODO: Reconsider this again # Not having this sleep here causes connection errors From 854403de6ea880c31b82463bba3850c07565327d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:38:07 +0200 Subject: [PATCH 086/191] remove shortcuts for page-wise --- src/ocrd_network/cli/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 6733f893aa..450cce43fb 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -180,7 +180,7 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-p/-P', '--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") +@click.option('--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") @click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') @click.option('-p', '--print-state', default=False, is_flag=True, From 4d01e66229bcd63872f4fd93699aa0084792c02c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:40:19 +0200 Subject: [PATCH 087/191] fix: pass page-wise argument to relevant methods --- src/ocrd_network/cli/client.py | 2 +- src/ocrd_network/client.py | 5 +++-- src/ocrd_network/client_utils.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 
450cce43fb..350cf64b90 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -189,7 +189,7 @@ def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, - page_wise : bool, + page_wise: bool, block: bool, print_state: bool ): diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index c4315ded4d..1521997942 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -60,6 +60,7 @@ def send_processing_job_request(self, processor_name: str, req_params: dict) -> return post_ps_processing_request( ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params) - def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str): + def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool): return post_ps_workflow_request( - ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets) + ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets, + page_wise=page_wise) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index b23442e502..456398ecf8 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -87,7 +87,7 @@ def post_ps_workflow_request( ps_server_host: str, path_to_wf: str, path_to_mets: str, - page_wise : bool, + page_wise: bool, ) -> str: request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( From 97427e07326bddc0ff83e4d1ed5eba4cb6631829 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 10:42:00 +0200 Subject: [PATCH 088/191] Update src/ocrd_network/client_utils.py Co-authored-by: Konstantin Baierer --- src/ocrd_network/client_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 456398ecf8..51db2681a6 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -82,7 +82,6 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d return processing_job_id -# TODO: Can be extended to include other parameters such as page_wise def post_ps_workflow_request( ps_server_host: str, path_to_wf: str, From 745484588ab9c77481397a9daaabee086f7790ee Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:07:19 +0200 Subject: [PATCH 089/191] add endpoint DELETE /workflow/kill-mets-server-zombies to kill -SIGTERM METS servers with ctime > 60mins ago --- src/ocrd/mets_server.py | 5 ++-- src/ocrd_network/processing_server.py | 12 ++++++++++ src/ocrd_network/server_utils.py | 33 +++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index c85368e305..c46a99a2d8 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -1,8 +1,10 @@ """ # METS server functionality """ +import os import re from os import _exit, chmod +import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path @@ -428,8 +430,7 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) - return + return os.kill(mets_server_pid, signal.SIGTERM) def shutdown(self): if self.is_uds: 
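A standalone sketch of the process-termination change above, which replaces the external `kill` subprocess call with a direct `os.kill`; the PID value is hypothetical:

    import os
    import signal

    def kill_process(mets_server_pid: int) -> None:
        # send SIGTERM from Python instead of spawning `kill -s SIGINT <pid>`
        os.kill(mets_server_pid, signal.SIGTERM)

    kill_process(12345)  # hypothetical METS server PID
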
diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..29061c5645 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -48,6 +48,7 @@ get_workflow_content, get_from_database_workspace, get_from_database_workflow_job, + kill_mets_server_zombies, parse_workflow_tasks, raise_http_exception, request_processor_server_tool_json, @@ -314,6 +315,14 @@ def add_api_routes_workflow(self): status_code=status.HTTP_200_OK, summary="Get information about a workflow run" ) + workflow_router.add_api_route( + path="/workflow/kill-mets-server-zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + ) self.include_router(workflow_router) async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict: @@ -817,6 +826,9 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response + async def kill_mets_server_zombies(self) -> None: + kill_mets_server_zombies(minutes_ago=60) + async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ Simplified version of the `get_workflow_info` that returns a single state for the entire workflow. diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 9d8628170c..1897f3a62e 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -1,12 +1,18 @@ +import os +import re +import signal +from pathlib import Path +from json import dumps, loads +from urllib.parse import urljoin +from typing import Dict, List, Union +from time import time + from fastapi import HTTPException, status, UploadFile from fastapi.responses import FileResponse from httpx import AsyncClient, Timeout -from json import dumps, loads from logging import Logger -from pathlib import Path from requests import get as requests_get -from typing import Dict, List, Union -from urllib.parse import urljoin +from requests_unixsocket import sys from ocrd.resolver import Resolver from ocrd.task_sequence import ProcessorTask @@ -241,3 +247,22 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s if group not in available_groups: message = f"Input file group '{group}' of the first processor not found: {input_file_grps}" raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) + + +def kill_mets_server_zombies(minutes_ago=60): + now = time() + cmdline_pat = r'.*ocrd workspace -U.*server start $' + for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): + if not procdir.is_dir(): + continue + cmdline_file = procdir.joinpath('cmdline') + if not cmdline_file.is_file(): + continue + ctime_ago = int((now - procdir.stat().st_ctime) / 60) + if ctime_ago < minutes_ago: + continue + cmdline = cmdline_file.read_text().replace('\x00', ' ') + if re.match(cmdline_pat, cmdline): + pid = procdir.name + print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) + os.kill(int(pid), signal.SIGTERM) From 0506e9d5f5edca7e7f6198ad93c0ac4a04f0061d Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:28:45 +0200 Subject: [PATCH 090/191] move 
mets-zombie killer to / and return list of killed PIDs --- src/ocrd_network/processing_server.py | 21 +++++++++++---------- src/ocrd_network/server_utils.py | 5 ++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 29061c5645..04305a6fbb 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -201,6 +201,14 @@ def add_api_routes_others(self): tags=[ServerApiTags.WORKSPACE], summary="Forward a TCP request to UDS mets server" ) + others_router.add_api_route( + path="/kill-mets-server-zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + ) self.include_router(others_router) def add_api_routes_processing(self): @@ -315,14 +323,6 @@ def add_api_routes_workflow(self): status_code=status.HTTP_200_OK, summary="Get information about a workflow run" ) - workflow_router.add_api_route( - path="/workflow/kill-mets-server-zombies", - endpoint=self.kill_mets_server_zombies, - methods=["DELETE"], - tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], - status_code=status.HTTP_200_OK, - summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." - ) self.include_router(workflow_router) async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict: @@ -826,8 +826,9 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response - async def kill_mets_server_zombies(self) -> None: - kill_mets_server_zombies(minutes_ago=60) + async def kill_mets_server_zombies(self) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=60) + return pids_killed async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 1897f3a62e..b143e344af 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -249,9 +249,10 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago=60): +def kill_mets_server_zombies(minutes_ago=60) -> list[int]: now = time() cmdline_pat = r'.*ocrd workspace -U.*server start $' + ret = [] for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): if not procdir.is_dir(): continue @@ -264,5 +265,7 @@ def kill_mets_server_zombies(minutes_ago=60): cmdline = cmdline_file.read_text().replace('\x00', ' ') if re.match(cmdline_pat, cmdline): pid = procdir.name + ret.append(pid) print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) os.kill(int(pid), signal.SIGTERM) + return ret From ad81356d32178c53814ff1293f35d3dd7827b793 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:31:56 +0200 Subject: [PATCH 091/191] /kill_mets_server_zombies use underscores not slashes --- src/ocrd_network/processing_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/processing_server.py 
b/src/ocrd_network/processing_server.py index 04305a6fbb..505e106ba2 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -202,7 +202,7 @@ def add_api_routes_others(self): summary="Forward a TCP request to UDS mets server" ) others_router.add_api_route( - path="/kill-mets-server-zombies", + path="/kill_mets_server_zombies", endpoint=self.kill_mets_server_zombies, methods=["DELETE"], tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], From 4862d72fe6f7149ff4ce97d56ac870837bafddc5 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 2 Oct 2024 14:41:32 +0200 Subject: [PATCH 092/191] use 3.8 compatible typing --- src/ocrd_network/server_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index b143e344af..773668f5b7 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -249,7 +249,7 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago=60) -> list[int]: +def kill_mets_server_zombies(minutes_ago=60) -> List[int]: now = time() cmdline_pat = r'.*ocrd workspace -U.*server start $' ret = [] From 4f6775f358fdf0c7d3164d30e01ecb63106b4a6a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 2 Oct 2024 15:13:38 +0200 Subject: [PATCH 093/191] OcrdMetsServer.kill_process: try the easy way (SIGINT) then the hard way (SIGKILL) --- src/ocrd/mets_server.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b6a8f140ba..4b4ffa728f 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -430,7 +430,12 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - return os.kill(mets_server_pid, signal.SIGTERM) + os.kill(mets_server_pid, signal.SIGINT) + sleep(3) + try: + os.kill(mets_server_pid, signal.SIGKILL) + except ProcessLookupError as e: + pass def shutdown(self): if self.is_uds: From 3882e7abf397650ece1e36798232cb148922a43d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 2 Oct 2024 15:17:46 +0200 Subject: [PATCH 094/191] fix: add default to page_wise param --- src/ocrd_network/client.py | 2 +- src/ocrd_network/client_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 1521997942..bb7cf4dbf2 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -60,7 +60,7 @@ def send_processing_job_request(self, processor_name: str, req_params: dict) -> return post_ps_processing_request( ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params) - def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool): + def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool = False): return post_ps_workflow_request( ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets, page_wise=page_wise) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 51db2681a6..4eaf4ea95b 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -86,7 +86,7 @@ def post_ps_workflow_request( ps_server_host: str, path_to_wf: str, path_to_mets: str, - page_wise: 
bool, + page_wise: bool = False, ) -> str: request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( From 7b6552b0c7e213fcd0c4d6879c7e65d411445aca Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 13:36:01 +0200 Subject: [PATCH 095/191] previous state --- src/ocrd/mets_server.py | 10 +++++-- src/ocrd_network/processing_server.py | 20 +++++++++++-- src/ocrd_network/runtime_data/deployer.py | 32 ++++++++++++-------- src/ocrd_network/server_utils.py | 36 ++++++++++++++++++++--- src/ocrd_network/utils.py | 4 +-- 5 files changed, 79 insertions(+), 23 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index a8f766289c..4b4ffa728f 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -1,8 +1,10 @@ """ # METS server functionality """ +import os import re from os import _exit, chmod +import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path @@ -428,8 +430,12 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) - return + os.kill(mets_server_pid, signal.SIGINT) + sleep(3) + try: + os.kill(mets_server_pid, signal.SIGKILL) + except ProcessLookupError as e: + pass def shutdown(self): if self.is_uds: diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..50078be377 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -48,6 +48,7 @@ get_workflow_content, get_from_database_workspace, get_from_database_workflow_job, + kill_mets_server_zombies, parse_workflow_tasks, raise_http_exception, request_processor_server_tool_json, @@ -200,6 +201,14 @@ def add_api_routes_others(self): tags=[ServerApiTags.WORKSPACE], summary="Forward a TCP request to UDS mets server" ) + others_router.add_api_route( + path="/kill_mets_server_zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." 
+ ) self.include_router(others_router) def add_api_routes_processing(self): @@ -574,7 +583,7 @@ async def _cancel_cached_dependent_jobs(self, workspace_key: str, job_id: str) - ) async def _consume_cached_jobs_of_workspace( - self, workspace_key: str, mets_server_url: str + self, workspace_key: str, mets_server_url: str, path_to_mets: str ) -> List[PYJobInput]: # Check whether the internal queue for the workspace key still exists @@ -593,7 +602,8 @@ async def _consume_cached_jobs_of_workspace( # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url) + self.deployer.stop_uds_mets_server( + mets_server_url=mets_server_url, path_to_mets=path_to_mets, stop_with_pid=True) try: # The queue is empty - delete it @@ -643,7 +653,7 @@ async def remove_job_from_request_cache(self, result_message: PYResultMessage): raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message, error) consumed_cached_jobs = await self._consume_cached_jobs_of_workspace( - workspace_key=workspace_key, mets_server_url=mets_server_url + workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets ) await self.push_cached_jobs_to_agents(processing_jobs=consumed_cached_jobs) @@ -817,6 +827,10 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response + async def kill_mets_server_zombies(self) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=60) + return pids_killed + async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ Simplified version of the `get_workflow_info` that returns a single state for the entire workflow. diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 90f7c6d5c7..f60194ce4e 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -8,7 +8,6 @@ """ from __future__ import annotations from pathlib import Path -from subprocess import Popen, run as subprocess_run from time import sleep from typing import Dict, List, Union @@ -30,6 +29,8 @@ def __init__(self, config_path: str) -> None: self.data_hosts: List[DataHost] = parse_hosts_data(ps_config["hosts"]) self.internal_callback_url = ps_config.get("internal_callback_url", None) self.mets_servers: Dict = {} # {"mets_server_url": "mets_server_pid"} + # This is required to store UDS urls that are multiplexed through the TCP proxy and are not preserved anywhere + self.mets_servers_paths: Dict = {} # {"ws_dir_path": "mets_server_url"} self.use_tcp_mets = ps_config.get("use_tcp_mets", False) # TODO: Reconsider this. 
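A minimal sketch of invoking the cleanup route registered above, assuming a Processing Server listening on an illustrative address; per the handler, the endpoint responds with the list of PIDs of METS servers created more than 60 minutes ago that were killed:

    from requests import delete as requests_delete

    response = requests_delete("http://localhost:8000/kill_mets_server_zombies")
    assert response.status_code == 200
    print(response.json())  # list of killed METS server PIDs
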
@@ -153,26 +154,33 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) - self.mets_servers[mets_server_url] = pid + self.mets_servers[str(mets_server_url)] = pid + self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url - def stop_uds_mets_server(self, mets_server_url: str, stop_with_pid: bool = False) -> None: + def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_with_pid: bool = False) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") + self.log.info(f"Path to the mets file: {path_to_mets}") + self.log.info(f"mets_server: {self.mets_servers}") + self.log.info(f"mets_server_paths: {self.mets_servers_paths}") if stop_with_pid: - if Path(mets_server_url) not in self.mets_servers: - message = f"UDS Mets server not found at URL: {mets_server_url}" - self.log.exception(message) - raise Exception(message) - mets_server_pid = self.mets_servers[Path(mets_server_url)] + mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] + if Path(mets_server_url_uds) not in self.mets_servers: + message = f"UDS Mets server not found at URL: {mets_server_url_uds}, mets path: {path_to_mets}" + self.log.warning(message) + mets_server_pid = self.mets_servers[str(mets_server_url_uds)] + self.log.info(f"Killing mets server pid: {mets_server_pid} of {mets_server_url_uds}") OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) - if Path(mets_server_url).exists(): - self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url}") - Path(mets_server_url).unlink() + self.log.info(f"Returning after the kill process") + if Path(mets_server_url_uds).exists(): + self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url_uds}") + Path(mets_server_url_uds).unlink() + self.log.info(f"Returning from the stop_uds_mets_server") return # TODO: Reconsider this again # Not having this sleep here causes connection errors # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) - stop_mets_server(mets_server_url=mets_server_url) + stop_mets_server(mets_server_url=mets_server_url, ws_dir_path=Path(path_to_mets).parent) return diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 9d8628170c..773668f5b7 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -1,12 +1,18 @@ +import os +import re +import signal +from pathlib import Path +from json import dumps, loads +from urllib.parse import urljoin +from typing import Dict, List, Union +from time import time + from fastapi import HTTPException, status, UploadFile from fastapi.responses import FileResponse from httpx import AsyncClient, Timeout -from json import dumps, loads from logging import Logger -from pathlib import Path from requests import get as requests_get -from typing import Dict, List, Union -from urllib.parse import urljoin +from requests_unixsocket import sys from ocrd.resolver import Resolver from ocrd.task_sequence import ProcessorTask @@ -241,3 +247,25 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s if group not in available_groups: message = f"Input file group '{group}' of the first processor not found: {input_file_grps}" raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) + + +def kill_mets_server_zombies(minutes_ago=60) -> List[int]: + now = time() + cmdline_pat = r'.*ocrd workspace -U.*server start $' + ret = [] + for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): + if not procdir.is_dir(): + continue + cmdline_file = procdir.joinpath('cmdline') + if not cmdline_file.is_file(): + continue + ctime_ago = int((now - procdir.stat().st_ctime) / 60) + if ctime_ago < minutes_ago: + continue + cmdline = cmdline_file.read_text().replace('\x00', ' ') + if re.match(cmdline_pat, cmdline): + pid = procdir.name + ret.append(pid) + print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) + os.kill(int(pid), signal.SIGTERM) + return ret diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index a2f563de43..13bbee7dbb 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -151,7 +151,7 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(mets_server_url: str, ws_dir_path: Path = None) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" session = Session_TCP() if protocol == "tcp" else Session_UDS() if protocol == "uds": @@ -160,7 +160,7 @@ def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: if 'tcp_mets' in mets_server_url: if not ws_dir_path: return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) + response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(str(ws_dir_path))) else: response = session.delete(url=f"{mets_server_url}/") except Exception: From 637a40e452b981d7cc8b74937bc149a568efcb68 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 13:55:49 +0200 Subject: [PATCH 096/191] do not use pid killing --- src/ocrd_network/processing_server.py | 3 +-- src/ocrd_network/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 
50078be377..edae6733c0 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -602,8 +602,7 @@ async def _consume_cached_jobs_of_workspace( # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - self.deployer.stop_uds_mets_server( - mets_server_url=mets_server_url, path_to_mets=path_to_mets, stop_with_pid=True) + self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets) try: # The queue is empty - delete it diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 13bbee7dbb..a2f563de43 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -151,7 +151,7 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: Path = None) -> bool: +def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" session = Session_TCP() if protocol == "tcp" else Session_UDS() if protocol == "uds": @@ -160,7 +160,7 @@ def stop_mets_server(mets_server_url: str, ws_dir_path: Path = None) -> bool: if 'tcp_mets' in mets_server_url: if not ws_dir_path: return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(str(ws_dir_path))) + response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) else: response = session.delete(url=f"{mets_server_url}/") except Exception: From 387dc3085ebe831fd1beb3937f7a8b4b60197123 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 18:04:26 +0200 Subject: [PATCH 097/191] add logger param to stop mets server --- src/ocrd_network/processing_server.py | 1 - src/ocrd_network/runtime_data/deployer.py | 2 +- src/ocrd_network/utils.py | 11 +++++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index edae6733c0..59243d52fe 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -601,7 +601,6 @@ async def _consume_cached_jobs_of_workspace( # Shut down the Mets Server for the workspace_key since no # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets) try: diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index f60194ce4e..16207154b3 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -182,5 +182,5 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) - stop_mets_server(mets_server_url=mets_server_url, ws_dir_path=Path(path_to_mets).parent) + stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=str(Path(path_to_mets).parent)) return diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index a2f563de43..7747e5ea6f 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -4,6 +4,7 @@ from functools import wraps from hashlib import md5 from json import loads +from logging import Logger from pathlib import Path from re import compile as re_compile, split as re_split from requests import get as requests_get, Session as Session_TCP @@ -151,7 +152,7 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str = None) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" session = Session_TCP() if protocol == "tcp" else Session_UDS() if protocol == "uds": @@ -159,9 +160,15 @@ def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: try: if 'tcp_mets' in mets_server_url: if not ws_dir_path: + logger.warning("Multiplexing through the Processing Server to reach a mets server but no workspace " + "path is specified. There is no way for the Processing Server to know to which Mets " + "Server the incoming requests should be forwarded.") return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) + request_json = MpxReq.stop(ws_dir_path) + logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") + response = session.post(url=f"{mets_server_url}", json=request_json) else: + logger.info(f"Sending DELETE request to: {mets_server_url}/") response = session.delete(url=f"{mets_server_url}/") except Exception: return False From 07953f76042f977a9a60df70da8f30688357bde9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 18:06:21 +0200 Subject: [PATCH 098/191] add extensive logging to mets proxy --- src/ocrd_network/tcp_to_uds_mets_proxy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index 176f4f1442..4fa2f3ea70 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py @@ -34,6 +34,10 @@ def forward_tcp_request(self, request_body) -> Dict: ws_unix_socket_url = f'http+unix://{ws_socket_file.replace("/", "%2F")}' uds_request_url = f"{ws_unix_socket_url}/{request_url}" + self.log.info(f"Forwarding TCP mets server request to UDS url: {uds_request_url}") + self.log.info(f"Forwarding method type {method_type}, request data: {request_data}, " + f"expected response type: {response_type}") + if not request_data: response = self.session.request(method_type, uds_request_url) elif "params" in request_data: From 3a9e1479f722465452d70905466418d09ff2f4f7 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 18:12:13 +0200 Subject: [PATCH 099/191] return empty response type earlier --- src/ocrd_network/tcp_to_uds_mets_proxy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index 4fa2f3ea70..e110978713 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py 
@@ -49,12 +49,11 @@ def forward_tcp_request(self, request_body) -> Dict: else: raise ValueError("Expecting request_data to be empty or containing single key: params," f"form, or class but not {request_data.keys}") - + if response_type == "empty": + return {} if not response: self.log.error(f"Uds-Mets-Server gives unexpected error. Response: {response.__dict__}") return {"error": response.text} - elif response_type == "empty": - return {} elif response_type == "text": return {"text": response.text} elif response_type == "class" or response_type == "dict": From 00655b82f0409b4811324cf40a788e83ae9dd6c8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:02:00 +0200 Subject: [PATCH 100/191] fix: change UDS file deletion place --- src/ocrd/mets_server.py | 11 +++--- src/ocrd_network/tcp_to_uds_mets_proxy.py | 4 +-- src/ocrd_network/utils.py | 42 +++++++++++++---------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 4b4ffa728f..f3dfd5ea64 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -437,11 +437,8 @@ def kill_process(mets_server_pid: int): except ProcessLookupError as e: pass - def shutdown(self): - if self.is_uds: - if Path(self.url).exists(): - self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") - Path(self.url).unlink() + @staticmethod + def shutdown(): # os._exit because uvicorn catches SystemExit raised by sys.exit _exit(0) @@ -472,7 +469,8 @@ def save(): """ Write current changes to the file system """ - return workspace.save_mets() + workspace.save_mets() + return Response(status_code=200, content="The Mets Server is writing changes to disk.") @app.delete(path='/') async def stop(): @@ -482,6 +480,7 @@ async def stop(): getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}') workspace.save_mets() self.shutdown() + return Response(status_code=200, content="The Mets Server is shutting down...") @app.post(path='/reload') async def workspace_reload_mets(): diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index e110978713..3f335435ab 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py @@ -1,5 +1,5 @@ from requests_unixsocket import Session as requests_unixsocket_session -from .utils import get_uds_path +from .utils import get_uds_path, convert_url_to_uds_format from typing import Dict from ocrd_utils import getLogger @@ -31,7 +31,7 @@ def forward_tcp_request(self, request_body) -> Dict: if method_type not in SUPPORTED_METHOD_TYPES: raise NotImplementedError(f"Method type: {method_type} not recognized") ws_socket_file = str(get_uds_path(ws_dir_path=ws_dir_path)) - ws_unix_socket_url = f'http+unix://{ws_socket_file.replace("/", "%2F")}' + ws_unix_socket_url = convert_url_to_uds_format(ws_socket_file) uds_request_url = f"{ws_unix_socket_url}/{request_url}" self.log.info(f"Forwarding TCP mets server request to UDS url: {uds_request_url}") diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 7747e5ea6f..eebb5a3ba1 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -152,28 +152,32 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> bool: protocol = "tcp" if 
(mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" - session = Session_TCP() if protocol == "tcp" else Session_UDS() + # If the mets server URL is the proxy endpoint + if protocol == "tcp" and "tcp_mets" in mets_server_url: + # Convert the mets server url to UDS format + ws_socket_file = str(get_uds_path(ws_dir_path)) + mets_server_url = convert_url_to_uds_format(ws_socket_file) + protocol = "uds" + if protocol == "tcp": + request_json = MpxReq.stop(ws_dir_path) + logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") + response = Session_TCP().post(url=f"{mets_server_url}", json=request_json) + return response.status_code == 200 + elif protocol == "uds": + logger.info(f"Sending DELETE request to: {mets_server_url}/") + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 + else: + ValueError(f"Unexpected protocol type: {protocol}") if protocol == "uds": - mets_server_url = convert_url_to_uds_format(mets_server_url) - try: - if 'tcp_mets' in mets_server_url: - if not ws_dir_path: - logger.warning("Multiplexing through the Processing Server to reach a mets server but no workspace " - "path is specified. There is no way for the Processing Server to know to which Mets " - "Server the incoming requests should be forwarded.") - return False - request_json = MpxReq.stop(ws_dir_path) - logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") - response = session.post(url=f"{mets_server_url}", json=request_json) + ws_socket_file = str(get_uds_path(ws_dir_path)) + if Path(ws_socket_file).exists(): + logger.info(f"Removing the inactive UDS file: {ws_socket_file}") + Path(ws_socket_file).unlink() else: - logger.info(f"Sending DELETE request to: {mets_server_url}/") - response = session.delete(url=f"{mets_server_url}/") - except Exception: - return False - return response.status_code == 200 - + logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") def get_uds_path(ws_dir_path: str) -> Path: return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock") From 810f8111a6a85db465a6becad0ca721d91ed4b73 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:22:50 +0200 Subject: [PATCH 101/191] return response from mets server before dying --- src/ocrd/mets_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index f3dfd5ea64..b5773d978e 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -440,7 +440,8 @@ def kill_process(mets_server_pid: int): @staticmethod def shutdown(): # os._exit because uvicorn catches SystemExit raised by sys.exit - _exit(0) + # _exit(0) + os.kill(os.getpid(), signal.SIGTERM) def startup(self): self.log.info("Starting up METS server") From 4970e6238cd51d03abc358b82cb8b100175061f1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:23:25 +0200 Subject: [PATCH 102/191] fix: remove UDS file correctly --- src/ocrd_network/utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index eebb5a3ba1..3dfa71e5f3 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -167,17 +167,19 @@ def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> return response.status_code == 200 elif protocol == "uds": logger.info(f"Sending DELETE request 
to: {mets_server_url}/") - response = Session_UDS().delete(url=f"{mets_server_url}/") - return response.status_code == 200 + try: + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 + finally: + if protocol == "uds": + ws_socket_file = str(get_uds_path(ws_dir_path)) + if Path(ws_socket_file).exists(): + logger.info(f"Removing the inactive UDS file: {ws_socket_file}") + Path(ws_socket_file).unlink() + else: + logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") else: ValueError(f"Unexpected protocol type: {protocol}") - if protocol == "uds": - ws_socket_file = str(get_uds_path(ws_dir_path)) - if Path(ws_socket_file).exists(): - logger.info(f"Removing the inactive UDS file: {ws_socket_file}") - Path(ws_socket_file).unlink() - else: - logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") def get_uds_path(ws_dir_path: str) -> Path: return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock") From 906766d38f4dcc583511f51fbe6d9b39b48ab74c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 20:33:52 +0200 Subject: [PATCH 103/191] comment out irrelevant code --- src/ocrd/mets_server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b5773d978e..b8bd99b6a3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -444,7 +444,7 @@ def shutdown(): os.kill(os.getpid(), signal.SIGTERM) def startup(self): - self.log.info("Starting up METS server") + self.log.info(f"Starting up METS server: {self.url}") workspace = self.workspace @@ -564,9 +564,12 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + # TODO: Not required after #1284, consider removing + """ if Path(self.url).exists() and not is_socket_in_use(self.url): # remove leftover unused socket which blocks startup Path(self.url).unlink() + """ server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() @@ -581,7 +584,7 @@ async def add_file( self.log.debug("Starting uvicorn") uvicorn.run(app, **uvicorn_kwargs) - +# TODO: Not required after #1284, consider removing def is_socket_in_use(socket_path): if Path(socket_path).exists(): client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) From a87a2e111a681ebed356401e75560edd5cd1ba7b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:04:20 +0200 Subject: [PATCH 104/191] fix: no more zombies, yay! --- src/ocrd_network/runtime_data/deployer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 16207154b3..7aec568071 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -8,6 +8,7 @@ """ from __future__ import annotations from pathlib import Path +import psutil from time import sleep from typing import Dict, List, Union @@ -182,5 +183,13 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) + mets_server_pid = self.mets_servers[str(self.mets_servers_paths[str(Path(path_to_mets).parent)])] + self.log.info(f"Terminating mets server with pid: {mets_server_pid}") + p = psutil.Process(mets_server_pid) stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=str(Path(path_to_mets).parent)) + if p.is_running(): + p.wait() + self.log.info(f"Terminated mets server with pid: {mets_server_pid}") + else: + self.log.info(f"Mets server has already terminated with pid: {mets_server_pid}") return From e0ff4ebd3ea200a73b200375e75a4886eb1941fc Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:37:39 +0200 Subject: [PATCH 105/191] add: extensive logging of mets server to file --- src/ocrd/mets_server.py | 56 ++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b8bd99b6a3..c6448b1d81 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -437,14 +437,15 @@ def kill_process(mets_server_pid: int): except ProcessLookupError as e: pass - @staticmethod - def shutdown(): + def shutdown(self): # os._exit because uvicorn catches SystemExit raised by sys.exit # _exit(0) - os.kill(os.getpid(), signal.SIGTERM) + pid = os.getpid() + self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") + os.kill(pid, signal.SIGTERM) def startup(self): - self.log.info(f"Starting up METS server: {self.url}") + self.log.info(f"Configuring up the Mets Server") workspace = self.workspace @@ -471,17 +472,20 @@ def save(): Write current changes to the file system """ workspace.save_mets() - return Response(status_code=200, content="The Mets Server is writing changes to disk.") + response = Response(content="The Mets Server is writing changes to disk.", media_type='text/plain') + self.log.info(f"PUT / -> {response.__dict__}") + return response @app.delete(path='/') async def stop(): """ Stop the mets server """ - getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}') workspace.save_mets() + response = Response(content="The Mets Server will shut down soon...", media_type='text/plain') self.shutdown() - return Response(status_code=200, content="The Mets Server is shutting down...") + self.log.info(f"POST /reload -> {response.__dict__}") + return response @app.post(path='/reload') async def workspace_reload_mets(): @@ -489,34 +493,48 @@ async def workspace_reload_mets(): Reload mets file from the file system """ workspace.reload_mets() - return Response(content=f'Reloaded from {workspace.directory}', media_type="text/plain") + response = Response(content=f"Reloaded from {workspace.directory}", media_type='text/plain') + self.log.info(f"POST /reload -> {response.__dict__}") + return response @app.get(path='/unique_identifier', response_model=str) async def unique_identifier(): - return Response(content=workspace.mets.unique_identifier, media_type='text/plain') + response = Response(content=workspace.mets.unique_identifier, media_type='text/plain') + self.log.info(f"GET /unique_identifier -> {response.__dict__}") + return response @app.get(path='/workspace_path', response_model=str) async def workspace_path(): - return Response(content=workspace.directory, media_type="text/plain") + response = Response(content=workspace.directory, media_type="text/plain") + self.log.info(f"GET /workspace_path -> {response.__dict__}") + return response @app.get(path='/physical_pages', response_model=OcrdPageListModel) async def 
physical_pages(): - return {'physical_pages': workspace.mets.physical_pages} + response = {'physical_pages': workspace.mets.physical_pages} + self.log.info(f"GET /physical_pages -> {response.__dict__}") + return response @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): - return {'file_groups': workspace.mets.file_groups} + response = {'file_groups': workspace.mets.file_groups} + self.log.info(f"GET /file_groups -> {response.__dict__}") + return response @app.get(path='/agent', response_model=OcrdAgentListModel) async def agents(): - return OcrdAgentListModel.create(workspace.mets.agents) + response = OcrdAgentListModel.create(workspace.mets.agents) + self.log.info(f"GET /agent -> {response.__dict__}") + return response @app.post(path='/agent', response_model=OcrdAgentModel) async def add_agent(agent: OcrdAgentModel): kwargs = agent.dict() kwargs['_type'] = kwargs.pop('type') workspace.mets.add_agent(**kwargs) - return agent + response = agent + self.log.info(f"POST /agent -> {response.__dict__}") + return response @app.get(path="/file", response_model=OcrdFileListModel) async def find_files( @@ -533,7 +551,9 @@ async def find_files( found = workspace.mets.find_all_files( fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, local_filename=local_filename, url=url ) - return OcrdFileListModel.create(found) + response = OcrdFileListModel.create(found) + self.log.info(f"GET /file -> {response.__dict__}") + return response @app.post(path='/file', response_model=OcrdFileModel) async def add_file( @@ -556,7 +576,9 @@ async def add_file( # Add to workspace kwargs = file_resource.dict() workspace.add_file(**kwargs, force=force) - return file_resource + response = file_resource + self.log.info(f"POST /file -> {response.__dict__}") + return response # ------------- # @@ -581,7 +603,7 @@ async def add_file( uvicorn_kwargs['log_config'] = None uvicorn_kwargs['access_log'] = False - self.log.debug("Starting uvicorn") + self.log.info("Starting the uvicorn Mets Server") uvicorn.run(app, **uvicorn_kwargs) # TODO: Not required after #1284, consider removing From 53c8f3f5ed2f3acb4d63eee01c2570801a8178ee Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:44:07 +0200 Subject: [PATCH 106/191] change cache debug -> info for extensive logging to file --- src/ocrd_network/server_cache.py | 45 ++++++++++++++++---------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index b57f3fd235..78e53bd238 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -31,7 +31,7 @@ def check_if_locked_pages_for_output_file_grps( self, workspace_key: str, output_file_grps: List[str], page_ids: List[str] ) -> bool: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return False debug_message = f"Caching the received request due to locked output file grp pages." 
for file_group in output_file_grps: @@ -46,46 +46,45 @@ def check_if_locked_pages_for_output_file_grps( def get_locked_pages(self, workspace_key: str) -> Dict[str, List[str]]: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No locked pages available for workspace key: {workspace_key}") + self.log.info(f"No locked pages available for workspace key: {workspace_key}") return {} return self.locked_pages[workspace_key] def lock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") - self.log.debug(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}") self.locked_pages[workspace_key] = {} for file_group in output_file_grps: if file_group not in self.locked_pages[workspace_key]: - self.log.debug(f"Creating an empty list for output file grp: {file_group}") + self.log.info(f"Creating an empty list for output file grp: {file_group}") self.locked_pages[workspace_key][file_group] = [] # The page id list is not empty - only some pages are in the request if page_ids: - self.log.debug(f"Locking pages for '{file_group}': {page_ids}") + self.log.info(f"Locking pages for '{file_group}': {page_ids}") self.locked_pages[workspace_key][file_group].extend(page_ids) - self.log.debug(f"Locked pages of '{file_group}': " - f"{self.locked_pages[workspace_key][file_group]}") + self.log.info(f"Locked pages of '{file_group}': {self.locked_pages[workspace_key][file_group]}") else: # Lock all pages with a single value - self.log.debug(f"Locking pages for '{file_group}': {self.placeholder_all_pages}") + self.log.info(f"Locking pages for '{file_group}': {self.placeholder_all_pages}") self.locked_pages[workspace_key][file_group].append(self.placeholder_all_pages) def unlock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return for file_group in output_file_grps: if file_group in self.locked_pages[workspace_key]: if page_ids: # Unlock the previously locked pages - self.log.debug(f"Unlocking pages of '{file_group}': {page_ids}") + self.log.info(f"Unlocking pages of '{file_group}': {page_ids}") self.locked_pages[workspace_key][file_group] = \ [x for x in self.locked_pages[workspace_key][file_group] if x not in page_ids] - self.log.debug(f"Remaining locked pages of '{file_group}': " - f"{self.locked_pages[workspace_key][file_group]}") + self.log.info(f"Remaining locked pages of '{file_group}': " + f"{self.locked_pages[workspace_key][file_group]}") else: # Remove the single variable used to indicate all pages are locked - self.log.debug(f"Unlocking all pages for: {file_group}") + self.log.info(f"Unlocking all pages for: {file_group}") self.locked_pages[workspace_key][file_group].remove(self.placeholder_all_pages) @@ -127,11 +126,11 @@ def __print_job_input_debug_message(self, job_input: PYJobInput): debug_message += f", page ids: {job_input.page_id}" debug_message += f", job id: {job_input.job_id}" debug_message += f", job 
depends on: {job_input.depends_on}" - self.log.debug(debug_message) + self.log.info(debug_message) async def consume_cached_requests(self, workspace_key: str) -> List[PYJobInput]: if not self.has_workspace_cached_requests(workspace_key=workspace_key): - self.log.debug(f"No jobs to be consumed for workspace key: {workspace_key}") + self.log.info(f"No jobs to be consumed for workspace key: {workspace_key}") return [] found_consume_requests = [] for current_element in self.processing_requests[workspace_key]: @@ -165,7 +164,7 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: # If a record counter of this workspace key does not exist # in the requests counter cache yet, create one and assign 0 if not self.processing_counter.get(workspace_key, None): - self.log.debug(f"Creating an internal request counter for workspace key: {workspace_key}") + self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}") self.processing_counter[workspace_key] = 0 self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value return self.processing_counter[workspace_key] @@ -173,7 +172,7 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: def cache_request(self, workspace_key: str, data: PYJobInput): # If a record queue of this workspace key does not exist in the requests cache if not self.processing_requests.get(workspace_key, None): - self.log.debug(f"Creating an internal request queue for workspace_key: {workspace_key}") + self.log.info(f"Creating an internal request queue for workspace_key: {workspace_key}") self.processing_requests[workspace_key] = [] self.__print_job_input_debug_message(job_input=data) # Add the processing request to the end of the internal queue @@ -181,9 +180,9 @@ def cache_request(self, workspace_key: str, data: PYJobInput): async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]: if not self.has_workspace_cached_requests(workspace_key=workspace_key): - self.log.debug(f"No jobs to be cancelled for workspace key: {workspace_key}") + self.log.info(f"No jobs to be cancelled for workspace key: {workspace_key}") return [] - self.log.debug(f"Cancelling jobs dependent on job id: {processing_job_id}") + self.log.info(f"Cancelling jobs dependent on job id: {processing_job_id}") found_cancel_requests = [] for i, current_element in enumerate(self.processing_requests[workspace_key]): if processing_job_id in current_element.depends_on: @@ -192,7 +191,7 @@ async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str for cancel_element in found_cancel_requests: try: self.processing_requests[workspace_key].remove(cancel_element) - self.log.debug(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'") + self.log.info(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'") cancelled_jobs.append(cancel_element) await db_update_processing_job(job_id=cancel_element.job_id, state=JobState.cancelled) # Recursively cancel dependent jobs for the cancelled job @@ -225,9 +224,9 @@ async def sync_is_caching_required(self, job_dependencies: List[str]) -> bool: def has_workspace_cached_requests(self, workspace_key: str) -> bool: if not self.processing_requests.get(workspace_key, None): - self.log.debug(f"In processing requests cache, no workspace key found: {workspace_key}") + self.log.info(f"In processing requests cache, no workspace key found: {workspace_key}") return False if not 
len(self.processing_requests[workspace_key]): - self.log.debug(f"The processing requests cache is empty for workspace key: {workspace_key}") + self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}") return False return True From fe41223efe29bfeb6bb7e58d3c69db4e14a6f248 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:48:25 +0200 Subject: [PATCH 107/191] set log from info to debug --- src/ocrd_network/runtime_data/deployer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 7aec568071..aa7ff5eb05 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -162,8 +162,8 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_with_pid: bool = False) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") self.log.info(f"Path to the mets file: {path_to_mets}") - self.log.info(f"mets_server: {self.mets_servers}") - self.log.info(f"mets_server_paths: {self.mets_servers_paths}") + self.log.debug(f"mets_server: {self.mets_servers}") + self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") if stop_with_pid: mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] if Path(mets_server_url_uds) not in self.mets_servers: From 55c2f6357f1b83508b3e2eb305bdb9e65afb4fa2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 22:56:36 +0200 Subject: [PATCH 108/191] fix: typo --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index c6448b1d81..d2e0bb51e0 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -484,7 +484,7 @@ async def stop(): workspace.save_mets() response = Response(content="The Mets Server will shut down soon...", media_type='text/plain') self.shutdown() - self.log.info(f"POST /reload -> {response.__dict__}") + self.log.info(f"DELETE / -> {response.__dict__}") return response @app.post(path='/reload') From bf6616f1821e33fcda2376338d7419f8fba73a04 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 23:04:53 +0200 Subject: [PATCH 109/191] improve: delete socket file more appropriately --- src/ocrd/mets_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d2e0bb51e0..57db0e4653 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -443,6 +443,10 @@ def shutdown(self): pid = os.getpid() self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") os.kill(pid, signal.SIGTERM) + if self.is_uds: + if Path(self.url).exists(): + self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") + Path(self.url).unlink() def startup(self): self.log.info(f"Configuring up the Mets Server") From bc8a03bd8f8771790d14e51ec054d70b476454f6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 23:07:54 +0200 Subject: [PATCH 110/191] remove: unnecessary code --- src/ocrd_network/utils.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 3dfa71e5f3..5abe2104fd 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -167,17 +167,8 @@ def stop_mets_server(logger: Logger, 
mets_server_url: str, ws_dir_path: str) -> return response.status_code == 200 elif protocol == "uds": logger.info(f"Sending DELETE request to: {mets_server_url}/") - try: - response = Session_UDS().delete(url=f"{mets_server_url}/") - return response.status_code == 200 - finally: - if protocol == "uds": - ws_socket_file = str(get_uds_path(ws_dir_path)) - if Path(ws_socket_file).exists(): - logger.info(f"Removing the inactive UDS file: {ws_socket_file}") - Path(ws_socket_file).unlink() - else: - logger.warning(f"The UDS file to be removed is not existing: {ws_socket_file}") + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 else: ValueError(f"Unexpected protocol type: {protocol}") From 303488a5aa6d698f844e66107cb393be29ff1c14 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 4 Oct 2024 23:17:02 +0200 Subject: [PATCH 111/191] fix: .__dict__ of {} --- src/ocrd/mets_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 57db0e4653..b442e03bc1 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -516,13 +516,13 @@ async def workspace_path(): @app.get(path='/physical_pages', response_model=OcrdPageListModel) async def physical_pages(): response = {'physical_pages': workspace.mets.physical_pages} - self.log.info(f"GET /physical_pages -> {response.__dict__}") + self.log.info(f"GET /physical_pages -> {response}") return response @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): response = {'file_groups': workspace.mets.file_groups} - self.log.info(f"GET /file_groups -> {response.__dict__}") + self.log.info(f"GET /file_groups -> {response}") return response @app.get(path='/agent', response_model=OcrdAgentListModel) From c8e0c731f9180bd7f9b939c21b6cb856a655cd3a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:23:49 +0200 Subject: [PATCH 112/191] Update src/ocrd/mets_server.py Co-authored-by: Konstantin Baierer --- src/ocrd/mets_server.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index b442e03bc1..d7b416af66 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -438,8 +438,6 @@ def kill_process(mets_server_pid: int): pass def shutdown(self): - # os._exit because uvicorn catches SystemExit raised by sys.exit - # _exit(0) pid = os.getpid() self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") os.kill(pid, signal.SIGTERM) From 2cd4a64adc7103a1a996f686a01bcae23ccdd343 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:24:08 +0200 Subject: [PATCH 113/191] Update src/ocrd/mets_server.py Co-authored-by: Konstantin Baierer --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index d7b416af66..261b695a14 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -447,7 +447,7 @@ def shutdown(self): Path(self.url).unlink() def startup(self): - self.log.info(f"Configuring up the Mets Server") + self.log.info(f"Configuring the Mets Server") workspace = self.workspace From 44a8cebfb91de97fc4bc9ea9910ae7ba01243e5c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:24:26 +0200 Subject: [PATCH 114/191] Update src/ocrd/mets_server.py Co-authored-by: Konstantin Baierer --- src/ocrd/mets_server.py | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 261b695a14..e45f48cef3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -588,12 +588,6 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - # TODO: Not required after #1284, consider removing - """ - if Path(self.url).exists() and not is_socket_in_use(self.url): - # remove leftover unused socket which blocks startup - Path(self.url).unlink() - """ server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() From 61c683f4c24330ae0397ad4baa7e21066473c9cb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:24:37 +0200 Subject: [PATCH 115/191] Update src/ocrd_network/runtime_data/deployer.py Co-authored-by: Konstantin Baierer --- src/ocrd_network/runtime_data/deployer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index aa7ff5eb05..57b6d90819 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -191,5 +191,5 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit p.wait() self.log.info(f"Terminated mets server with pid: {mets_server_pid}") else: - self.log.info(f"Mets server has already terminated with pid: {mets_server_pid}") + self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.") return From 50553093180ac6b641273c08d559bbadf5e9b1d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 10:25:41 +0200 Subject: [PATCH 116/191] remove unnecessary method --- src/ocrd/mets_server.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index e45f48cef3..9fb39861e3 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -601,14 +601,3 @@ async def add_file( self.log.info("Starting the uvicorn Mets Server") uvicorn.run(app, **uvicorn_kwargs) - -# TODO: Not required after #1284, consider removing -def is_socket_in_use(socket_path): - if Path(socket_path).exists(): - client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - try: - client.connect(socket_path) - except OSError: - return False - client.close() - return True From 34bfbf432d042fbdbc676aff233ed708cbcdab62 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 14:40:58 +0200 Subject: [PATCH 117/191] fix: make stop() and ..reload..() sync --- src/ocrd/mets_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 9fb39861e3..774560a197 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -479,7 +479,7 @@ def save(): return response @app.delete(path='/') - async def stop(): + def stop(): """ Stop the mets server """ @@ -490,7 +490,7 @@ async def stop(): return response @app.post(path='/reload') - async def workspace_reload_mets(): + def workspace_reload_mets(): """ Reload mets file from the file system """ From ab660fbd0ff771c3e21af38185db331d2bf4121d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 15:11:29 +0200 Subject: [PATCH 118/191] fix: stop mets server when no cached requests --- src/ocrd_network/processing_server.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git 
a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 59243d52fe..0431cf21f0 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -585,18 +585,13 @@ async def _cancel_cached_dependent_jobs(self, workspace_key: str, job_id: str) - async def _consume_cached_jobs_of_workspace( self, workspace_key: str, mets_server_url: str, path_to_mets: str ) -> List[PYJobInput]: - - # Check whether the internal queue for the workspace key still exists - if workspace_key not in self.cache_processing_requests.processing_requests: - self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") - return [] - # decrease the internal cache counter by 1 request_counter = self.cache_processing_requests.update_request_counter( workspace_key=workspace_key, by_value=-1 ) self.log.debug(f"Internal processing job cache counter value: {request_counter}") - if not len(self.cache_processing_requests.processing_requests[workspace_key]): + if (workspace_key not in self.cache_processing_requests.processing_requests or + not len(self.cache_processing_requests.processing_requests[workspace_key])): if request_counter <= 0: # Shut down the Mets Server for the workspace_key since no # more internal callbacks are expected for that workspace @@ -617,6 +612,10 @@ async def _consume_cached_jobs_of_workspace( else: self.log.debug(f"Internal request cache is empty but waiting for {request_counter} result callbacks.") return [] + # Check whether the internal queue for the workspace key still exists + if workspace_key not in self.cache_processing_requests.processing_requests: + self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") + return [] consumed_requests = await self.cache_processing_requests.consume_cached_requests(workspace_key=workspace_key) return consumed_requests From 148f8d42d2910547fd8397b5b2cfbab7e80853b8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 15:55:39 +0200 Subject: [PATCH 119/191] clean: remove pid kill flag in stop mets server --- src/ocrd_network/runtime_data/deployer.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 57b6d90819..2a01c2231b 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -159,25 +159,11 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url - def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_with_pid: bool = False) -> None: + def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") self.log.info(f"Path to the mets file: {path_to_mets}") self.log.debug(f"mets_server: {self.mets_servers}") self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") - if stop_with_pid: - mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] - if Path(mets_server_url_uds) not in self.mets_servers: - message = f"UDS Mets server not found at URL: {mets_server_url_uds}, mets path: {path_to_mets}" - self.log.warning(message) - mets_server_pid = self.mets_servers[str(mets_server_url_uds)] - self.log.info(f"Killing mets server pid: {mets_server_pid} of {mets_server_url_uds}") - OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) - 
self.log.info(f"Returning after the kill process") - if Path(mets_server_url_uds).exists(): - self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url_uds}") - Path(mets_server_url_uds).unlink() - self.log.info(f"Returning from the stop_uds_mets_server") - return # TODO: Reconsider this again # Not having this sleep here causes connection errors # on the last request processed by the processing worker. From dacd32517b7b2cdbc0417dffcab84b5d8710635c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 16:29:31 +0200 Subject: [PATCH 120/191] extend log: server cache requests --- src/ocrd_network/server_cache.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index 78e53bd238..179a76139d 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -167,6 +167,7 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}") self.processing_counter[workspace_key] = 0 self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value + self.log.info(f"The new request counter of {workspace_key}: {self.processing_counter[workspace_key]}") return self.processing_counter[workspace_key] def cache_request(self, workspace_key: str, data: PYJobInput): @@ -176,6 +177,7 @@ def cache_request(self, workspace_key: str, data: PYJobInput): self.processing_requests[workspace_key] = [] self.__print_job_input_debug_message(job_input=data) # Add the processing request to the end of the internal queue + self.log.info(f"Caching a processing request of {workspace_key}: {data.job_id}") self.processing_requests[workspace_key].append(data) async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]: @@ -229,4 +231,6 @@ def has_workspace_cached_requests(self, workspace_key: str) -> bool: if not len(self.processing_requests[workspace_key]): self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}") return False + self.log.info(f"The processing requests cache has {len(self.processing_requests[workspace_key])} " + f"entries for workspace key: {workspace_key} ") return True From 05ded73dcff81aada33be32a0db976c33d0a84d1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 8 Oct 2024 16:39:14 +0200 Subject: [PATCH 121/191] improve: sleep no longer needed --- src/ocrd_network/runtime_data/deployer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 2a01c2231b..c35d94166b 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -164,11 +164,6 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None: self.log.info(f"Path to the mets file: {path_to_mets}") self.log.debug(f"mets_server: {self.mets_servers}") self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") - # TODO: Reconsider this again - # Not having this sleep here causes connection errors - # on the last request processed by the processing worker. - # Sometimes 3 seconds is enough, sometimes not. 
- sleep(5) mets_server_pid = self.mets_servers[str(self.mets_servers_paths[str(Path(path_to_mets).parent)])] self.log.info(f"Terminating mets server with pid: {mets_server_pid}") p = psutil.Process(mets_server_pid) From 5d755a8fb7b77d94a052f10e73c7a98ecb098a0d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 9 Oct 2024 09:17:41 +0200 Subject: [PATCH 122/191] add new env: OCRD_NETWORK_RABBITMQ_HEARTBEAT --- src/ocrd/cli/__init__.py | 2 ++ src/ocrd_network/rabbitmq_utils/connector.py | 4 ++-- src/ocrd_utils/config.py | 16 ++++++++++++++-- tests/network/config.py | 14 ++++++++++++-- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 70d738f083..863b9af0d7 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -47,6 +47,8 @@ \b {config.describe('OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS')} \b +{config.describe('OCRD_NETWORK_RABBITMQ_HEARTBEAT')} +\b {config.describe('OCRD_PROFILE_FILE')} \b {config.describe('OCRD_PROFILE', wrap_text=False)} diff --git a/src/ocrd_network/rabbitmq_utils/connector.py b/src/ocrd_network/rabbitmq_utils/connector.py index 893d55a219..8fbbc84ab9 100644 --- a/src/ocrd_network/rabbitmq_utils/connector.py +++ b/src/ocrd_network/rabbitmq_utils/connector.py @@ -6,6 +6,7 @@ from typing import Any, Optional, Union from pika import BasicProperties, BlockingConnection, ConnectionParameters, PlainCredentials from pika.adapters.blocking_connection import BlockingChannel +from ocrd_utils import config from .constants import ( DEFAULT_EXCHANGER_NAME, DEFAULT_EXCHANGER_TYPE, @@ -69,8 +70,7 @@ def open_blocking_connection( port=port, virtual_host=vhost, credentials=credentials, - # TODO: The heartbeat should not be disabled (0)! - heartbeat=0 + heartbeat=config.OCRD_NETWORK_RABBITMQ_HEARTBEAT ), ) return blocking_connection diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d2cc4efce1..86f3200dd0 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -176,9 +176,21 @@ def _ocrd_download_timeout_parser(val): default=(True, '')) config.add("OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", - description="Number of attempts for a RabbitMQ client to connect before failing.", + description="Number of attempts for a RabbitMQ client to connect before failing.", + parser=int, + default=(True, 3)) + +config.add( + name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", + description=""" + Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable + is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its + arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. 
+ """, parser=int, - default=(True, 3)) + default=(True, 0) +) config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR", description="The root directory where all mets server related socket files are created", diff --git a/tests/network/config.py b/tests/network/config.py index e22cc6ce9d..c316202f1c 100644 --- a/tests/network/config.py +++ b/tests/network/config.py @@ -89,11 +89,21 @@ test_config.add( name="OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", + description="Number of attempts for a RabbitMQ client to connect before failing", + parser=int, + default=(True, 3) +) + +test_config.add( + name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Number of attempts for a RabbitMQ client to connect before failing + Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable + is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its + arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. """, parser=int, - default=(True, 3) + default=(True, 0) ) test_config.add( From a295b0c29d2951c4e5f1a0bb572fd060fc12a5a7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:32:57 +0200 Subject: [PATCH 123/191] deps-torch: also install torchvision --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b5cd2f276e..26524a9fa6 100644 --- a/Makefile +++ b/Makefile @@ -158,7 +158,7 @@ deps-tf2: fi deps-torch: - $(PIP) install -i https://download.pytorch.org/whl/cu118 torch + $(PIP) install -i https://download.pytorch.org/whl/cu118 torch torchvision # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: From c5c60fde3c3879a3572772843ba583af4b22065d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 9 Oct 2024 17:08:26 +0200 Subject: [PATCH 124/191] fix: empty -> text --- src/ocrd/mets_server.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 774560a197..f54d0672c6 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -157,13 +157,13 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.request("PUT", url=self.url) + return self.session.request("PUT", url=self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.save(self.ws_dir_path) - ) + ).json()["text"] def stop(self): """ @@ -171,14 +171,13 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.request("DELETE", self.url) - return + return self.session.request("DELETE", self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.stop(self.ws_dir_path) - ) + ).json()["text"] except ConnectionError: # Expected because we exit the process without returning pass @@ -348,12 +347,12 @@ def __args_wrapper( @staticmethod def save(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="PUT", response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="PUT", response_type="text", request_url="", request_data={}) @staticmethod def stop(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="DELETE", 
response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="DELETE", response_type="text", request_url="", request_data={}) @staticmethod def reload(ws_dir_path: str) -> Dict: From e1b97840a6a7d45b4d5b70501d04349ed975e612 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 9 Oct 2024 17:36:31 +0200 Subject: [PATCH 125/191] deployer: remove METS Server path and url from their resp. caches on stopping --- src/ocrd_network/runtime_data/deployer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 57b6d90819..eae0cd21d3 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -154,7 +154,7 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: "Removing to avoid any weird behavior before starting the server.") Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") - pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) + pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), ws_dir_path=str(ws_dir_path), log_file=str(log_file)) self.mets_servers[str(mets_server_url)] = pid self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url @@ -164,8 +164,9 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit self.log.info(f"Path to the mets file: {path_to_mets}") self.log.debug(f"mets_server: {self.mets_servers}") self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") + workspace_path = str(Path(path_to_mets).parent) + mets_server_url_uds = self.mets_servers_paths[workspace_path] if stop_with_pid: - mets_server_url_uds = self.mets_servers_paths[str(Path(path_to_mets).parent)] if Path(mets_server_url_uds) not in self.mets_servers: message = f"UDS Mets server not found at URL: {mets_server_url_uds}, mets path: {path_to_mets}" self.log.warning(message) @@ -176,6 +177,8 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit if Path(mets_server_url_uds).exists(): self.log.warning(f"Deployer is removing the existing UDS socket file: {mets_server_url_uds}") Path(mets_server_url_uds).unlink() + del self.mets_servers_paths[workspace_path] + del self.mets_servers[mets_server_url_uds] self.log.info(f"Returning from the stop_uds_mets_server") return # TODO: Reconsider this again @@ -183,13 +186,15 @@ def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str, stop_wit # on the last request processed by the processing worker. # Sometimes 3 seconds is enough, sometimes not. 
sleep(5) - mets_server_pid = self.mets_servers[str(self.mets_servers_paths[str(Path(path_to_mets).parent)])] + mets_server_pid = self.mets_servers[mets_server_url_uds] self.log.info(f"Terminating mets server with pid: {mets_server_pid}") p = psutil.Process(mets_server_pid) - stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=str(Path(path_to_mets).parent)) + stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=workspace_path) if p.is_running(): p.wait() self.log.info(f"Terminated mets server with pid: {mets_server_pid}") else: self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.") + del self.mets_servers_paths[workspace_path] + del self.mets_servers[mets_server_url_uds] return From 31a8474e884812eae614916fcb3e878aa443995a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Oct 2024 16:34:39 +0000 Subject: [PATCH 126/191] ocrd_utils.initLogging: also add handler to root logger (to be consistent with file config and prevent imported libraries from initing logging first), but disable propagation for ocrd loggers (to avoid duplication) --- src/ocrd_utils/logging.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 181805118d..dfac74988b 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -48,6 +48,7 @@ # These are the loggers we add handlers to ROOT_OCRD_LOGGERS = [ + '', 'ocrd', 'ocrd_network' ] @@ -191,7 +192,10 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) for logger_name in ROOT_OCRD_LOGGERS: - logging.getLogger(logger_name).addHandler(ocrd_handler) + logger = logging.getLogger(logger_name) + logger.addHandler(ocrd_handler) + if logger_name: + logger.propagate = False # avoid duplication (from root handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True @@ -210,7 +214,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS + ['']: + for logger_name in ROOT_OCRD_LOGGERS: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: From 50fc15246be1721d91c5161b5d9731e7f31ed859 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Oct 2024 16:34:39 +0000 Subject: [PATCH 127/191] ocrd_utils.initLogging: also add handler to root logger (to be consistent with file config and prevent imported libraries from initing logging first), but disable propagation for ocrd loggers (to avoid duplication) --- src/ocrd_utils/logging.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index ac2b3416a4..9c9ea73e01 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -50,6 +50,7 @@ # These are the loggers we add handlers to ROOT_OCRD_LOGGERS = [ + '', 'ocrd', 'ocrd_network' ] @@ -193,7 +194,10 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) for logger_name in ROOT_OCRD_LOGGERS: - 
logging.getLogger(logger_name).addHandler(ocrd_handler) + logger = logging.getLogger(logger_name) + logger.addHandler(ocrd_handler) + if logger_name: + logger.propagate = False # avoid duplication (from root handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True @@ -211,8 +215,8 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): _initialized_flag = False # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) - # remove all handlers for the 'ocrd.' and root logger - for logger_name in ROOT_OCRD_LOGGERS + ['']: + # remove all handlers for the ocrd logger + for logger_name in ROOT_OCRD_LOGGERS: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: From d7049b1bffb185723124028882ac0e5d88bfabba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 01:03:46 +0000 Subject: [PATCH 128/191] CLI decorator: only import ocrd_network when needed --- src/ocrd/decorators/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index f52a13575b..f659bf58a0 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -13,7 +13,6 @@ redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator -from ocrd_network import ProcessingWorker, ProcessorServer, AgentType from ..resolver import Resolver from ..processor.base import ResourceNotFoundError, run_processor @@ -23,8 +22,6 @@ from .ocrd_cli_options import ocrd_cli_options from .mets_find_options import mets_find_options -SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] - def ocrd_cli_wrap_processor( processorClass, @@ -88,11 +85,9 @@ def ocrd_cli_wrap_processor( if list_resources: processor.list_resources() sys.exit() - if subcommand: + if subcommand or address or queue or database: # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - elif address or queue or database: - raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") # from here: single-run processing context initLogging() @@ -162,6 +157,11 @@ def goexit(): def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): """ """ + from ocrd_network import ProcessingWorker, ProcessorServer, AgentType + SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] + + if not subcommand: + raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") if subcommand not in SUBCOMMANDS: raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}") From d39c3d716917239f2db25550f0be3f5c48ae2768 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:15:43 +0200 Subject: [PATCH 129/191] kill_mets_server_zombies: actually return List[int] --- src/ocrd_network/server_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 773668f5b7..2560dbbb03 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -264,8 +264,8 @@ def kill_mets_server_zombies(minutes_ago=60) -> List[int]: continue cmdline = 
cmdline_file.read_text().replace('\x00', ' ') if re.match(cmdline_pat, cmdline): - pid = procdir.name + pid = int(procdir.name) ret.append(pid) print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) - os.kill(int(pid), signal.SIGTERM) + os.kill(pid, signal.SIGTERM) return ret From 7512bd68f1b2e06ad8a62603c222b10624988a7f Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:16:21 +0200 Subject: [PATCH 130/191] kill_mets_server_zombies: allow dry_run to test --- src/ocrd_network/processing_server.py | 6 +++--- src/ocrd_network/server_utils.py | 14 +++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 505e106ba2..336d04f0d9 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -1,7 +1,7 @@ from datetime import datetime from os import getpid from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from uvicorn import run as uvicorn_run from fastapi import APIRouter, FastAPI, File, HTTPException, Request, status, UploadFile @@ -826,8 +826,8 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response - async def kill_mets_server_zombies(self) -> List[int]: - pids_killed = kill_mets_server_zombies(minutes_ago=60) + async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run) return pids_killed async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 2560dbbb03..6e485f261f 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -4,7 +4,7 @@ from pathlib import Path from json import dumps, loads from urllib.parse import urljoin -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from time import time from fastapi import HTTPException, status, UploadFile @@ -249,7 +249,12 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago=60) -> List[int]: +def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[bool]) -> List[int]: + if minutes_ago == None: + minutes_ago = 90 + if dry_run == None: + dry_run = False + now = time() cmdline_pat = r'.*ocrd workspace -U.*server start $' ret = [] @@ -267,5 +272,8 @@ def kill_mets_server_zombies(minutes_ago=60) -> List[int]: pid = int(procdir.name) ret.append(pid) print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) - os.kill(pid, signal.SIGTERM) + if dry_run: + print(f'[dry_run is active] kill {pid}') + else: + os.kill(pid, signal.SIGTERM) return ret From e40ed798fe462468635161433ee4cb55574c9d5a Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:35:14 +0200 Subject: [PATCH 131/191] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34ec973570..80868a6eb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ 
Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Added: + + - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 + - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277 + ## [2.69.0] - 2024-09-30 Fixed: From 7f605591ac373664cc225634e22877797fcffb40 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 10 Oct 2024 12:41:57 +0200 Subject: [PATCH 132/191] Simplify description for OCRD_NETWORK_RABBITMQ_HEARTBEAT --- src/ocrd_utils/config.py | 6 ++---- tests/network/config.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 86f3200dd0..f191389799 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -183,10 +183,8 @@ def _ocrd_download_timeout_parser(val): config.add( name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value - proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable - is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its - arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. + Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeat. """, parser=int, default=(True, 0) diff --git a/tests/network/config.py b/tests/network/config.py index c316202f1c..611ad63821 100644 --- a/tests/network/config.py +++ b/tests/network/config.py @@ -97,10 +97,8 @@ test_config.add( name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Controls AMQP heartbeat timeout negotiation during connection tuning. An integer value always overrides the value - proposed by broker. Use 0 to deactivate heartbeats and None to always accept the broker's proposal. If a callable - is given, it will be called with the connection instance and the heartbeat timeout proposed by broker as its - arguments. The callback should return a non-negative integer that will be used to override the broker's proposal. + Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeat. 
""", parser=int, default=(True, 0) From 02c6effb2626ac585760dee4fbaa998fbdb01df1 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 12:58:36 +0200 Subject: [PATCH 133/191] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80868a6eb5..fe8b5508d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ Added: - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277 +Fixed: + + - `ocrd/core-cuda-torch`: Install torchvision as well, #1286 + ## [2.69.0] - 2024-09-30 Fixed: From 88707ca9a8646cddcc23d10e3eee5a9fcde38280 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 13:09:23 +0200 Subject: [PATCH 134/191] :memo: changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe8b5508d1..8b8e66fd0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,17 @@ Added: - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277 + - No more zombie METS Server by properly shutting them down, #1284 + - `OCRD_NETWORK_RABBITMQ_HEARBEAT` to allow overriding the [heartbeat](https://pika.readthedocs.io/en/stable/examples/heartbeat_and_blocked_timeouts.html) behavior of RabbitMQ, #1285 + +Changed: + + - significantly more detailed logging for the METS Server and Processing Server, #1284 Fixed: - `ocrd/core-cuda-torch`: Install torchvision as well, #1286 + - Processing Server: remove shut down METS servers from deployer's cache, #1287 ## [2.69.0] - 2024-09-30 From cb8d7874806b489deca04f2bbe7215bc82cbb974 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 01:03:46 +0000 Subject: [PATCH 135/191] CLI decorator: only import ocrd_network when needed --- src/ocrd/decorators/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 464bb67ed8..bc969b3279 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -13,7 +13,6 @@ redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator -from ocrd_network import ProcessingWorker, ProcessorServer, AgentType from ..resolver import Resolver from ..processor.base import ResourceNotFoundError, run_processor @@ -23,8 +22,6 @@ from .ocrd_cli_options import ocrd_cli_options from .mets_find_options import mets_find_options -SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER] - def ocrd_cli_wrap_processor( processorClass, @@ -66,11 +63,9 @@ def ocrd_cli_wrap_processor( list_resources=list_resources ) sys.exit() - if subcommand: + if subcommand or address or queue or database: # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - elif address or queue or database: - raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") initLogging() @@ -164,6 +159,11 @@ def goexit(): def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): """ """ + from ocrd_network import ProcessingWorker, ProcessorServer, AgentType + SUBCOMMANDS = [AgentType.PROCESSING_WORKER, 
AgentType.PROCESSOR_SERVER] + + if not subcommand: + raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}") if subcommand not in SUBCOMMANDS: raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}") From 94e6d2c63351b55ee7c44bfa68ed6b36ef958ab8 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 13:10:08 +0200 Subject: [PATCH 136/191] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b8e66fd0c..6fc7bbb1aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Fixed: - `ocrd/core-cuda-torch`: Install torchvision as well, #1286 - Processing Server: remove shut down METS servers from deployer's cache, #1287 + - typos, #1274 ## [2.69.0] - 2024-09-30 From e5cdbe930dc6e7d4e8873e838617c913d2ab1ed2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:22:05 +0200 Subject: [PATCH 137/191] deps-cuda: retry if micromamba is unresponsive --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9ad35e1c20..23a6b438a5 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ deps-cuda: CONDA_EXE ?= /usr/local/bin/conda deps-cuda: export CONDA_PREFIX ?= /conda deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])' deps-cuda: - curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + curl --retry 3 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba mv bin/micromamba $(CONDA_EXE) # Install Conda system-wide (for interactive / login shells) echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh From 80c0c6f7e7c5c5de8807fe641e4bb452c68cd501 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 15:17:58 +0200 Subject: [PATCH 138/191] :memo: changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fc7bbb1aa..d2912a303d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ Added: Changed: - significantly more detailed logging for the METS Server and Processing Server, #1284 + - Only import `ocrd_network` in src/ocrd/decorators/__init__.py once needed, #1289 + + Fixed: From 7b1d17296231a14fb160f9638c7d5da05217298d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:05:55 +0200 Subject: [PATCH 139/191] create PyPI CD --- .github/workflows/publish-pypi.yml | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/publish-pypi.yml diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml new file mode 100644 index 0000000000..9228685ffe --- /dev/null +++ b/.github/workflows/publish-pypi.yml @@ -0,0 +1,31 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install 
dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel build twine + pip install -r requirements.txt + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: make pypi pypi-workaround From 7750f3f04b99cc488ee1764c9939885c0ae84d14 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 16:14:42 +0200 Subject: [PATCH 140/191] :memo: changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2912a303d..dcf7bbfba5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,8 +16,7 @@ Changed: - significantly more detailed logging for the METS Server and Processing Server, #1284 - Only import `ocrd_network` in src/ocrd/decorators/__init__.py once needed, #1289 - - + - Automate release via GitHub Actions, #1290 Fixed: From 012ccf6af1cbc7cd377c4564b436e71dceff2fa5 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 10 Oct 2024 16:16:00 +0200 Subject: [PATCH 141/191] :package: v2.70.0 --- CHANGELOG.md | 3 +++ VERSION | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dcf7bbfba5..4b90a57a24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.70.0] - 2024-10-10 + Added: - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 @@ -2214,6 +2216,7 @@ Fixed Initial Release +[2.70.0]: ../../compare/v2.70.0..v2.69.0 [2.69.0]: ../../compare/v2.69.0..v2.68.0 [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 diff --git a/VERSION b/VERSION index a740b92f5e..38a7743781 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.69.0 +2.70.0 From a9d49c1df906af98f618dbf99b01b2fb9900452b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 10 Oct 2024 14:28:41 +0000 Subject: [PATCH 142/191] =?UTF-8?q?Processor=20w/=20OCRD=5FMAX=5FPARALLEL?= =?UTF-8?q?=5FPAGES:=20ThreadPoolExecutor=E2=86=92ProcessPoolExecutor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ocrd/processor/base.py | 174 ++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 79 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 28cbaf7269..8ea53246d8 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,7 +23,8 @@ import io import weakref from frozendict import frozendict -from concurrent.futures import ThreadPoolExecutor, TimeoutError +from concurrent.futures import ProcessPoolExecutor, TimeoutError +import multiprocessing as mp from click import wrap_text from deprecated import deprecated @@ -465,11 +466,7 @@ def process_workspace(self, workspace: Workspace) -> None: self.workspace = workspace self.verify() try: - nr_succeeded = 0 - nr_skipped = 0 - nr_copied = 0 - - # set up multithreading + # set up multitasking max_workers = max(0, config.OCRD_MAX_PARALLEL_PAGES) if self.max_workers > 0 and self.max_workers < config.OCRD_MAX_PARALLEL_PAGES: self._base_logger.info("limiting number of threads from %d to %d", max_workers, self.max_workers) @@ -481,80 +478,17 @@ def process_workspace(self, workspace: Workspace) -> None: if self.max_page_seconds > 0 and self.max_page_seconds < config.OCRD_PROCESSING_PAGE_TIMEOUT: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = 
self.max_page_seconds - executor = ThreadPoolExecutor( - max_workers=max_workers or 1, - thread_name_prefix=f"pagetask.{workspace.mets.unique_identifier}" - ) - self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - tasks = {} - - for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) - page_id = next(input_file.pageId - for input_file in input_file_tuple - if input_file) - self._base_logger.info(f"preparing page {page_id}") - for i, input_file in enumerate(input_file_tuple): - if input_file is None: - # file/page not found in this file grp - continue - input_files[i] = input_file - if not self.download: - continue - try: - input_files[i] = self.workspace.download_file(input_file) - except (ValueError, FileNotFoundError, HTTPError) as e: - self._base_logger.error(repr(e)) - self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") - # process page - tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - self._base_logger.debug("submitted %d processing tasks", len(tasks)) - - for task in tasks: - # wait for results, handle errors - page_id, input_files = tasks[task] - # FIXME: differentiate error cases in various ways: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise - try: - self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds or None) - nr_succeeded += 1 - # exclude NotImplementedError, so we can try process() below - except NotImplementedError: - raise - # handle input failures separately - except FileExistsError as err: - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures (including TimeoutError) - except (Exception, TimeoutError) as err: - # FIXME: add re-usable/actionable logging - if config.OCRD_MISSING_OUTPUT == 'ABORT': - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - raise err - self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - if config.OCRD_MISSING_OUTPUT == 'SKIP': - nr_skipped += 1 - continue - if config.OCRD_MISSING_OUTPUT == 'COPY': - self._copy_page_file(input_files[0]) - nr_copied += 1 - else: - desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) - raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with skipped output ({nr_skipped})") - if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with fallback output ({nr_skipped})") - executor.shutdown() + with ProcessPoolExecutor( + max_workers=max_workers or 1, + # only forking method avoids pickling + mp_context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self,), + ) as executor: + self._base_logger.debug("started 
executor %s with %d workers", str(executor), max_workers or 1) + self._process_workspace_run(executor, max_workers, max_seconds) except NotImplementedError: # fall back to deprecated method @@ -564,6 +498,80 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None + def _process_workspace_run(self, executor, max_workers, max_seconds): + nr_succeeded = 0 + nr_skipped = 0 + nr_copied = 0 + + tasks = {} + for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) + tasks[executor.submit(_page_worker, *input_files)] = (page_id, input_files) + self._base_logger.debug("submitted %d processing tasks", len(tasks)) + + for task in tasks: + # wait for results, handle errors + page_id, input_files = tasks[task] + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) + task.result(timeout=max_seconds or None) + nr_succeeded += 1 + # exclude NotImplementedError, so we can try process() below + except NotImplementedError: + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + continue + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures (including TimeoutError) + except (Exception, TimeoutError) as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + nr_skipped += 1 + continue + if config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + nr_copied += 1 + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + + if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with skipped output ({nr_skipped})") + if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with fallback output ({nr_skipped})") + def _copy_page_file(self, input_file : 
OcrdFileType) -> None: """ Copy the given ``input_file`` of the :py:data:`workspace`, @@ -940,6 +948,14 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ifts.append(tuple(ifiles)) return ifts +_page_worker_processor = None +def _page_worker_set_ctxt(processor): + global _page_worker_processor + _page_worker_processor = processor + +def _page_worker(*input_files): + _page_worker_processor.process_page_file(*input_files) + def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. From a8e2c6488b819c08dc96092da04a103bc77b0593 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:38:39 +0200 Subject: [PATCH 143/191] deps-cuda: retry micro.mamba.pm even more --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 23a6b438a5..1708caa129 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ deps-cuda: CONDA_EXE ?= /usr/local/bin/conda deps-cuda: export CONDA_PREFIX ?= /conda deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])' deps-cuda: - curl --retry 3 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + curl --retry 6 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba mv bin/micromamba $(CONDA_EXE) # Install Conda system-wide (for interactive / login shells) echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh From 85bde1574293ea8b7ba29255fbb8e07312c28eb1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:41:26 +0200 Subject: [PATCH 144/191] PyPI: do not upload deprecated distribution aliases anymore --- .github/workflows/publish-pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml index 9228685ffe..e811c958ab 100644 --- a/.github/workflows/publish-pypi.yml +++ b/.github/workflows/publish-pypi.yml @@ -28,4 +28,4 @@ jobs: env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: make pypi pypi-workaround + run: make pypi From 41c0ce8103379646eafd6a51c00b6596943d2948 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 14 Oct 2024 16:36:29 +0200 Subject: [PATCH 145/191] fix broken logging --- src/ocrd_network/processing_server.py | 5 +++-- src/ocrd_network/processing_worker.py | 9 ++++++--- src/ocrd_network/processor_server.py | 3 ++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 83cb1d75f1..31eeca5299 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -79,7 +79,6 @@ class ProcessingServer(FastAPI): """ def __init__(self, config_path: str, host: str, port: int) -> None: - initLogging() self.title = "OCR-D Processing Server" super().__init__( title=self.title, @@ -87,6 +86,7 @@ def __init__(self, config_path: str, host: str, port: int) -> None: on_shutdown=[self.on_shutdown], description="OCR-D Processing Server" ) + initLogging() self.log = getLogger("ocrd_network.processing_server") log_file = get_processing_server_logging_file_path(pid=getpid()) 
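        # attach a dedicated file handler, so each Processing Server process logs to its own per-PID file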
configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -156,7 +156,7 @@ def start(self) -> None: queue_names = self.deployer.find_matching_network_agents( worker_only=True, str_names_only=True, unique_only=True ) - self.log.debug(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}") + self.log.info(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}") create_message_queues(logger=self.log, rmq_publisher=self.rmq_publisher, queue_names=queue_names) self.deployer.deploy_network_agents(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url) @@ -168,6 +168,7 @@ def start(self) -> None: uvicorn_run(self, host=self.hostname, port=int(self.port)) async def on_startup(self): + self.log.info(f"Initializing the Database on: {self.mongodb_url}") await initiate_database(db_url=self.mongodb_url) async def on_shutdown(self) -> None: diff --git a/src/ocrd_network/processing_worker.py b/src/ocrd_network/processing_worker.py index a352ea5fde..7ede0f9164 100644 --- a/src/ocrd_network/processing_worker.py +++ b/src/ocrd_network/processing_worker.py @@ -9,12 +9,12 @@ """ from datetime import datetime -from os import getpid +from os import getpid, getppid from pika import BasicProperties from pika.adapters.blocking_connection import BlockingChannel from pika.spec import Basic -from ocrd_utils import getLogger +from ocrd_utils import getLogger, initLogging from .constants import JobState from .database import sync_initiate_database, sync_db_get_workspace, sync_db_update_processing_job, verify_database_uri from .logging_utils import ( @@ -35,14 +35,16 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: + initLogging() self.log = getLogger(f'ocrd_network.processing_worker') log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") try: verify_database_uri(mongodb_addr) - self.log.debug(f'Verified MongoDB URL: {mongodb_addr}') + self.log.info(f'Verified MongoDB URL: {mongodb_addr}') self.rmq_data = verify_and_parse_mq_uri(rabbitmq_addr) + self.log.info(f'Verified RabbitMQ URL: {rabbitmq_addr}') except ValueError as error: msg = f"Failed to parse data, error: {error}" self.log.exception(msg) @@ -61,6 +63,7 @@ def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, # Gets assigned when the `connect_publisher` is called on the worker object # Used to publish OcrdResultMessage type message to the queue with name {processor_name}-result self.rmq_publisher = None + self.log.info(f"Initialized processing worker: {processor_name}") def connect_consumer(self): self.rmq_consumer = connect_rabbitmq_consumer(self.log, self.rmq_data) diff --git a/src/ocrd_network/processor_server.py b/src/ocrd_network/processor_server.py index 5aed89d72c..60674afbf6 100644 --- a/src/ocrd_network/processor_server.py +++ b/src/ocrd_network/processor_server.py @@ -42,13 +42,13 @@ class ProcessorServer(FastAPI): def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None): if not (processor_name or processor_class): raise ValueError("Either 'processor_name' or 'processor_class' must be provided") - initLogging() super().__init__( on_startup=[self.on_startup], on_shutdown=[self.on_shutdown], title=f"Network agent - Processor Server", description="Network agent - Processor Server" ) + initLogging() self.log = 
getLogger("ocrd_network.processor_server") log_file = get_processor_server_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -69,6 +69,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.processor_name = self.ocrd_tool["executable"] self.add_api_routes_processing() + self.log.info(f"Initialized processor server: {processor_name}") async def on_startup(self): await initiate_database(db_url=self.db_url) From 6d8539c6ea48bfa795f53554968f5c77f9d88551 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:22:48 +0200 Subject: [PATCH 146/191] deps-cuda: make sure cudnn gets installed, pin torch version for CUDA 11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (sry, must commit to master directly – only Docker deploy is affected; current version is broken for all CUDA images) --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1708caa129..0477727343 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ deps-cuda: # works, too: shopt -s nullglob; \ $(PIP) install nvidia-pyindex \ - && $(PIP) install nvidia-cudnn-cu11==8.7.* \ + && $(PIP) install nvidia-cudnn-cu11~=8.7 \ nvidia-cublas-cu11~=11.11 \ nvidia-cusparse-cu11~=11.7 \ nvidia-cusolver-cu11~=11.4 \ @@ -158,7 +158,7 @@ deps-tf2: fi deps-torch: - $(PIP) install -i https://download.pytorch.org/whl/cu118 torch torchvision + $(PIP) install -i https://download.pytorch.org/whl/cu118 torchvision==0.16.2+cu118 torch==2.1.2+cu118 # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: From 588c91df826951d29b24f1e1677cced3a55b2153 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Oct 2024 08:44:56 +0000 Subject: [PATCH 147/191] Processor.process_workspace: apply timeout on process_page_file worker itself (rather than future query) --- src/ocrd/processor/base.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8ea53246d8..ce6b3e4949 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -25,6 +25,8 @@ from frozendict import frozendict from concurrent.futures import ProcessPoolExecutor, TimeoutError import multiprocessing as mp +from threading import Timer +from _thread import interrupt_main from click import wrap_text from deprecated import deprecated @@ -524,7 +526,7 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") # process page #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - tasks[executor.submit(_page_worker, *input_files)] = (page_id, input_files) + tasks[executor.submit(_page_worker, max_seconds, *input_files)] = (page_id, input_files) self._base_logger.debug("submitted %d processing tasks", len(tasks)) for task in tasks: @@ -536,7 +538,12 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): # - persistent (data) error → skip / dummy / raise try: self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - task.result(timeout=max_seconds or None) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor 
offers nothing + # to that effect: + # task.result(timeout=max_seconds or None) + # so we instead apply the timeout within the worker function + task.result() nr_succeeded += 1 # exclude NotImplementedError, so we can try process() below except NotImplementedError: @@ -551,7 +558,7 @@ def _process_workspace_run(self, executor, max_workers, max_seconds): # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") # broad coverage of output failures (including TimeoutError) - except (Exception, TimeoutError) as err: + except Exception as err: # FIXME: add re-usable/actionable logging if config.OCRD_MISSING_OUTPUT == 'ABORT': self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") @@ -953,8 +960,21 @@ def _page_worker_set_ctxt(processor): global _page_worker_processor _page_worker_processor = processor -def _page_worker(*input_files): - _page_worker_processor.process_page_file(*input_files) +def _page_worker(timeout, *input_files): + page_id = next((file.pageId for file in input_files + if hasattr(file, 'pageId')), "") + if timeout > 0: + timer = Timer(timeout, interrupt_main) + timer.start() + try: + _page_worker_processor.process_page_file(*input_files) + _page_worker_processor.logger.debug("page worker completed for page %s", page_id) + except KeyboardInterrupt: + _page_worker_processor.logger.debug("page worker timed out for page %s", page_id) + raise TimeoutError() + finally: + if timeout > 0: + timer.cancel() def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. From d126bdce4ef81c148c1bae4718d000082f863704 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Oct 2024 08:46:21 +0000 Subject: [PATCH 148/191] =?UTF-8?q?Processor=20w/=20OCRD=5FMAX=5FPARALLEL?= =?UTF-8?q?=5FPAGES:=20concurrent.futures=E2=86=92loky?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 1 + src/ocrd/processor/base.py | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index e78c186618..05d4e9aa44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ httpx>=0.22.0 importlib_metadata ; python_version < '3.8' importlib_resources ; python_version < '3.10' jsonschema>=4 +loky lxml memory-profiler >= 0.58.0 # XXX explicitly do not restrict the numpy version because different diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index ce6b3e4949..b6a41d6b5f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -23,7 +23,9 @@ import io import weakref from frozendict import frozendict -from concurrent.futures import ProcessPoolExecutor, TimeoutError +# concurrent.futures is buggy in py38, +# this is where the fixes came from: +from loky import ProcessPoolExecutor import multiprocessing as mp from threading import Timer from _thread import interrupt_main @@ -481,16 +483,19 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - with ProcessPoolExecutor( - max_workers=max_workers or 1, - # only forking method avoids pickling - mp_context=mp.get_context('fork'), - # share processor instance as global to avoid pickling - initializer=_page_worker_set_ctxt, - initargs=(self,), - ) as executor: + 
executor = ProcessPoolExecutor( + max_workers=max_workers or 1, + # only forking method avoids pickling + context=mp.get_context('fork'), + # share processor instance as global to avoid pickling + initializer=_page_worker_set_ctxt, + initargs=(self,), + ) + try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) self._process_workspace_run(executor, max_workers, max_seconds) + finally: + executor.shutdown(kill_workers=True) except NotImplementedError: # fall back to deprecated method From afa7f30a6bf212fece28ebc354da726a658ba121 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Oct 2024 00:23:06 +0000 Subject: [PATCH 149/191] Processor w/o OCRD_MAX_PARALLEL_PAGES: dummy instead of executor --- src/ocrd/processor/base.py | 46 ++++++++++++++++++++++++++++--- tests/processor/test_processor.py | 1 - 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index b6a41d6b5f..7ff271ecab 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -483,7 +483,29 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - executor = ProcessPoolExecutor( + class DummyExecutor: + """ + Mimics some of ProcessPoolExecutor but runs everything + immediately in this process. + """ + class DummyFuture: + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs): + return DummyExecutor.DummyFuture(fn, *args, **kwargs) + if max_workers > 1: + executor_cls = ProcessPoolExecutor + else: + executor_cls = DummyExecutor + executor = executor_cls( max_workers=max_workers or 1, # only forking method avoids pickling context=mp.get_context('fork'), @@ -493,7 +515,7 @@ def process_workspace(self, workspace: Workspace) -> None: ) try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - self._process_workspace_run(executor, max_workers, max_seconds) + self._process_workspace_run(executor, max_seconds) finally: executor.shutdown(kill_workers=True) @@ -505,7 +527,7 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None - def _process_workspace_run(self, executor, max_workers, max_seconds): + def _process_workspace_run(self, executor, max_seconds): nr_succeeded = 0 nr_skipped = 0 nr_copied = 0 @@ -961,11 +983,27 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): return ifts _page_worker_processor = None +""" +This global binding for the processor is required to avoid +squeezing the processor through a mp.Queue (which is impossible +due to unpicklable attributes like .workspace.mets._tree anyway) +when calling Processor.process_page_file as page worker processes +in Processor.process_workspace. Forking allows inheriting global +objects, and with the METS Server we do not mutate the local +processor instance anyway. +""" def _page_worker_set_ctxt(processor): + """ + Overwrites `ocrd.processor.base._page_worker_processor` instance + for sharing with subprocesses in ProcessPoolExecutor initializer. 
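+    This only works with the 'fork' start method, which lets worker processes inherit the module-level global instead of pickling the processor instance.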
+ """ global _page_worker_processor _page_worker_processor = processor - def _page_worker(timeout, *input_files): + """ + Wraps a `Processor.process_page_file` call as payload (call target) + of the ProcessPoolExecutor workers, but also enforces the given timeout. + """ page_id = next((file.pageId for file in input_files if hasattr(file, 'pageId')), "") if timeout > 0: diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 33a9548811..5844cb8774 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -277,7 +277,6 @@ def test_run_output_timeout(self): assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) config.OCRD_EXISTING_OUTPUT = 'OVERWRITE' config.OCRD_PROCESSING_PAGE_TIMEOUT = 1 - from concurrent.futures import TimeoutError with pytest.raises(TimeoutError) as exc: run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", From 58217018d8bcd85df5dc4e3e03eb62a0d9255690 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Oct 2024 01:27:58 +0000 Subject: [PATCH 150/191] ocrd.process.profile logger: account for subprocess CPU time, too --- src/ocrd/processor/helpers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 2cbbbd97e1..757f7ac045 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -2,6 +2,7 @@ Helper methods for running and documenting processors """ from time import perf_counter, process_time +from os import times from functools import lru_cache import json import inspect @@ -94,6 +95,7 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() + t0_os = times() if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']): backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil' from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel @@ -123,7 +125,13 @@ def run_processor( t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu - logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( + t1_os = times() + # add CPU time from child processes (page worker etc) + t1_cpu += t1_os.children_user - t0_os.children_user + t1_cpu += t1_os.children_system - t0_os.children_system + logProfile.info( + "Executing processor '%s' took %fs (wall) %fs (CPU)( " + "[--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']", ocrd_tool['executable'], t1_wall, t1_cpu, @@ -131,7 +139,7 @@ def run_processor( processor.output_file_grp or '', json.dumps(processor.parameter) or '', processor.page_id or '' - )) + ) workspace.mets.add_agent( name=name, _type='OTHER', From 53b1854e139f66e3061d2e4feae5411c9b8d092a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 21 Oct 2024 12:47:33 +0000 Subject: [PATCH 151/191] Processor.process_workspace: improve reporting, raise early if too many failures already (rate will be too low) --- src/ocrd/processor/base.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7ff271ecab..46b07c7161 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -22,6 +22,7 @@ import tarfile import io import weakref +from 
collections import defaultdict from frozendict import frozendict # concurrent.futures is buggy in py38, # this is where the fixes came from: @@ -528,9 +529,10 @@ def submit(self, fn, *args, **kwargs): raise err from None def _process_workspace_run(self, executor, max_seconds): + # aggregate info for logging: nr_succeeded = 0 - nr_skipped = 0 - nr_copied = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): @@ -572,8 +574,8 @@ def _process_workspace_run(self, executor, max_seconds): # so we instead apply the timeout within the worker function task.result() nr_succeeded += 1 - # exclude NotImplementedError, so we can try process() below except NotImplementedError: + # exclude NotImplementedError, so we can try process() below raise # handle input failures separately except FileExistsError as err: @@ -587,24 +589,35 @@ def _process_workspace_run(self, executor, max_seconds): # broad coverage of output failures (including TimeoutError) except Exception as err: # FIXME: add re-usable/actionable logging + nr_errors[err.__class__.__name__] += 1 + nr_failed += 1 if config.OCRD_MISSING_OUTPUT == 'ABORT': self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") raise err self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") if config.OCRD_MISSING_OUTPUT == 'SKIP': - nr_skipped += 1 + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with skipped output ({nr_failed} of {nr_failed+nr_succeeded})") continue if config.OCRD_MISSING_OUTPUT == 'COPY': + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with fallback-copied output ({nr_failed} of {nr_failed+nr_succeeded})") self._copy_page_file(input_files[0]) - nr_copied += 1 else: desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - if nr_skipped > 0 and nr_succeeded / nr_skipped < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with skipped output ({nr_skipped})") - if nr_copied > 0 and nr_succeeded / nr_copied < config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with fallback output ({nr_skipped})") + if nr_failed > 0: + nr_all = nr_succeeded + nr_failed + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + if config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") + self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(dict(nr_errors))) def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 4d66e3702dfdd1063307ab09c33126ddc2f930a2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 23 Oct 2024 22:12:57 +0000 Subject: [PATCH 152/191] Processor: refactor process_workspace into overridable subfuncs --- repo/spec | 2 +- src/ocrd/processor/base.py | 299 +++++++++++++++++++++++++------------ 2 files changed, 201 insertions(+), 100 deletions(-) diff --git a/repo/spec b/repo/spec index 
df2a07e3fd..506b33936d 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 +Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 46b07c7161..85a0dea212 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,7 +16,7 @@ import os from os import getcwd from pathlib import Path -from typing import Any, List, Optional, Union, get_args +from typing import Any, Dict, List, Optional, Tuple, Union, get_args import sys import inspect import tarfile @@ -26,7 +26,7 @@ from frozendict import frozendict # concurrent.futures is buggy in py38, # this is where the fixes came from: -from loky import ProcessPoolExecutor +from loky import Future, ProcessPoolExecutor import multiprocessing as mp from threading import Timer from _thread import interrupt_main @@ -111,6 +111,31 @@ def __init__(self, fileGrp, pageId, mimetype): f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) +class DummyFuture: + """ + Mimics some of `concurrent.futures.Future` but runs immediately. + """ + def __init__(self, fn, *args, **kwargs): + self.fn = fn + self.args = args + self.kwargs = kwargs + def result(self): + return self.fn(*self.args, **self.kwargs) +class DummyExecutor: + """ + Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs + everything immediately in this process. + """ + def __init__(self, initializer=None, initargs=(), **kwargs): + initializer(*initargs) + def shutdown(self, **kwargs): + pass + def submit(self, fn, *args, **kwargs) -> DummyFuture: + return DummyFuture(fn, *args, **kwargs) + +TFuture = Union[DummyFuture, Future] +TExecutor = Union[DummyExecutor, ProcessPoolExecutor] + class Processor(): """ A processor is a tool that implements the uniform OCR-D @@ -462,6 +487,9 @@ def process_workspace(self, workspace: Workspace) -> None: for the given :py:data:`page_id` (or all pages) under the given :py:data:`parameter`. + Delegates to :py:meth:`.process_workspace_submit_tasks` + and :py:meth:`.process_workspace_handle_tasks`. + (This will iterate over pages and files, calling :py:meth:`.process_page_file` and handling exceptions. It should be overridden by subclasses to handle cases @@ -484,24 +512,6 @@ def process_workspace(self, workspace: Workspace) -> None: self._base_logger.info("limiting page timeout from %d to %d sec", max_seconds, self.max_page_seconds) max_seconds = self.max_page_seconds - class DummyExecutor: - """ - Mimics some of ProcessPoolExecutor but runs everything - immediately in this process. 
- """ - class DummyFuture: - def __init__(self, fn, *args, **kwargs): - self.fn = fn - self.args = args - self.kwargs = kwargs - def result(self): - return self.fn(*self.args, **self.kwargs) - def __init__(self, initializer=None, initargs=(), **kwargs): - initializer(*initargs) - def shutdown(self, **kwargs): - pass - def submit(self, fn, *args, **kwargs): - return DummyExecutor.DummyFuture(fn, *args, **kwargs) if max_workers > 1: executor_cls = ProcessPoolExecutor else: @@ -516,7 +526,8 @@ def submit(self, fn, *args, **kwargs): ) try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) - self._process_workspace_run(executor, max_seconds) + tasks = self.process_workspace_submit_tasks(executor, max_seconds) + stats = self.process_workspace_handle_tasks(tasks) finally: executor.shutdown(kill_workers=True) @@ -528,96 +539,186 @@ def submit(self, fn, *args, **kwargs): # suppress the NotImplementedError context raise err from None - def _process_workspace_run(self, executor, max_seconds): - # aggregate info for logging: - nr_succeeded = 0 - nr_failed = 0 - nr_errors = defaultdict(int) # count causes - + def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]: + """ + Look up all input files of the given ``workspace`` + from the given :py:data:`input_file_grp` + for the given :py:data:`page_id` (or all pages), + and schedules calling :py:meth:`.process_page_file` + on them for each page via `executor` (enforcing + a per-page time limit of `max_seconds`). + + When running with `OCRD_MAX_PARALLEL_PAGES>1` and + the workspace via METS Server, the executor will fork + this many worker parallel subprocesses each processing + one page at a time. (Interprocess communication is + done via task and result queues.) + + Otherwise, tasks are run sequentially in the + current process. + + Delegates to :py:meth:`.zip_input_files` to get + the input files for each page, and then calls + :py:meth:`.process_workspace_submit_page_task`. + + Returns a dict mapping the per-page tasks + (i.e. futures submitted to the executor) + to their corresponding pageId and input files. 
+ """ tasks = {} for input_file_tuple in self.zip_input_files(on_error='abort', require_first=False): - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) - page_id = next(input_file.pageId - for input_file in input_file_tuple - if input_file) - self._base_logger.info(f"preparing page {page_id}") - for i, input_file in enumerate(input_file_tuple): - if input_file is None: - # file/page not found in this file grp - continue - input_files[i] = input_file - if not self.download: - continue - try: - input_files[i] = self.workspace.download_file(input_file) - except (ValueError, FileNotFoundError, HTTPError) as e: - self._base_logger.error(repr(e)) - self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") - # process page - #tasks[executor.submit(self.process_page_file, *input_files)] = (page_id, input_files) - tasks[executor.submit(_page_worker, max_seconds, *input_files)] = (page_id, input_files) + task, page_id, input_files = self.process_workspace_submit_page_task(executor, max_seconds, input_file_tuple) + tasks[task] = (page_id, input_files) self._base_logger.debug("submitted %d processing tasks", len(tasks)) + return tasks + def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]: + """ + Ensure all input files for a single page are + downloaded to the workspace, then schedule + :py:meth:`.process_process_file` to be run on + them via `executor` (enforcing a per-page time + limit of `max_seconds`). + + Delegates to :py:meth:`.process_page_file` + (wrapped in :py:func:`_page_worker` to share + the processor instance across forked processes). + + \b + Returns a tuple of: + - the scheduled future object, + - the corresponding pageId, + - the corresponding input files. + """ + input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + page_id = next(input_file.pageId + for input_file in input_file_tuple + if input_file) + self._base_logger.info(f"preparing page {page_id}") + for i, input_file in enumerate(input_file_tuple): + if input_file is None: + # file/page not found in this file grp + continue + input_files[i] = input_file + if not self.download: + continue + try: + input_files[i] = self.workspace.download_file(input_file) + except (ValueError, FileNotFoundError, HTTPError) as e: + self._base_logger.error(repr(e)) + self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}") + # process page + #executor.submit(self.process_page_file, *input_files) + return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files + + def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]: + """ + Look up scheduled per-page futures one by one, + handle errors (exceptions) and gather results. + + \b + Enforces policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns a tuple of: + - the number of successfully processed pages + - the number of failed (i.e. skipped or copied) pages + - a dict of the type and corresponding number of exceptions seen + - the number of total requested pages (i.e. success+fail+existing). + + Delegates to :py:meth:`.process_workspace_handle_page_task` + for each page. 
+ """ + # aggregate info for logging: + nr_succeeded = 0 + nr_failed = 0 + nr_errors = defaultdict(int) # count causes + if config.OCRD_MISSING_OUTPUT == 'SKIP': + reason = "skipped" + elif config.OCRD_MISSING_OUTPUT == 'COPY': + reason = "fallback-copied" for task in tasks: # wait for results, handle errors page_id, input_files = tasks[task] - # FIXME: differentiate error cases in various ways: - # - ResourceNotFoundError → use ResourceManager to download (once), then retry - # - transient (I/O or OOM) error → maybe sleep, retry - # - persistent (data) error → skip / dummy / raise - try: - self._base_logger.debug("waiting for output of task %s (page %s) max_seconds=%d", task, page_id, max_seconds) - # timeout kwarg on future is useless: it only raises TimeoutError here, - # but does not stop the running process/thread, and executor offers nothing - # to that effect: - # task.result(timeout=max_seconds or None) - # so we instead apply the timeout within the worker function - task.result() - nr_succeeded += 1 - except NotImplementedError: - # exclude NotImplementedError, so we can try process() below - raise - # handle input failures separately - except FileExistsError as err: - if config.OCRD_EXISTING_OUTPUT == 'ABORT': - raise err - if config.OCRD_EXISTING_OUTPUT == 'SKIP': - continue - if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': - # too late here, must not happen - raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") - # broad coverage of output failures (including TimeoutError) - except Exception as err: - # FIXME: add re-usable/actionable logging - nr_errors[err.__class__.__name__] += 1 + result = self.process_workspace_handle_page_task(page_id, input_files, task) + if isinstance(result, Exception): + nr_errors[result.__class__.__name__] += 1 nr_failed += 1 - if config.OCRD_MISSING_OUTPUT == 'ABORT': - self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - raise err - self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") - if config.OCRD_MISSING_OUTPUT == 'SKIP': - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: - # already irredeemably many failures, stop short - raise Exception(f"too many failures with skipped output ({nr_failed} of {nr_failed+nr_succeeded})") - continue - if config.OCRD_MISSING_OUTPUT == 'COPY': - if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: - # already irredeemably many failures, stop short - raise Exception(f"too many failures with fallback-copied output ({nr_failed} of {nr_failed+nr_succeeded})") - self._copy_page_file(input_files[0]) - else: - desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) - raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") - + # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed + if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: + # already irredeemably many failures, stop short + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded})") + elif result: + nr_succeeded += 1 + # else skipped - already exists + nr_errors = dict(nr_errors) if nr_failed > 0: nr_all = nr_succeeded + nr_failed - if config.OCRD_MISSING_OUTPUT == 'SKIP': - reason = "skipped" - if config.OCRD_MISSING_OUTPUT == 'COPY': - reason = "fallback-copied" if 
config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") - self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(dict(nr_errors))) + self._base_logger.info("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) + return nr_succeeded, nr_failed, nr_errors, len(tasks) + + def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: + """ + \b + Await a single page result and handle errors (exceptions), + enforcing policies configured by the following + environment variables: + - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) + - `OCRD_MISSING_OUTPUT` (abort/skip/fallback-copy) + - `OCRD_MAX_MISSING_OUTPUTS` (abort after all). + + \b + Returns + - true in case of success + - false in case the output already exists + - the exception in case of failure + """ + # FIXME: differentiate error cases in various ways: + # - ResourceNotFoundError → use ResourceManager to download (once), then retry + # - transient (I/O or OOM) error → maybe sleep, retry + # - persistent (data) error → skip / dummy / raise + try: + self._base_logger.debug("waiting for output of task %s (page %s)", task, page_id) + # timeout kwarg on future is useless: it only raises TimeoutError here, + # but does not stop the running process/thread, and executor itself + # offers nothing to that effect: + # task.result(timeout=max_seconds or None) + # so we instead applied the timeout within the worker function + task.result() + return True + except NotImplementedError: + # exclude NotImplementedError, so we can try process() below + raise + # handle input failures separately + except FileExistsError as err: + if config.OCRD_EXISTING_OUTPUT == 'ABORT': + raise err + if config.OCRD_EXISTING_OUTPUT == 'SKIP': + return False + if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': + # too late here, must not happen + raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + # broad coverage of output failures (including TimeoutError) + except Exception as err: + # FIXME: add re-usable/actionable logging + if config.OCRD_MISSING_OUTPUT == 'ABORT': + self._base_logger.error(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + raise err + self._base_logger.exception(f"Failure on page {page_id}: {str(err) or err.__class__.__name__}") + if config.OCRD_MISSING_OUTPUT == 'SKIP': + pass + elif config.OCRD_MISSING_OUTPUT == 'COPY': + self._copy_page_file(input_files[0]) + else: + desc = config.describe('OCRD_MISSING_OUTPUT', wrap_text=False, indent_text=False) + raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") + return err def _copy_page_file(self, input_file : OcrdFileType) -> None: """ From 93ec19a100376ec634ca3bc2f65a08fe8949d4b5 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 29 Oct 2024 14:27:47 +0100 Subject: [PATCH 153/191] fix internal_callback logging message --- src/ocrd_network/processing_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_network/processing_worker.py b/src/ocrd_network/processing_worker.py index a352ea5fde..0a39a5e971 100644 --- a/src/ocrd_network/processing_worker.py +++ b/src/ocrd_network/processing_worker.py @@ -240,7 +240,7 @@ def publish_result_to_all(self, processing_message: OcrdProcessingMessage, resul # post the result message (callback to a user 
defined endpoint) post_to_callback_url(self.log, callback_url, result_message) if internal_callback_url: - self.log.info(f"Publishing result to internal callback url (Processing Server): {callback_url}") + self.log.info(f"Publishing result to internal callback url (Processing Server): {internal_callback_url}") # If the internal callback_url field is set, # post the result message (callback to Processing Server endpoint) post_to_callback_url(self.log, internal_callback_url, result_message) From 71d6d496fdc42bdc9c7b338b1ce78d593b36555d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 20:31:18 +0000 Subject: [PATCH 154/191] Processor.process_workspace_handle_page_task: do not handler sigint --- src/ocrd/processor/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 85a0dea212..297b34647f 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -704,6 +704,8 @@ def process_workspace_handle_page_task(self, page_id : str, input_files : List[O if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE': # too late here, must not happen raise Exception(f"got {err} despite OCRD_EXISTING_OUTPUT==OVERWRITE") + except KeyboardInterrupt: + raise # broad coverage of output failures (including TimeoutError) except Exception as err: # FIXME: add re-usable/actionable logging @@ -1113,6 +1115,7 @@ def _page_worker_set_ctxt(processor): """ global _page_worker_processor _page_worker_processor = processor + def _page_worker(timeout, *input_files): """ Wraps a `Processor.process_page_file` call as payload (call target) From d2d5290a0fb789979b1ce29690f9e93f64c61c1f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 20:32:22 +0000 Subject: [PATCH 155/191] Processor.process_workspace_handle_tasks: log nr of ignored exceptions in the end --- src/ocrd/processor/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 297b34647f..87e6731dfa 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -529,7 +529,7 @@ def process_workspace(self, workspace: Workspace) -> None: tasks = self.process_workspace_submit_tasks(executor, max_seconds) stats = self.process_workspace_handle_tasks(tasks) finally: - executor.shutdown(kill_workers=True) + executor.shutdown(kill_workers=True, wait=False) except NotImplementedError: # fall back to deprecated method @@ -651,7 +651,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O # FIXME: this is just prospective, because len(tasks)==nr_failed+nr_succeeded is not guaranteed if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: # already irredeemably many failures, stop short - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded})") + nr_errors = dict(nr_errors) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})") elif result: nr_succeeded += 1 # else skipped - already exists @@ -659,8 +660,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O if nr_failed > 0: nr_all = nr_succeeded + nr_failed if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all})") - self._base_logger.info("%s %d of %d pages due to %s", reason, 
nr_failed, nr_all, str(nr_errors)) + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})") + self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) return nr_succeeded, nr_failed, nr_errors, len(tasks) def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: From 7d1503ebc40d4bd03d6c6e6a9813e8d6279a70a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 30 Oct 2024 22:47:18 +0100 Subject: [PATCH 156/191] :package: v3.0.0b6 --- CHANGELOG.md | 23 +++++++++++++++++++++++ VERSION | 2 +- repo/spec | 2 +- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abbfd5a4d8..da422654bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b6] - 2024-10-30 + +Fixed: + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + +Changed: + - :fire: `ocrd_utils.initLogging`: also add handler to root logger (as in file config), + but disable message propagation to avoid duplication + - only import `ocrd_network` in `src/ocrd/decorators/__init__.py` once needed + - `Processor.process_page_file`: skip computing `process_page_pcgts` if output already exists, + but `OCRD_EXISTING_OUTPUT!=OVERWRITE` + - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: switch from multithreading to multiprocessing, depend on + `loky` instead of stdlib `concurrent.futures` + - `OCRD_PROCESSING_PAGE_TIMEOUT>0`: actually enforce timeout within worker + - `OCRD_MAX_MISSING_OUTPUTS>0`: abort early if too many failures already, prospectively + - `Processor.process_workspace`: split up into overridable sub-methods: + - `process_workspace_submit_tasks` (iterate input file group and schedule page tasks) + - `process_workspace_submit_page_task` (download input files and submit single page task) + - `process_workspace_handle_tasks` (monitor page tasks and aggregate results) + - `process_workspace_handle_page_task` (await single page task and handle errors) + + ## [3.0.0b5] - 2024-09-16 Fixed: @@ -2287,6 +2309,7 @@ Fixed Initial Release +[3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5 [3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 [3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 [3.0.0b3]: ../../compare/v3.0.0b3..v3.0.0b2 diff --git a/VERSION b/VERSION index 09fb39d267..43662e8c29 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b5 +3.0.0b6 diff --git a/repo/spec b/repo/spec index 506b33936d..df2a07e3fd 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 +Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 From 08a631ccc89401724caf32b3211529abc0a13382 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:27:25 +0000 Subject: [PATCH 157/191] tests: prevent side effects from ocrd_logging --- tests/base.py | 2 -- tests/cli/test_log.py | 11 +++++-- tests/processor/test_processor.py | 32 +++++++++++++------- tests/test_decorators.py | 17 +++++------ tests/test_logging.py | 6 ++++ tests/test_logging_conf.py | 49 +++++++++++++------------------ tests/test_mets_server.py | 28 ++++++++++++------ 7 files changed, 83 insertions(+), 62 deletions(-) diff --git a/tests/base.py b/tests/base.py index 53f393e08d..9eb1f20db8 100644 --- a/tests/base.py +++ b/tests/base.py @@ -26,8 +26,6 @@ class TestCase(VanillaTestCase): def 
setUp(self): chdir(dirname(realpath(__file__)) + '/..') - disableLogging() - initLogging(builtin_only=True) class CapturingTestCase(TestCase): """ diff --git a/tests/cli/test_log.py b/tests/cli/test_log.py index c63d78c318..3d81e8266b 100644 --- a/tests/cli/test_log.py +++ b/tests/cli/test_log.py @@ -6,8 +6,8 @@ from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory from ocrd.decorators import ocrd_loglevel -from ocrd_utils import setOverrideLogLevel, logging, disableLogging -import logging as python_logging +from ocrd_utils import disableLogging, initLogging +import logging @click.group() @ocrd_loglevel @@ -18,14 +18,19 @@ def mock_ocrd_cli(log_level): class TestLogCli(TestCase): def _get_log_output(self, *args): - disableLogging() code, out, err = self.invoke_cli(mock_ocrd_cli, args) print({'code': code, 'out': out, 'err': err}) return err + def setUp(self): + super().setUp() + initLogging() + def tearDown(self): if 'OCRD_TOOL_NAME' in ENV: del(ENV['OCRD_TOOL_NAME']) + super().tearDown() + disableLogging() def test_loglevel(self): assert 'DEBUG ocrd.log_cli - foo' not in self._get_log_output('log', 'debug', 'foo') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 5844cb8774..06c129c3ca 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -27,21 +27,21 @@ class TestProcessor(TestCase): + def run(self, result=None): + with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir: + with pushd_popd(workdir): + self.resolver = Resolver() + self.workspace = self.resolver.workspace_from_url('mets.xml') + super().run(result=result) + def setUp(self): super().setUp() - # make sure we get an isolated temporary copy of the testdata each time - # as long as we are not using pytest but unittest, we need to manage contexts - # (enterContext is only supported starting with py311) - with ExitStack() as stack: - self.resolver = Resolver() - self.workdir = stack.enter_context(copy_of_directory(assets.path_to('SBB0000F29300010000/data'))) - stack.enter_context(pushd_popd(self.workdir)) - self.workspace = self.resolver.workspace_from_url('mets.xml') - self.addCleanup(stack.pop_all().close) + initLogging() def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_incomplete_processor(self): proc = IncompleteProcessor(None) @@ -423,6 +423,7 @@ def ocrd_tool(self): def test_run_output_metsserver(start_mets_server): mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 run_processor(DummyProcessorWithOutputSleep, workspace=ws, @@ -446,22 +447,33 @@ def test_run_output_metsserver(start_mets_server): parameter={"sleep": 0}, mets_server_url=mets_server_url) assert "already exists" in str(exc.value) + config.reset_defaults() # 2s (+ 2s tolerance) instead of 3*3s (+ 2s tolerance) -@pytest.mark.timeout(4) +# fixme: pytest-timeout does not shut down / finalize the fixture properly +# (regardless of method or func_only), so the next test in the suite +# does not execute ("previous item was not torn down properly") +# so we must instead wait for completion and assert on the time spent... 
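# note: with parameter sleep=2 per page and OCRD_MAX_PARALLEL_PAGES=3, all pages can
# sleep concurrently, so a fully parallel run should finish in roughly one 2s round
# (plus startup overhead), whereas serial execution would need about 2s per page -
# hence the wall-time assertion on run_time below.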
+#@pytest.mark.timeout(timeout=4, func_only=True, method="signal") def test_run_output_parallel(start_mets_server): + import time mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 # do not raise for single-page timeout config.OCRD_PROCESSING_PAGE_TIMEOUT = -1 # do not raise for number of failures: config.OCRD_MAX_MISSING_OUTPUTS = -1 config.OCRD_MAX_PARALLEL_PAGES = 3 + start_time = time.time() run_processor(DummyProcessorWithOutputSleep, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", parameter={"sleep": 2}, mets_server_url=mets_server_url) + run_time = time.time() - start_time + assert run_time < 3, f"run_processor took {run_time}s" assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + config.reset_defaults() if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index c36577020a..561fdc762d 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -41,22 +41,20 @@ def cli_dummy_processor(*args, **kwargs): class TestDecorators(TestCase): - def setUp(self): - super().setUp() - disableLogging() - def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_minimal(self): - exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) - print(out, err) - assert not exit_code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) + assert not code, (out, err) def test_loglevel_invalid(self): - code, _, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) - assert code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) + assert code, (out, err) import click if int(click.__version__[0]) < 8: assert 'invalid choice: foo' in err @@ -67,7 +65,6 @@ def test_loglevel_override(self): if get_logging_config_files(): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging - disableLogging() assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() diff --git a/tests/test_logging.py b/tests/test_logging.py index c2b6913b10..091fc25bee 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -26,16 +26,22 @@ class TestLogging(TestCase): def setUp(self): pass # do not chdir + def tearDown(self): + super().tearDown() + disableLogging() + def test_loglevel_inheritance(self): initLogging(builtin_only=True) ocrd_logger = logging.getLogger('ocrd') assert ocrd_logger.getEffectiveLevel() == logging.INFO some_logger = getLogger('ocrd.foo') + assert some_logger.level == logging.NOTSET assert some_logger.getEffectiveLevel() == logging.INFO setOverrideLogLevel('ERROR') assert ocrd_logger.getEffectiveLevel() == logging.ERROR assert some_logger.getEffectiveLevel() == logging.ERROR another_logger = getLogger('ocrd.bar') + assert another_logger.level == logging.NOTSET assert another_logger.getEffectiveLevel() == logging.ERROR def test_getLevelName(self): diff --git a/tests/test_logging_conf.py b/tests/test_logging_conf.py index f8e0e9e894..0717674103 100644 --- a/tests/test_logging_conf.py +++ b/tests/test_logging_conf.py @@ -21,74 +21,67 @@ # sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../ocrd') TEST_ROOT = pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent -def resetLogging(): - 
disableLogging() - initLogging() - - @pytest.fixture(name="logging_conf") -def _fixture_logging_conf(tmpdir): +def _fixture_logging_conf(tmpdir, capfd): path_logging_conf_orig = os.path.join( str(TEST_ROOT), 'src', 'ocrd_utils', 'ocrd_logging.conf') path_logging_conf_dest = os.path.join(str(tmpdir), 'ocrd_logging.conf') shutil.copy(path_logging_conf_orig, path_logging_conf_dest) - return str(tmpdir) + with pushd_popd(tmpdir): + with capfd.disabled(): + initLogging() + yield str(tmpdir) + disableLogging() -def test_configured_dateformat(logging_conf, capsys): +def test_configured_dateformat(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and produces desired record format""" # arrange - with pushd_popd(logging_conf): - resetLogging() - test_logger = getLogger('') + test_logger = getLogger('ocrd') - # act - test_logger.info("test logger initialized") + # act + test_logger.info("test logger initialized") - log_info_output = capsys.readouterr().err - must_not_match = r"^\d{4}-\d{2}-\d{2}.*" - assert not re.match(must_not_match, log_info_output) - match_pattern = r"^\d{2}:\d{2}:\d{2}.*" - assert re.match(match_pattern, log_info_output) + log_info_output = capfd.readouterr().err + must_not_match = r"^\d{4}-\d{2}-\d{2}.*" + assert not re.match(must_not_match, log_info_output) + match_pattern = r"^\d{2}:\d{2}:\d{2}.*" + assert re.match(match_pattern, log_info_output), log_info_output -def test_configured_tensorflow_logger_present(logging_conf, capsys): +def test_configured_tensorflow_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger tensorflow""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('tensorflow') # act info logger_under_test.info("tensorflow logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("tensorflow has error") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output -def test_configured_shapely_logger_present(logging_conf, capsys): +def test_configured_shapely_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger shapely.geos""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('shapely.geos') # act info logger_under_test.info("shapely.geos logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("shapely alert") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output if __name__ == '__main__': diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index dc94d6c560..3bb96535c0 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,20 +22,17 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel, disableLogging, getLogger TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] -initLogging() -setOverrideLogLevel(10) - @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: - tmpdir = str(tmpdir) - def 
_start_mets_server(*args, **kwargs): - mets_server = OcrdMetsServer(*args, **kwargs) - mets_server.startup() + initLogging() + #setOverrideLogLevel(10) + logger = getLogger('ocrd') + tmpdir = str(tmpdir) mets_server_url = request.param if mets_server_url == TRANSPORTS[0]: @@ -47,13 +44,26 @@ def _start_mets_server(*args, **kwargs): copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) workspace = Workspace(Resolver(), tmpdir) - p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) + class MetsServerProcess(Process): + def __init__(self, *args, **kwargs): + self.server = OcrdMetsServer(*args, **kwargs) + super().__init__() + def run(self): + self.server.startup() + def terminate(self): + self.server.workspace.save_mets() + super().terminate() + p = MetsServerProcess(workspace=workspace, url=request.param) p.start() + logger.info("started METS Server") sleep(1) # sleep to start up server workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) yield mets_server_url, workspace_server p.terminate() + p.join() + logger.info("terminated METS Server") rmtree(tmpdir, ignore_errors=True) + disableLogging() def add_file_server(x, force=False): mets_server_url, directory, i = x From f3e423ac52f5293596cf88ac2031384857be4145 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:36:17 +0000 Subject: [PATCH 158/191] initLogging: do not remove any previous handlers/levels --- src/ocrd_utils/logging.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index dfac74988b..404ac7ddbc 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -161,18 +161,6 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L global _initialized_flag if _initialized_flag and not force_reinit: return - # disableLogging() - - # https://docs.python.org/3/library/logging.html#logging.disable - # If logging.disable(logging.NOTSET) is called, it effectively removes this - # overriding level, so that logging output again depends on the effective - # levels of individual loggers. 
- logging.disable(logging.NOTSET) - - # remove all handlers for the ocrd root loggers - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) config_file = None if not builtin_only: From 31435187dffb43c692f24f3108f24d0ed1093cfd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:38:44 +0000 Subject: [PATCH 159/191] initLogging: only add root handler instead of multiple redundant handlers with propagate=false --- src/ocrd_utils/logging.py | 7 ++----- src/ocrd_utils/ocrd_logging.conf | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 404ac7ddbc..7f59221c8e 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -179,11 +179,8 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler = logging.StreamHandler(stream=sys.stderr) ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) - for logger_name in ROOT_OCRD_LOGGERS: - logger = logging.getLogger(logger_name) - logger.addHandler(ocrd_handler) - if logger_name: - logger.propagate = False # avoid duplication (from root handler) + root_logger = logging.getLogger('') + root_logger.addHandler(ocrd_handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 5cf161398e..0af039b2ac 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -56,22 +56,22 @@ handlers=consoleHandler,fileHandler # ocrd loggers [logger_ocrd] level=INFO -handlers=consoleHandler,fileHandler +handlers= qualname=ocrd -propagate=0 [logger_ocrd_network] level=INFO -handlers=consoleHandler,processingServerHandler +#handlers=consoleHandler,processingServerHandler +handlers=processingServerHandler qualname=ocrd_network -propagate=0 +#propagate=0 # # logger tensorflow # [logger_ocrd_tensorflow] level=ERROR -handlers=consoleHandler +handlers= qualname=tensorflow # @@ -79,7 +79,7 @@ qualname=tensorflow # [logger_ocrd_shapely_geos] level=ERROR -handlers=consoleHandler +handlers= qualname=shapely.geos @@ -88,7 +88,7 @@ qualname=shapely.geos # [logger_ocrd_PIL] level=INFO -handlers=consoleHandler +handlers= qualname=PIL # @@ -96,34 +96,32 @@ qualname=PIL # [logger_paramiko] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko -propagate=0 [logger_paramiko_transport] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko.transport -propagate=0 # # uvicorn loggers # [logger_uvicorn] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn [logger_uvicorn_access] level=WARN -handlers=consoleHandler +handlers= qualname=uvicorn.access [logger_uvicorn_error] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn.error [logger_multipart] level=INFO -handlers=consoleHandler +handlers= qualname=multipart From 27323c665edc608958a484ce7ae4aebaa65f45f6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:41:20 +0000 Subject: [PATCH 160/191] disableLogging: remove all handlers, reset all levels --- src/ocrd_utils/logging.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 
7f59221c8e..db7921c843 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -46,13 +46,6 @@ 'setOverrideLogLevel', ] -# These are the loggers we add handlers to -ROOT_OCRD_LOGGERS = [ - '', - 'ocrd', - 'ocrd_network' -] - LOGGING_DEFAULTS = { 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, @@ -196,24 +189,16 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): if _initialized_flag and not silent: print("[LOGGING] Disabling logging", file=sys.stderr) _initialized_flag = False - # logging.basicConfig(level=logging.CRITICAL) - # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) - for logger_name in LOGGING_DEFAULTS: - logging.getLogger(logger_name).setLevel(logging.NOTSET) + # remove all handlers we might have added (via initLogging on builtin or file config) + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Resetting {logger_name} log level and handlers') + logger = logging.getLogger(logger_name) + logger.setLevel(logging.NOTSET) + for handler in logger.handlers[:]: + logger.removeHandler(handler) + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) # Python default log level is WARNING logging.root.setLevel(logging.WARNING) -# Initializing stream handlers at module level -# would cause message output in all runtime contexts, -# including those which are already run for std output -# (--dump-json, --version, ocrd-tool, bashlib etc). -# So this needs to be an opt-in from the CLIs/decorators: -#initLogging() -# Also, we even have to block log output for libraries -# (like matplotlib/tensorflow) which set up logging -# themselves already: -disableLogging() From eb3120d77fab33ce2da91515dc452ffe438833e9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:42:52 +0000 Subject: [PATCH 161/191] setOverrideLogLevel: override all currently active loggers' level --- src/ocrd_utils/logging.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index db7921c843..98c2f58b2c 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -107,18 +107,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): lvl (string): Log level name. 
silent (boolean): Whether to log the override call """ - if not _initialized_flag: - initLogging(silent=silent) - ocrd_logger = logging.getLogger('ocrd') - - if lvl is None: - if not silent: - print('[LOGGING] Reset log level override', file=sys.stderr) - ocrd_logger.setLevel(logging.NOTSET) - else: - if not silent: - print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) - ocrd_logger.setLevel(lvl) + if lvl is not None: + lvl = getLevelName(lvl) + if not _initialized_flag: + initLogging(silent=silent) + # affect all configured loggers + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) + logging.getLogger(logger_name).setLevel(lvl) def get_logging_config_files(): """ From 0186c53795c0f32167a148172ea123906db79c41 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:43:40 +0000 Subject: [PATCH 162/191] logging: increase default root (not ocrd) level from INFO to WARNING --- src/ocrd_utils/logging.py | 1 + src/ocrd_utils/ocrd_logging.conf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 98c2f58b2c..ddb8b88b2a 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -47,6 +47,7 @@ ] LOGGING_DEFAULTS = { + '': logging.WARNING, 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, # 'ocrd.resolver': logging.INFO, diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 0af039b2ac..41e6d5af7a 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -34,7 +34,7 @@ keys=defaultFormatter,detailedFormatter # default logger "root" using consoleHandler # [logger_root] -level=INFO +level=WARNING handlers=consoleHandler,fileHandler From 5ba27209d396c44eb4d5e53f784a9fd42167a9ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:44:06 +0000 Subject: [PATCH 163/191] Processor: update max_workers docstring --- src/ocrd/processor/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 87e6731dfa..f0d453f4ac 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -158,12 +158,12 @@ class Processor(): max_workers : int = -1 """ - maximum number of processor threads for page-parallel processing (ignored if negative), + maximum number of processor forks for page-parallel processing (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e. whatever is smaller). (Override this if you know how many pages fit into processing units - GPU shaders / CPU cores - - at once, or if your class is not thread-safe.) + - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.) 
""" max_page_seconds : int = -1 From f17d5880c7dbd82e81389b5de39a705cabb5e0ee Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:36:17 +0000 Subject: [PATCH 164/191] initLogging: do not remove any previous handlers/levels --- src/ocrd_utils/logging.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 9c9ea73e01..f88d098036 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -163,18 +163,6 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L global _initialized_flag if _initialized_flag and not force_reinit: return - # disableLogging() - - # https://docs.python.org/3/library/logging.html#logging.disable - # If logging.disable(logging.NOTSET) is called, it effectively removes this - # overriding level, so that logging output again depends on the effective - # levels of individual loggers. - logging.disable(logging.NOTSET) - - # remove all handlers for the ocrd root loggers - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) config_file = None if not builtin_only: From d6c551ec0f55cf9211ef7a71c901d9ad616b090e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:38:44 +0000 Subject: [PATCH 165/191] initLogging: only add root handler instead of multiple redundant handlers with propagate=false --- src/ocrd_utils/logging.py | 7 ++----- src/ocrd_utils/ocrd_logging.conf | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index f88d098036..f01855aa0e 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -181,11 +181,8 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L ocrd_handler = logging.StreamHandler(stream=sys.stderr) ocrd_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT)) ocrd_handler.setLevel(logging.DEBUG) - for logger_name in ROOT_OCRD_LOGGERS: - logger = logging.getLogger(logger_name) - logger.addHandler(ocrd_handler) - if logger_name: - logger.propagate = False # avoid duplication (from root handler) + root_logger = logging.getLogger('') + root_logger.addHandler(ocrd_handler) for logger_name, logger_level in LOGGING_DEFAULTS.items(): logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 5cf161398e..0af039b2ac 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -56,22 +56,22 @@ handlers=consoleHandler,fileHandler # ocrd loggers [logger_ocrd] level=INFO -handlers=consoleHandler,fileHandler +handlers= qualname=ocrd -propagate=0 [logger_ocrd_network] level=INFO -handlers=consoleHandler,processingServerHandler +#handlers=consoleHandler,processingServerHandler +handlers=processingServerHandler qualname=ocrd_network -propagate=0 +#propagate=0 # # logger tensorflow # [logger_ocrd_tensorflow] level=ERROR -handlers=consoleHandler +handlers= qualname=tensorflow # @@ -79,7 +79,7 @@ qualname=tensorflow # [logger_ocrd_shapely_geos] level=ERROR -handlers=consoleHandler +handlers= qualname=shapely.geos @@ -88,7 +88,7 @@ qualname=shapely.geos # [logger_ocrd_PIL] level=INFO -handlers=consoleHandler +handlers= qualname=PIL # @@ -96,34 +96,32 @@ qualname=PIL # [logger_paramiko] level=INFO -handlers=consoleHandler 
+handlers= qualname=paramiko -propagate=0 [logger_paramiko_transport] level=INFO -handlers=consoleHandler +handlers= qualname=paramiko.transport -propagate=0 # # uvicorn loggers # [logger_uvicorn] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn [logger_uvicorn_access] level=WARN -handlers=consoleHandler +handlers= qualname=uvicorn.access [logger_uvicorn_error] level=INFO -handlers=consoleHandler +handlers= qualname=uvicorn.error [logger_multipart] level=INFO -handlers=consoleHandler +handlers= qualname=multipart From 8e87023adaec951e299ea141c63d253dffe707a6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:41:20 +0000 Subject: [PATCH 166/191] disableLogging: remove all handlers, reset all levels --- src/ocrd_utils/logging.py | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index f01855aa0e..80d98eb2d4 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -48,13 +48,6 @@ 'setOverrideLogLevel', ] -# These are the loggers we add handlers to -ROOT_OCRD_LOGGERS = [ - '', - 'ocrd', - 'ocrd_network' -] - LOGGING_DEFAULTS = { 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, @@ -198,24 +191,16 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): if _initialized_flag and not silent: print("[LOGGING] Disabling logging", file=sys.stderr) _initialized_flag = False - # logging.basicConfig(level=logging.CRITICAL) - # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: - for handler in logging.getLogger(logger_name).handlers[:]: - logging.getLogger(logger_name).removeHandler(handler) - for logger_name in LOGGING_DEFAULTS: - logging.getLogger(logger_name).setLevel(logging.NOTSET) + # remove all handlers we might have added (via initLogging on builtin or file config) + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Resetting {logger_name} log level and handlers') + logger = logging.getLogger(logger_name) + logger.setLevel(logging.NOTSET) + for handler in logger.handlers[:]: + logger.removeHandler(handler) + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) # Python default log level is WARNING logging.root.setLevel(logging.WARNING) -# Initializing stream handlers at module level -# would cause message output in all runtime contexts, -# including those which are already run for std output -# (--dump-json, --version, ocrd-tool, bashlib etc). -# So this needs to be an opt-in from the CLIs/decorators: -#initLogging() -# Also, we even have to block log output for libraries -# (like matplotlib/tensorflow) which set up logging -# themselves already: -disableLogging() From 4a2a0906a47a672184aefb7f46e9bbc45b177201 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:42:52 +0000 Subject: [PATCH 167/191] setOverrideLogLevel: override all currently active loggers' level --- src/ocrd_utils/logging.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 80d98eb2d4..a209a1b4a6 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -109,18 +109,15 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): lvl (string): Log level name. 
silent (boolean): Whether to log the override call """ - if not _initialized_flag: - initLogging(silent=silent) - ocrd_logger = logging.getLogger('ocrd') - - if lvl is None: - if not silent: - print('[LOGGING] Reset log level override', file=sys.stderr) - ocrd_logger.setLevel(logging.NOTSET) - else: - if not silent: - print(f'[LOGGING] Overriding ocrd log level to {lvl}', file=sys.stderr) - ocrd_logger.setLevel(lvl) + if lvl is not None: + lvl = getLevelName(lvl) + if not _initialized_flag: + initLogging(silent=silent) + # affect all configured loggers + for logger_name in logging.root.manager.loggerDict: + if not silent: + print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) + logging.getLogger(logger_name).setLevel(lvl) def get_logging_config_files(): """ From 7e8705e30ad8ad364e08cb5b8847103adc2a4fbc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:43:40 +0000 Subject: [PATCH 168/191] logging: increase default root (not ocrd) level from INFO to WARNING --- src/ocrd_utils/logging.py | 1 + src/ocrd_utils/ocrd_logging.conf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index a209a1b4a6..c56c1401f4 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -49,6 +49,7 @@ ] LOGGING_DEFAULTS = { + '': logging.WARNING, 'ocrd': logging.INFO, 'ocrd_network': logging.INFO, # 'ocrd.resolver': logging.INFO, diff --git a/src/ocrd_utils/ocrd_logging.conf b/src/ocrd_utils/ocrd_logging.conf index 0af039b2ac..41e6d5af7a 100644 --- a/src/ocrd_utils/ocrd_logging.conf +++ b/src/ocrd_utils/ocrd_logging.conf @@ -34,7 +34,7 @@ keys=defaultFormatter,detailedFormatter # default logger "root" using consoleHandler # [logger_root] -level=INFO +level=WARNING handlers=consoleHandler,fileHandler From edaab4e4483c32381231893a148e2186b8585332 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 7 Nov 2024 18:27:25 +0000 Subject: [PATCH 169/191] tests: prevent side effects from ocrd_logging --- tests/base.py | 2 -- tests/cli/test_log.py | 11 +++++-- tests/processor/test_processor.py | 21 ++++++------- tests/test_decorators.py | 17 +++++------ tests/test_logging.py | 6 ++++ tests/test_logging_conf.py | 49 +++++++++++++------------------ tests/test_mets_server.py | 28 ++++++++++++------ 7 files changed, 72 insertions(+), 62 deletions(-) diff --git a/tests/base.py b/tests/base.py index 53f393e08d..9eb1f20db8 100644 --- a/tests/base.py +++ b/tests/base.py @@ -26,8 +26,6 @@ class TestCase(VanillaTestCase): def setUp(self): chdir(dirname(realpath(__file__)) + '/..') - disableLogging() - initLogging(builtin_only=True) class CapturingTestCase(TestCase): """ diff --git a/tests/cli/test_log.py b/tests/cli/test_log.py index c63d78c318..3d81e8266b 100644 --- a/tests/cli/test_log.py +++ b/tests/cli/test_log.py @@ -6,8 +6,8 @@ from tests.base import CapturingTestCase as TestCase, main, assets, copy_of_directory from ocrd.decorators import ocrd_loglevel -from ocrd_utils import setOverrideLogLevel, logging, disableLogging -import logging as python_logging +from ocrd_utils import disableLogging, initLogging +import logging @click.group() @ocrd_loglevel @@ -18,14 +18,19 @@ def mock_ocrd_cli(log_level): class TestLogCli(TestCase): def _get_log_output(self, *args): - disableLogging() code, out, err = self.invoke_cli(mock_ocrd_cli, args) print({'code': code, 'out': out, 'err': err}) return err + def setUp(self): + super().setUp() + initLogging() + def tearDown(self): if 'OCRD_TOOL_NAME' in ENV: 
del(ENV['OCRD_TOOL_NAME']) + super().tearDown() + disableLogging() def test_loglevel(self): assert 'DEBUG ocrd.log_cli - foo' not in self._get_log_output('log', 'debug', 'foo') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index f2261d0ffb..8fed914802 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -17,21 +17,21 @@ class TestProcessor(TestCase): + def run(self, result=None): + with copy_of_directory(assets.path_to('SBB0000F29300010000/data')) as workdir: + with pushd_popd(workdir): + self.resolver = Resolver() + self.workspace = self.resolver.workspace_from_url('mets.xml') + super().run(result=result) + def setUp(self): super().setUp() - # make sure we get an isolated temporary copy of the testdata each time - # as long as we are not using pytest but unittest, we need to manage contexts - # (enterContext is only supported starting with py311) - with ExitStack() as stack: - self.resolver = Resolver() - self.workdir = stack.enter_context(copy_of_directory(assets.path_to('SBB0000F29300010000/data'))) - stack.enter_context(pushd_popd(self.workdir)) - self.workspace = self.resolver.workspace_from_url('mets.xml') - self.addCleanup(stack.pop_all().close) + initLogging() def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_incomplete_processor(self): proc = IncompleteProcessor(None) @@ -251,6 +251,7 @@ class ZipTestProcessor(Processor): pass def test_run_output_metsserver(start_mets_server): mets_server_url, ws = start_mets_server + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 0 run_processor(DummyProcessorWithOutput, workspace=ws, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-OUT", @@ -269,7 +270,7 @@ def test_run_output_metsserver(start_mets_server): output_file_grp="OCR-D-OUT", mets_server_url=mets_server_url) assert "already exists" in str(exc.value) - + config.reset_defaults() if __name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index c36577020a..561fdc762d 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -41,22 +41,20 @@ def cli_dummy_processor(*args, **kwargs): class TestDecorators(TestCase): - def setUp(self): - super().setUp() - disableLogging() - def tearDown(self): super().tearDown() config.reset_defaults() + disableLogging() def test_minimal(self): - exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) - print(out, err) - assert not exit_code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) + assert not code, (out, err) def test_loglevel_invalid(self): - code, _, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) - assert code + initLogging() + code, out, err = self.invoke_cli(cli_with_ocrd_loglevel, ['--log-level', 'foo']) + assert code, (out, err) import click if int(click.__version__[0]) < 8: assert 'invalid choice: foo' in err @@ -67,7 +65,6 @@ def test_loglevel_override(self): if get_logging_config_files(): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging - disableLogging() assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() diff --git a/tests/test_logging.py b/tests/test_logging.py index 2e4e0861b5..bd6aa7e7b2 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -26,16 +26,22 @@ class 
TestLogging(TestCase): def setUp(self): pass # do not chdir + def tearDown(self): + super().tearDown() + disableLogging() + def test_loglevel_inheritance(self): initLogging(builtin_only=True) ocrd_logger = logging.getLogger('ocrd') assert ocrd_logger.getEffectiveLevel() == logging.INFO some_logger = getLogger('ocrd.foo') + assert some_logger.level == logging.NOTSET assert some_logger.getEffectiveLevel() == logging.INFO setOverrideLogLevel('ERROR') assert ocrd_logger.getEffectiveLevel() == logging.ERROR assert some_logger.getEffectiveLevel() == logging.ERROR another_logger = getLogger('ocrd.bar') + assert another_logger.level == logging.NOTSET assert another_logger.getEffectiveLevel() == logging.ERROR def test_getLevelName(self): diff --git a/tests/test_logging_conf.py b/tests/test_logging_conf.py index f8e0e9e894..0717674103 100644 --- a/tests/test_logging_conf.py +++ b/tests/test_logging_conf.py @@ -21,74 +21,67 @@ # sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../ocrd') TEST_ROOT = pathlib.Path(os.path.dirname(os.path.abspath(__file__))).parent -def resetLogging(): - disableLogging() - initLogging() - - @pytest.fixture(name="logging_conf") -def _fixture_logging_conf(tmpdir): +def _fixture_logging_conf(tmpdir, capfd): path_logging_conf_orig = os.path.join( str(TEST_ROOT), 'src', 'ocrd_utils', 'ocrd_logging.conf') path_logging_conf_dest = os.path.join(str(tmpdir), 'ocrd_logging.conf') shutil.copy(path_logging_conf_orig, path_logging_conf_dest) - return str(tmpdir) + with pushd_popd(tmpdir): + with capfd.disabled(): + initLogging() + yield str(tmpdir) + disableLogging() -def test_configured_dateformat(logging_conf, capsys): +def test_configured_dateformat(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and produces desired record format""" # arrange - with pushd_popd(logging_conf): - resetLogging() - test_logger = getLogger('') + test_logger = getLogger('ocrd') - # act - test_logger.info("test logger initialized") + # act + test_logger.info("test logger initialized") - log_info_output = capsys.readouterr().err - must_not_match = r"^\d{4}-\d{2}-\d{2}.*" - assert not re.match(must_not_match, log_info_output) - match_pattern = r"^\d{2}:\d{2}:\d{2}.*" - assert re.match(match_pattern, log_info_output) + log_info_output = capfd.readouterr().err + must_not_match = r"^\d{4}-\d{2}-\d{2}.*" + assert not re.match(must_not_match, log_info_output) + match_pattern = r"^\d{2}:\d{2}:\d{2}.*" + assert re.match(match_pattern, log_info_output), log_info_output -def test_configured_tensorflow_logger_present(logging_conf, capsys): +def test_configured_tensorflow_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger tensorflow""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('tensorflow') # act info logger_under_test.info("tensorflow logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("tensorflow has error") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output -def test_configured_shapely_logger_present(logging_conf, capsys): +def test_configured_shapely_logger_present(logging_conf, capfd): """Ensure example ocrd_logging.conf is valid and contains logger shapely.geos""" # arrange - os.chdir(logging_conf) - resetLogging() logger_under_test = getLogger('shapely.geos') # act info logger_under_test.info("shapely.geos 
logger initialized") - log_info_output = capsys.readouterr().err + log_info_output = capfd.readouterr().err assert not log_info_output # act error logger_under_test.error("shapely alert") - log_error_output = capsys.readouterr().err + log_error_output = capfd.readouterr().err assert log_error_output if __name__ == '__main__': diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index dc94d6c560..3bb96535c0 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,20 +22,17 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel, disableLogging, getLogger TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] -initLogging() -setOverrideLogLevel(10) - @fixture(scope='function', name='start_mets_server', params=TRANSPORTS) def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: - tmpdir = str(tmpdir) - def _start_mets_server(*args, **kwargs): - mets_server = OcrdMetsServer(*args, **kwargs) - mets_server.startup() + initLogging() + #setOverrideLogLevel(10) + logger = getLogger('ocrd') + tmpdir = str(tmpdir) mets_server_url = request.param if mets_server_url == TRANSPORTS[0]: @@ -47,13 +44,26 @@ def _start_mets_server(*args, **kwargs): copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) workspace = Workspace(Resolver(), tmpdir) - p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) + class MetsServerProcess(Process): + def __init__(self, *args, **kwargs): + self.server = OcrdMetsServer(*args, **kwargs) + super().__init__() + def run(self): + self.server.startup() + def terminate(self): + self.server.workspace.save_mets() + super().terminate() + p = MetsServerProcess(workspace=workspace, url=request.param) p.start() + logger.info("started METS Server") sleep(1) # sleep to start up server workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) yield mets_server_url, workspace_server p.terminate() + p.join() + logger.info("terminated METS Server") rmtree(tmpdir, ignore_errors=True) + disableLogging() def add_file_server(x, force=False): mets_server_url, directory, i = x From 192895afd5c38314e9a7de59b866b8bfedf1f523 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 13:34:10 +0000 Subject: [PATCH 170/191] initLogging: call disableLogging if already initialized and force_reinit --- src/ocrd_utils/logging.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index c56c1401f4..addd1cfcac 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -152,8 +152,11 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L - silent (bool, True): Whether to log logging behavior by printing to stderr """ global _initialized_flag - if _initialized_flag and not force_reinit: - return + if _initialized_flag: + if force_reinit: + disableLogging(silent=silent) + else: + return config_file = None if not builtin_only: From f8f71d809207f3bf1fc94dbdb9525272c13cd286 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 13:34:10 +0000 Subject: [PATCH 171/191] initLogging: call disableLogging if already initialized and force_reinit --- src/ocrd_utils/logging.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 
deletions(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index ddb8b88b2a..52b01883f1 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -150,8 +150,11 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L - silent (bool): Whether to log logging behavior by printing to stderr """ global _initialized_flag - if _initialized_flag and not force_reinit: - return + if _initialized_flag: + if force_reinit: + disableLogging(silent=silent) + else: + return config_file = None if not builtin_only: From 5f2f602f5917d2f0970ff0fc15d64b148083b98b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 16:02:44 +0000 Subject: [PATCH 172/191] Processor: replace weakref with __del__ to trigger shutdown --- src/ocrd/processor/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index f0d453f4ac..7ec77162ee 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -21,7 +21,6 @@ import inspect import tarfile import io -import weakref from collections import defaultdict from frozendict import frozendict # concurrent.futures is buggy in py38, @@ -366,12 +365,14 @@ def __init__( self._base_logger = getLogger('ocrd.processor.base') if parameter is not None: self.parameter = parameter - # ensure that shutdown gets called at destruction - self._finalizer = weakref.finalize(self, self.shutdown) # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): setattr(self, 'process', deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) + def __del__(self): + self._base_logger.debug("shutting down") + self.shutdown() + def show_help(self, subcommand=None): """ Print a usage description including the standard CLI and all of this processor's ocrd-tool From 0446b82be55093536c5c0818de3b49d0aecc727a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 11 Nov 2024 23:23:15 +0000 Subject: [PATCH 173/191] Processor parallel pages: log via QueueHandler in subprocess, QueueListener in main --- repo/spec | 2 +- src/ocrd/processor/base.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/repo/spec b/repo/spec index df2a07e3fd..506b33936d 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit df2a07e3fda634b2eda5785afe67399b61a81173 +Subproject commit 506b33936d89080a683fa8a26837f2a23b23e5e2 diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 7ec77162ee..d6348b40e1 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -18,6 +18,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, get_args import sys +import logging +import logging.handlers import inspect import tarfile import io @@ -515,22 +517,31 @@ def process_workspace(self, workspace: Workspace) -> None: if max_workers > 1: executor_cls = ProcessPoolExecutor + log_queue = mp.Queue() + # forward messages from log queue (in subprocesses) to all root handlers + log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True) else: executor_cls = DummyExecutor + log_queue = None + log_listener = None executor = executor_cls( max_workers=max_workers or 1, # only forking method avoids pickling context=mp.get_context('fork'), # share processor instance as global to avoid pickling 
initializer=_page_worker_set_ctxt, - initargs=(self,), + initargs=(self, log_queue), ) + if max_workers > 1: + log_listener.start() try: self._base_logger.debug("started executor %s with %d workers", str(executor), max_workers or 1) tasks = self.process_workspace_submit_tasks(executor, max_seconds) stats = self.process_workspace_handle_tasks(tasks) finally: executor.shutdown(kill_workers=True, wait=False) + if max_workers > 1: + log_listener.stop() except NotImplementedError: # fall back to deprecated method @@ -1110,13 +1121,16 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): objects, and with the METS Server we do not mutate the local processor instance anyway. """ -def _page_worker_set_ctxt(processor): +def _page_worker_set_ctxt(processor, log_queue): """ Overwrites `ocrd.processor.base._page_worker_processor` instance for sharing with subprocesses in ProcessPoolExecutor initializer. """ global _page_worker_processor _page_worker_processor = processor + if log_queue: + # replace all log handlers with just one queue handler + logging.root.handlers = [logging.handlers.QueueHandler(log_queue)] def _page_worker(timeout, *input_files): """ From 53c4c18240684936d2cd4e87051b5bbcc57f9cb2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Nov 2024 00:46:38 +0000 Subject: [PATCH 174/191] :package: v3.0.0b7 --- CHANGELOG.md | 13 +++++++++++++ VERSION | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da422654bc..04ea2d42a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [3.0.0b7] - 2024-11-12 + +Fixed: + - `initLogging`: only add root handler instead of multiple redundant handlers with `propagate=false` + - `setOverrideLogLevel`: override all currently active loggers' level + +Changed: + - :fire: logging: increase default root (not `ocrd`) level from `INFO` to `WARNING` + - :fire: `initLogging`: do not remove any previous handlers/levels, unless `force_reinit` + - :fire: `disableLogging`: remove all handlers, reset all levels - instead of being selective + - :fire: Processor: replace `weakref` with `__del__` to trigger `shutdown` + - :fire: `OCRD_MAX_PARALLEL_PAGES>1`: log via `QueueHandler` in subprocess, `QueueListener` in main + ## [3.0.0b6] - 2024-10-30 Fixed: diff --git a/VERSION b/VERSION index 43662e8c29..1129dfd443 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.0.0b6 +3.0.0b7 From 18c0c857d1d62aab12cdbc534df6b187a7d5e589 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 18 Nov 2024 14:39:05 +0100 Subject: [PATCH 175/191] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b90a57a24..890822b0b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
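As a rough standalone illustration of the QueueHandler/QueueListener logging pattern referred to in the 3.0.0b7 notes above (a minimal sketch using only the standard library; the function and logger names are placeholders, not part of the patch):

import logging
import logging.handlers
import multiprocessing as mp

def worker(log_queue):
    # inside the forked subprocess: replace all handlers with a single QueueHandler,
    # so every record is shipped back to the parent instead of being emitted here
    logging.root.handlers = [logging.handlers.QueueHandler(log_queue)]
    logging.getLogger("ocrd.worker").info("page processed")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    log_queue = mp.Queue()
    # in the main process: forward queued records to the real root handlers
    listener = logging.handlers.QueueListener(
        log_queue, *logging.root.handlers, respect_handler_level=True)
    listener.start()
    proc = mp.get_context("fork").Process(target=worker, args=(log_queue,))
    proc.start()
    proc.join()
    listener.stop()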
## Unreleased +Fixed: + + * Typo in processing_worker log message, #1293 + * Call `initLogging` at the right time in `ocrd_network`, #1292 + ## [2.70.0] - 2024-10-10 Added: From 6258b3abfa798671ab5a4400003ab3832866452d Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 18 Nov 2024 14:39:58 +0100 Subject: [PATCH 176/191] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 890822b0b7..27073910d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Fixed: * Typo in processing_worker log message, #1293 * Call `initLogging` at the right time in `ocrd_network`, #1292 + * `make docs` fixed with absolute path to location, #1273 ## [2.70.0] - 2024-10-10 From 583fdd1a6e827dccf797e8267785ef478b68fe2d Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 18 Nov 2024 14:48:54 +0100 Subject: [PATCH 177/191] :memo: changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27073910d6..80cc8887c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + + * Rewrite `ocrd_utils.logging`, #1288 + * Handle only `''` as the root logger + * `disableLogging`: Remove handlers from root and all configured loggers + * Do not do any module-level modification of the log config + Fixed: * Typo in processing_worker log message, #1293 From c0f6116757ef811df8635b0397c45c4d91df9296 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 20 Nov 2024 13:18:35 +0100 Subject: [PATCH 178/191] :package: v2.71.0 --- CHANGELOG.md | 3 +++ VERSION | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80cc8887c7..c777d55c17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +## [2.71.0] - 2024-11-20 + Changed: * Rewrite `ocrd_utils.logging`, #1288 @@ -2229,6 +2231,7 @@ Fixed Initial Release +[2.71.0]: ../../compare/v2.71.0..v2.70.0 [2.70.0]: ../../compare/v2.70.0..v2.69.0 [2.69.0]: ../../compare/v2.69.0..v2.68.0 [2.68.0]: ../../compare/v2.68.0..v2.67.2 diff --git a/VERSION b/VERSION index 38a7743781..6ab6ec3319 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.70.0 +2.71.0 From 44923673ef66183b368ff9d722845a5930cc996d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 20 Nov 2024 20:09:52 +0100 Subject: [PATCH 179/191] ocrd_cli_wrap_processor: always do initLogging --- src/ocrd/decorators/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index bc969b3279..67e5d0fdf5 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -47,6 +47,9 @@ def ocrd_cli_wrap_processor( # ocrd_network params end # **kwargs ): + # init logging handlers so no imported libs can preempt ours + initLogging() + if not sys.argv[1:]: processorClass(None, show_help=True) sys.exit(1) @@ -67,8 +70,6 @@ def ocrd_cli_wrap_processor( # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - initLogging() - LOG = getLogger('ocrd.cli_wrap_processor') assert kwargs['input_file_grp'] is not None assert kwargs['output_file_grp'] is not None From db21d754e2561664deeb68da26f98307b8e67382 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 6 Jan 2025 16:12:59 +0100 Subject: [PATCH 180/191] ocrd_cli_wrap_processor: always do initLogging --- src/ocrd/decorators/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index f659bf58a0..6e0ceb1f1c 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -48,6 +48,9 @@ def ocrd_cli_wrap_processor( # ocrd_network params end # **kwargs ): + # init logging handlers so no imported libs can preempt ours + initLogging() + # FIXME: remove workspace arg entirely processor = processorClass(None) if not sys.argv[1:]: @@ -89,8 +92,6 @@ def ocrd_cli_wrap_processor( # Used for checking/starting network agents for the WebAPI architecture check_and_run_network_agent(processorClass, subcommand, address, database, queue) - # from here: single-run processing context - initLogging() if 'parameter' in kwargs: # Disambiguate parameter file/literal, and resolve file def resolve(name): From f5f1a3b1e2e7cbc16a76dd2795ee35e4efe59413 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 6 Jan 2025 16:16:24 +0100 Subject: [PATCH 181/191] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c777d55c17..2acca969df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * Do `initLogging` before calling processors in `ocrd_cli_wrap_processor`, #1232, #1296 + ## [2.71.0] - 2024-11-20 Changed: From 71c0c19551d28093422c9af8f8fdcfaf0909a626 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 6 Jan 2025 16:17:35 +0100 Subject: [PATCH 182/191] :package: v2.71.1 --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2acca969df..4ab60e3232 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.71.1] - 2025-01-06 + Changed: * Do `initLogging` before calling processors in `ocrd_cli_wrap_processor`, #1232, #1296 @@ -2235,6 +2237,7 @@ Fixed Initial Release +[2.71.1]: ../../compare/v2.71.1..v2.71.0 [2.71.0]: ../../compare/v2.71.0..v2.70.0 [2.70.0]: ../../compare/v2.70.0..v2.69.0 [2.69.0]: ../../compare/v2.69.0..v2.68.0 From 6e048e113aeffd6e5ef990333d8a373b10a153a9 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 8 Jan 2025 14:52:28 +0100 Subject: [PATCH 183/191] fix help output for multi-line config option descriptions --- src/ocrd/cli/__init__.py | 4 ++-- src/ocrd_utils/config.py | 38 +++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 9e8a37b8bf..2af14ce632 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -12,11 +12,11 @@ # pylint: disable=wrong-import-position -def command_with_replaced_help(*replacements): +def command_with_replaced_help(*replacements: tuple[str, str]): class CommandWithReplacedHelp(click.Command): def get_help(self, ctx): - newhelp = super().get_help(ctx) + newhelp : str = super().get_help(ctx) for replacement in replacements: newhelp = re.sub(*replacement, newhelp) # print(newhelp) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 36399870e2..16c9eb02e8 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -21,7 +21,7 @@ def _parser_boolean(val): class OcrdEnvVariable(): - def __init__(self, name, description, parser=str, validator=lambda val: True, default=[False, None]): + def __init__(self, name, description, parser=str, validator=lambda _: True, default=[False, None]): """ An environment variable for use in OCR-D. @@ -47,10 +47,19 @@ def __str__(self): return f'{self.name}: {self.description}' def describe(self, wrap_text=True, indent_text=True): + """ + Output help information on a config option. + + If ``option.description`` is a multiline string with complex formatting + (e.g. markdown lists), replace empty lines with ``\b`` and set + ``wrap_text`` to ``False``. 
+ """ desc = self.description if self.has_default: default = self.default() if callable(self.default) else self.default - desc += f' (Default: "{default}")' + if not desc.endswith('\n'): + desc += ' ' + desc += f'(Default: "{default}")' ret = '' ret = f'{self.name}\n' if wrap_text: @@ -146,11 +155,11 @@ def raw_value(self, name): description="""\ Whether to enable gathering runtime statistics on the `ocrd.profile` logger (comma-separated): - +\b - `CPU`: yields CPU and wall-time, - `RSS`: also yields peak memory (resident set size) - `PSS`: also yields peak memory (proportional set size) - +\b """, validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')), default=(True, '')) @@ -183,11 +192,12 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_MISSING_INPUT", description="""\ -How to deal with missing input files (for some fileGrp/pageId) during processing: - +How to deal with missing input files +(for some fileGrp/pageId) during processing: +\b - `SKIP`: ignore and proceed with next page's input - `ABORT`: throw :py:class:`.MissingInputFile` - +\b """, default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'ABORT'], @@ -195,12 +205,13 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_MISSING_OUTPUT", description="""\ -How to deal with missing output files (for some fileGrp/pageId) during processing: - +How to deal with missing output files +(for some fileGrp/pageId) during processing: +\b - `SKIP`: ignore and proceed processing next page - `COPY`: fall back to copying input PAGE to output fileGrp for page - `ABORT`: re-throw whatever caused processing to fail - +\b """, default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], @@ -213,12 +224,13 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_EXISTING_OUTPUT", description="""\ -How to deal with already existing output files (for some fileGrp/pageId) during processing: - +How to deal with already existing output files +(for some fileGrp/pageId) during processing: +\b - `SKIP`: ignore and proceed processing next page - `OVERWRITE`: force writing result to output fileGrp for page - `ABORT`: re-throw :py:class:`FileExistsError` - +\b """, default=(True, 'SKIP'), validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], From 3eea1739964b30ae2a6624372e79ef99be13f8a2 Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 8 Jan 2025 19:07:14 +0100 Subject: [PATCH 184/191] merge master --- .github/workflows/publish-pypi.yml | 31 ++++++ CHANGELOG.md | 73 ++++++++++++ Dockerfile.cuda-torch | 2 - Makefile | 8 +- .../ocrd_network.client_utils.rst | 7 ++ docs/api/ocrd_network/ocrd_network.rst | 1 + docs/conf.py | 3 +- repo/assets | 2 +- src/ocrd/cli/__init__.py | 2 + src/ocrd/mets_server.py | 104 +++++++++++------- src/ocrd/resource_manager.py | 2 + src/ocrd_models/ocrd_exif.py | 4 +- src/ocrd_network/cli/client.py | 35 ++++-- src/ocrd_network/client.py | 15 ++- src/ocrd_network/client_utils.py | 39 ++++--- src/ocrd_network/processing_server.py | 42 ++++--- src/ocrd_network/processing_worker.py | 11 +- src/ocrd_network/processor_server.py | 3 +- src/ocrd_network/rabbitmq_utils/connector.py | 4 +- src/ocrd_network/runtime_data/deployer.py | 46 +++++--- src/ocrd_network/server_cache.py | 49 +++++---- src/ocrd_network/server_utils.py | 44 +++++++- src/ocrd_network/tcp_to_uds_mets_proxy.py | 13 ++- src/ocrd_network/utils.py | 34 +++--- src/ocrd_utils/config.py | 16 ++- tests/model/test_exif.py | 8 +- tests/network/config.py | 12 +- .../network/test_modules_mets_server_proxy.py 
| 2 +- tests/test_resolver.py | 2 +- tests/test_resource_manager.py | 2 +- 30 files changed, 440 insertions(+), 176 deletions(-) create mode 100644 .github/workflows/publish-pypi.yml create mode 100644 docs/api/ocrd_network/ocrd_network.client_utils.rst diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml new file mode 100644 index 0000000000..e811c958ab --- /dev/null +++ b/.github/workflows/publish-pypi.yml @@ -0,0 +1,31 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Upload Python Package + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel build twine + pip install -r requirements.txt + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: make pypi diff --git a/CHANGELOG.md b/CHANGELOG.md index 04ea2d42a1..7f7a0eb2e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -163,6 +163,73 @@ Added: - `Processor.verify`: handle fileGrp cardinality verification, with default implementation - `Processor.setup`: to set up processor before processing, optional +## [2.71.0] - 2024-11-20 + +Changed: + + * Rewrite `ocrd_utils.logging`, #1288 + * Handle only `''` as the root logger + * `disableLogging`: Remove handlers from root and all configured loggers + * Do not do any module-level modification of the log config + +Fixed: + + * Typo in processing_worker log message, #1293 + * Call `initLogging` at the right time in `ocrd_network`, #1292 + * `make docs` fixed with absolute path to location, #1273 + +## [2.70.0] - 2024-10-10 + +Added: + + - `ocrd network client workflow run`: Add `--print-status` flag to periodically print the job status, #1277 + - Processing Server: `DELETE /mets_server_zombies` to kill any renegade METS servers, #1277 + - No more zombie METS Server by properly shutting them down, #1284 + - `OCRD_NETWORK_RABBITMQ_HEARBEAT` to allow overriding the [heartbeat](https://pika.readthedocs.io/en/stable/examples/heartbeat_and_blocked_timeouts.html) behavior of RabbitMQ, #1285 + +Changed: + + - significantly more detailed logging for the METS Server and Processing Server, #1284 + - Only import `ocrd_network` in src/ocrd/decorators/__init__.py once needed, #1289 + - Automate release via GitHub Actions, #1290 + +Fixed: + + - `ocrd/core-cuda-torch`: Install torchvision as well, #1286 + - Processing Server: remove shut down METS servers from deployer's cache, #1287 + - typos, #1274 + +## [2.69.0] - 2024-09-30 + +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.workspace`: make `list-page` work w/ METS Server + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `lib.bash`: fix `errexit` handling + - actually apply CLI `--log-filename`, and show in `--help` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `Workspace.reload_mets`: handle 
ClientSideOcrdMets as well + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + - `disableLogging`: also re-instate root logger to Python defaults + - `OcrdExif`: handle multi-frame TIFFs gracefully in `identify` callout, #1276 + +Changed: + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + - `ClientSideOcrdMets`: use same logger name prefix as METS Server + - `Processor.zip_input_files`: when `--page-id` yields empty list, just log instead of raise + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - METS Server: export and delegate `physical_pages` + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - processor CLI: delegate `--resolve-resource`, too + - `OcrdConfig.reset_defaults` to reset config variables to their defaults + - `ocrd_utils.scale_coordinates` for resizing images + ## [2.68.0] - 2024-08-23 Changed: @@ -2322,6 +2389,7 @@ Fixed Initial Release +<<<<<<< HEAD [3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5 [3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4 [3.0.0b4]: ../../compare/v3.0.0b4..v3.0.0b3 @@ -2330,6 +2398,11 @@ Initial Release [3.0.0b1]: ../../compare/v3.0.0b1..v3.0.0a2 [3.0.0a2]: ../../compare/v3.0.0a2..v3.0.0a1 [3.0.0a1]: ../../compare/v3.0.0a1..v2.67.2 +======= +[2.71.0]: ../../compare/v2.71.0..v2.70.0 +[2.70.0]: ../../compare/v2.70.0..v2.69.0 +[2.69.0]: ../../compare/v2.69.0..v2.68.0 +>>>>>>> master [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 diff --git a/Dockerfile.cuda-torch b/Dockerfile.cuda-torch index 8d6c3aa624..59ce1144be 100644 --- a/Dockerfile.cuda-torch +++ b/Dockerfile.cuda-torch @@ -9,7 +9,5 @@ RUN make deps-torch WORKDIR /data -RUN rm -fr /build - CMD ["/usr/local/bin/ocrd", "--help"] diff --git a/Makefile b/Makefile index 1a4a6bbdb8..bb51269558 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,7 @@ deps-cuda: CONDA_EXE ?= /usr/local/bin/conda deps-cuda: export CONDA_PREFIX ?= /conda deps-cuda: PYTHON_PREFIX != $(PYTHON) -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])' deps-cuda: - curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba + curl --retry 6 -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba mv bin/micromamba $(CONDA_EXE) # Install Conda system-wide (for interactive / login shells) echo 'export MAMBA_EXE=$(CONDA_EXE) MAMBA_ROOT_PREFIX=$(CONDA_PREFIX) CONDA_PREFIX=$(CONDA_PREFIX) PATH=$(CONDA_PREFIX)/bin:$$PATH' >> /etc/profile.d/98-conda.sh @@ -97,7 +97,7 @@ deps-cuda: # works, too: shopt -s nullglob; \ $(PIP) install nvidia-pyindex \ - && $(PIP) install nvidia-cudnn-cu11==8.7.* \ + && $(PIP) install nvidia-cudnn-cu11~=8.7 \ nvidia-cublas-cu11~=11.11 \ nvidia-cusparse-cu11~=11.7 \ nvidia-cusolver-cu11~=11.4 \ @@ -158,7 +158,7 @@ deps-tf2: fi deps-torch: - $(PIP) install -i https://download.pytorch.org/whl/cu118 torch + $(PIP) install -i https://download.pytorch.org/whl/cu118 torchvision==0.16.2+cu118 torch==2.1.2+cu118 # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: @@ -178,7 +178,7 @@ build: # (Re)install the tool install: #build - # not stricttly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 + # not strictly necessary but a precaution against outdated python build tools, https://github.com/OCR-D/core/pull/1166 $(PIP) 
install -U pip wheel $(PIP_INSTALL) . $(PIP_INSTALL_CONFIG_OPTION) @# workaround for shapely#1598 diff --git a/docs/api/ocrd_network/ocrd_network.client_utils.rst b/docs/api/ocrd_network/ocrd_network.client_utils.rst new file mode 100644 index 0000000000..973e27cdb5 --- /dev/null +++ b/docs/api/ocrd_network/ocrd_network.client_utils.rst @@ -0,0 +1,7 @@ +ocrd\_network.client\_utils module +================================== + +.. automodule:: ocrd_network.client_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/ocrd_network/ocrd_network.rst b/docs/api/ocrd_network/ocrd_network.rst index 4497702751..d61da39313 100644 --- a/docs/api/ocrd_network/ocrd_network.rst +++ b/docs/api/ocrd_network/ocrd_network.rst @@ -24,6 +24,7 @@ Submodules :maxdepth: 4 ocrd_network.client + ocrd_network.client_utils ocrd_network.constants ocrd_network.database ocrd_network.logging_utils diff --git a/docs/conf.py b/docs/conf.py index 917c5c62ca..939277ad5f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,8 @@ # import os # import sys # # sys.path.insert(0, os.path.abspath('..')) -with open('../VERSION', encoding='utf-8') as f: +from pathlib import Path +with open(Path(__file__).parent.parent / 'VERSION', encoding='utf-8') as f: VERSION = f.read() diff --git a/repo/assets b/repo/assets index 05568aaa2d..ca108faf0e 160000 --- a/repo/assets +++ b/repo/assets @@ -1 +1 @@ -Subproject commit 05568aaa2dc20678bf87ffec77f3baf2924d7c24 +Subproject commit ca108faf0e95cc823a9e84cd0a1602282ae006b1 diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py index 9e8a37b8bf..667bddc7c5 100644 --- a/src/ocrd/cli/__init__.py +++ b/src/ocrd/cli/__init__.py @@ -83,6 +83,8 @@ def get_help(self, ctx): \b {config.describe('OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS')} \b +{config.describe('OCRD_NETWORK_RABBITMQ_HEARTBEAT')} +\b {config.describe('OCRD_PROFILE_FILE')} \b {config.describe('OCRD_PROFILE', wrap_text=False)} diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 101727e064..e0f0029570 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -1,8 +1,10 @@ """ # METS server functionality """ +import os import re from os import _exit, chmod +import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path @@ -155,13 +157,13 @@ def save(self): Request writing the changes to the file system """ if not self.multiplexing_mode: - self.session.request("PUT", url=self.url) + return self.session.request("PUT", url=self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.save(self.ws_dir_path) - ) + ).json()["text"] def stop(self): """ @@ -169,14 +171,13 @@ def stop(self): """ try: if not self.multiplexing_mode: - self.session.request("DELETE", self.url) - return + return self.session.request("DELETE", self.url).text else: - self.session.request( + return self.session.request( "POST", self.url, json=MpxReq.stop(self.ws_dir_path) - ) + ).json()["text"] except ConnectionError: # Expected because we exit the process without returning pass @@ -323,7 +324,7 @@ def add_file( class MpxReq: - """This class wrapps the request bodies needed for the tcp forwarding + """This class wraps the request bodies needed for the tcp forwarding For every mets-server-call like find_files or workspace_path a special request_body is needed to call `MetsServerProxy.forward_tcp_request`. These are created by this functions. 
@@ -346,12 +347,12 @@ def __args_wrapper( @staticmethod def save(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="PUT", response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="PUT", response_type="text", request_url="", request_data={}) @staticmethod def stop(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( - ws_dir_path, method_type="DELETE", response_type="empty", request_url="", request_data={}) + ws_dir_path, method_type="DELETE", response_type="text", request_url="", request_data={}) @staticmethod def reload(ws_dir_path: str) -> Dict: @@ -428,18 +429,24 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int @staticmethod def kill_process(mets_server_pid: int): - subprocess_run(args=["kill", "-s", "SIGINT", f"{mets_server_pid}"], shell=False, universal_newlines=True) + os.kill(mets_server_pid, signal.SIGINT) + sleep(3) + try: + os.kill(mets_server_pid, signal.SIGKILL) + except ProcessLookupError as e: + pass def shutdown(self): + pid = os.getpid() + self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") + os.kill(pid, signal.SIGTERM) if self.is_uds: if Path(self.url).exists(): - self.log.debug(f'UDS socket {self.url} still exists, removing it') + self.log.warning(f"Due to a server shutdown, removing the existing UDS socket file: {self.url}") Path(self.url).unlink() - # os._exit because uvicorn catches SystemExit raised by sys.exit - _exit(0) def startup(self): - self.log.info("Starting up METS server") + self.log.info(f"Configuring the Mets Server") workspace = self.workspace @@ -465,32 +472,49 @@ def save(): """ Write current changes to the file system """ - return workspace.save_mets() + workspace.save_mets() + response = Response(content="The Mets Server is writing changes to disk.", media_type='text/plain') + self.log.info(f"PUT / -> {response.__dict__}") + return response @app.delete(path='/') - async def stop(): + def stop(): """ Stop the mets server """ - getLogger('ocrd.models.ocrd_mets').info(f'Shutting down METS Server {self.url}') workspace.save_mets() + response = Response(content="The Mets Server will shut down soon...", media_type='text/plain') self.shutdown() + self.log.info(f"DELETE / -> {response.__dict__}") + return response @app.post(path='/reload') - async def workspace_reload_mets(): + def workspace_reload_mets(): """ Reload mets file from the file system """ workspace.reload_mets() - return Response(content=f'Reloaded from {workspace.directory}', media_type="text/plain") + response = Response(content=f"Reloaded from {workspace.directory}", media_type='text/plain') + self.log.info(f"POST /reload -> {response.__dict__}") + return response @app.get(path='/unique_identifier', response_model=str) async def unique_identifier(): - return Response(content=workspace.mets.unique_identifier, media_type='text/plain') + response = Response(content=workspace.mets.unique_identifier, media_type='text/plain') + self.log.info(f"GET /unique_identifier -> {response.__dict__}") + return response @app.get(path='/workspace_path', response_model=str) async def workspace_path(): - return Response(content=workspace.directory, media_type="text/plain") + response = Response(content=workspace.directory, media_type="text/plain") + self.log.info(f"GET /workspace_path -> {response.__dict__}") + return response + + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + response = {'physical_pages': 
workspace.mets.physical_pages} + self.log.info(f"GET /physical_pages -> {response}") + return response @app.get(path='/physical_pages', response_model=OcrdPageListModel) async def physical_pages(): @@ -498,18 +522,24 @@ async def physical_pages(): @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): - return {'file_groups': workspace.mets.file_groups} + response = {'file_groups': workspace.mets.file_groups} + self.log.info(f"GET /file_groups -> {response}") + return response @app.get(path='/agent', response_model=OcrdAgentListModel) async def agents(): - return OcrdAgentListModel.create(workspace.mets.agents) + response = OcrdAgentListModel.create(workspace.mets.agents) + self.log.info(f"GET /agent -> {response.__dict__}") + return response @app.post(path='/agent', response_model=OcrdAgentModel) async def add_agent(agent: OcrdAgentModel): kwargs = agent.dict() kwargs['_type'] = kwargs.pop('type') workspace.mets.add_agent(**kwargs) - return agent + response = agent + self.log.info(f"POST /agent -> {response.__dict__}") + return response @app.get(path="/file", response_model=OcrdFileListModel) async def find_files( @@ -526,7 +556,9 @@ async def find_files( found = workspace.mets.find_all_files( fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, local_filename=local_filename, url=url ) - return OcrdFileListModel.create(found) + response = OcrdFileListModel.create(found) + self.log.info(f"GET /file -> {response.__dict__}") + return response @app.post(path='/file', response_model=OcrdFileModel) async def add_file( @@ -549,7 +581,9 @@ async def add_file( # Add to workspace kwargs = file_resource.dict() workspace.add_file(**kwargs, force=force) - return file_resource + response = file_resource + self.log.info(f"POST /file -> {response.__dict__}") + return response # ------------- # @@ -557,9 +591,6 @@ async def add_file( # Create socket and change to world-readable and -writable to avoid permission errors self.log.debug(f"chmod 0o677 {self.url}") server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - if Path(self.url).exists() and not is_socket_in_use(self.url): - # remove leftover unused socket which blocks startup - Path(self.url).unlink() server.bind(self.url) # creates the socket file atexit.register(self.shutdown) server.close() @@ -571,16 +602,5 @@ async def add_file( uvicorn_kwargs['log_config'] = None uvicorn_kwargs['access_log'] = False - self.log.debug("Starting uvicorn") + self.log.info("Starting the uvicorn Mets Server") uvicorn.run(app, **uvicorn_kwargs) - - -def is_socket_in_use(socket_path): - if Path(socket_path).exists(): - client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - try: - client.connect(socket_path) - except OSError: - return False - client.close() - return True diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 3c4c603060..95d0fec4e1 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -23,6 +23,8 @@ # pylint: enable=wrong-import-position +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index ab050bae59..937416f5ef 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -49,11 +49,11 @@ def run_identify(self, 
img): for prop in ['compression', 'photometric_interpretation']: setattr(self, prop, img.info[prop] if prop in img.info else None) if img.filename: - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', img.filename], check=False, stderr=PIPE, stdout=PIPE) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], check=False, stderr=PIPE, stdout=PIPE) else: with BytesIO() as bio: img.save(bio, format=img.format) - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) if ret.returncode: stderr = ret.stderr.decode('utf-8') if 'no decode delegate for this image format' in stderr: diff --git a/src/ocrd_network/cli/client.py b/src/ocrd_network/cli/client.py index 9c7f15c88f..350cf64b90 100644 --- a/src/ocrd_network/cli/client.py +++ b/src/ocrd_network/cli/client.py @@ -2,6 +2,7 @@ from json import dumps from typing import List, Optional, Tuple from ocrd.decorators.parameter_option import parameter_option, parameter_override_option +from ocrd_network.constants import JobState from ocrd_utils import DEFAULT_METS_BASENAME from ocrd_utils.introspect import set_json_key_value_overrides from ocrd_utils.str import parse_json_string_or_file @@ -104,8 +105,10 @@ def check_processing_job_status(address: Optional[str], processing_job_id: str): @click.option('--result-queue-name') @click.option('--callback-url') @click.option('--agent-type', default='worker') -@click.option('-b', '--block', default=False, +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_processing_job_request( address: Optional[str], processor_name: str, @@ -120,7 +123,8 @@ def send_processing_job_request( # TODO: This is temporally available to toggle # between the ProcessingWorker/ProcessorServer agent_type: Optional[str], - block: Optional[bool] + block: Optional[bool], + print_state: Optional[bool] ): """ Submit a processing job to the processing server. 
@@ -146,7 +150,7 @@ def send_processing_job_request( assert processing_job_id print(f"Processing job id: {processing_job_id}") if block: - client.poll_job_status(job_id=processing_job_id) + client.poll_job_status(job_id=processing_job_id, print_state=print_state) @client_cli.group('workflow') @@ -176,24 +180,39 @@ def check_workflow_job_status(address: Optional[str], workflow_job_id: str): 'the "OCRD_NETWORK_SERVER_ADDR_PROCESSING" env variable is used by default') @click.option('-m', '--path-to-mets', required=True) @click.option('-w', '--path-to-workflow', required=True) -@click.option('-b', '--block', default=False, +@click.option('--page-wise/--no-page-wise', is_flag=True, default=False, help="Whether to generate per-page jobs") +@click.option('-b', '--block', default=False, is_flag=True, help='If set, the client will block till job timeout, fail or success.') +@click.option('-p', '--print-state', default=False, is_flag=True, + help='If set, the client will print job states by each iteration.') def send_workflow_job_request( address: Optional[str], path_to_mets: str, path_to_workflow: str, - block: Optional[bool] + page_wise: bool, + block: bool, + print_state: bool ): """ Submit a workflow job to the processing server. """ client = Client(server_addr_processing=address) - workflow_job_id = client.send_workflow_job_request(path_to_wf=path_to_workflow, path_to_mets=path_to_mets) + workflow_job_id = client.send_workflow_job_request( + path_to_wf=path_to_workflow, + path_to_mets=path_to_mets, + page_wise=page_wise, + ) assert workflow_job_id print(f"Workflow job id: {workflow_job_id}") if block: - client.poll_workflow_status(job_id=workflow_job_id) - + print(f"Polling state of workflow job {workflow_job_id}") + state = client.poll_workflow_status(job_id=workflow_job_id, print_state=print_state) + if state != JobState.success: + print(f"Workflow failed with {state}") + exit(1) + else: + print(f"Workflow succeeded") + exit(0) @client_cli.group('workspace') def workspace_cli(): diff --git a/src/ocrd_network/client.py b/src/ocrd_network/client.py index 8ec8e541ea..bb7cf4dbf2 100644 --- a/src/ocrd_network/client.py +++ b/src/ocrd_network/client.py @@ -46,18 +46,21 @@ def check_job_status(self, job_id: str): def check_workflow_status(self, workflow_job_id: str): return get_ps_workflow_job_status(self.server_addr_processing, workflow_job_id=workflow_job_id) - def poll_job_status(self, job_id: str) -> str: + def poll_job_status(self, job_id: str, print_state: bool = False) -> str: return poll_job_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_state=print_state) - def poll_workflow_status(self, job_id: str) -> str: + def poll_workflow_status(self, job_id: str, print_state: bool = False) -> str: return poll_wf_status_till_timeout_fail_or_success( - ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait) + ps_server_host=self.server_addr_processing, job_id=job_id, tries=self.polling_tries, wait=self.polling_wait, + print_state=print_state) def send_processing_job_request(self, processor_name: str, req_params: dict) -> str: return post_ps_processing_request( ps_server_host=self.server_addr_processing, processor=processor_name, job_input=req_params) - def send_workflow_job_request(self, path_to_wf: str, path_to_mets: str): + def 
send_workflow_job_request(self, path_to_wf: str, path_to_mets: str, page_wise: bool = False): return post_ps_workflow_request( - ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets) + ps_server_host=self.server_addr_processing, path_to_wf=path_to_wf, path_to_mets=path_to_mets, + page_wise=page_wise) diff --git a/src/ocrd_network/client_utils.py b/src/ocrd_network/client_utils.py index 9b924c16a4..4eaf4ea95b 100644 --- a/src/ocrd_network/client_utils.py +++ b/src/ocrd_network/client_utils.py @@ -1,9 +1,10 @@ +import json from requests import get as request_get, post as request_post from time import sleep from .constants import JobState, NETWORK_PROTOCOLS -def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int): +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, print_state: bool = False) -> JobState: if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -13,18 +14,22 @@ def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries job_state = get_ps_processing_job_status(ps_server_host, job_id) if job_type == "workflow": job_state = get_ps_workflow_job_status(ps_server_host, job_id) + if print_state: + print(f"State of the {job_type} job {job_id}: {job_state}") if job_state == JobState.success or job_state == JobState.failed: break tries -= 1 return job_state -def poll_job_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait) +def poll_job_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "processor", tries, wait, print_state) -def poll_wf_status_till_timeout_fail_or_success(ps_server_host: str, job_id: str, tries: int, wait: int) -> JobState: - return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait) +def poll_wf_status_till_timeout_fail_or_success( + ps_server_host: str, job_id: str, tries: int, wait: int, print_state: bool = False) -> JobState: + return _poll_endpoint_status(ps_server_host, job_id, "workflow", tries, wait, print_state) def get_ps_deployed_processors(ps_server_host: str): @@ -47,22 +52,21 @@ def get_ps_processing_job_log(ps_server_host: str, processing_job_id: str): return response -def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> str: +def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> JobState: request_url = f"{ps_server_host}/processor/job/{processing_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state - + return getattr(JobState, job_state.lower()) -def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> str: +def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> JobState: request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) assert response.status_code == 200, f"Processing server: 
{request_url}, {response.status_code}" job_state = response.json()["state"] assert job_state - return job_state + return getattr(JobState, job_state.lower()) def post_ps_processing_request(ps_server_host: str, processor: str, job_input: dict) -> str: @@ -78,9 +82,13 @@ def post_ps_processing_request(ps_server_host: str, processor: str, job_input: d return processing_job_id -# TODO: Can be extended to include other parameters such as page_wise -def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: str) -> str: - request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise=True" +def post_ps_workflow_request( + ps_server_host: str, + path_to_wf: str, + path_to_mets: str, + page_wise: bool = False, +) -> str: + request_url = f"{ps_server_host}/workflow/run?mets_path={path_to_mets}&page_wise={'True' if page_wise else 'False'}" response = request_post( url=request_url, headers={"accept": "application/json; charset=utf-8"}, @@ -88,8 +96,11 @@ def post_ps_workflow_request(ps_server_host: str, path_to_wf: str, path_to_mets: ) # print(response.json()) # print(response.__dict__) + json_resp_raw = response.text + # print(f'post_ps_workflow_request >> {response.status_code}') + # print(f'post_ps_workflow_request >> {json_resp_raw}') assert response.status_code == 200, f"Processing server: {request_url}, {response.status_code}" - wf_job_id = response.json()["job_id"] + wf_job_id = json.loads(json_resp_raw)["job_id"] assert wf_job_id return wf_job_id diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index 34c22e5cf6..31eeca5299 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -1,7 +1,7 @@ from datetime import datetime from os import getpid from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from uvicorn import run as uvicorn_run from fastapi import APIRouter, FastAPI, File, HTTPException, Request, status, UploadFile @@ -48,6 +48,7 @@ get_workflow_content, get_from_database_workspace, get_from_database_workflow_job, + kill_mets_server_zombies, parse_workflow_tasks, raise_http_exception, request_processor_server_tool_json, @@ -78,7 +79,6 @@ class ProcessingServer(FastAPI): """ def __init__(self, config_path: str, host: str, port: int) -> None: - initLogging() self.title = "OCR-D Processing Server" super().__init__( title=self.title, @@ -86,6 +86,7 @@ def __init__(self, config_path: str, host: str, port: int) -> None: on_shutdown=[self.on_shutdown], description="OCR-D Processing Server" ) + initLogging() self.log = getLogger("ocrd_network.processing_server") log_file = get_processing_server_logging_file_path(pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -155,7 +156,7 @@ def start(self) -> None: queue_names = self.deployer.find_matching_network_agents( worker_only=True, str_names_only=True, unique_only=True ) - self.log.debug(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}") + self.log.info(f"Creating message queues on RabbitMQ instance url: {self.rabbitmq_url}") create_message_queues(logger=self.log, rmq_publisher=self.rmq_publisher, queue_names=queue_names) self.deployer.deploy_network_agents(mongodb_url=self.mongodb_url, rabbitmq_url=self.rabbitmq_url) @@ -167,6 +168,7 @@ def start(self) -> None: uvicorn_run(self, host=self.hostname, port=int(self.port)) async def on_startup(self): + self.log.info(f"Initializing the Database on: 
{self.mongodb_url}") await initiate_database(db_url=self.mongodb_url) async def on_shutdown(self) -> None: @@ -200,6 +202,14 @@ def add_api_routes_others(self): tags=[ServerApiTags.WORKSPACE], summary="Forward a TCP request to UDS mets server" ) + others_router.add_api_route( + path="/kill_mets_server_zombies", + endpoint=self.kill_mets_server_zombies, + methods=["DELETE"], + tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], + status_code=status.HTTP_200_OK, + summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + ) self.include_router(others_router) def add_api_routes_processing(self): @@ -320,7 +330,7 @@ async def forward_tcp_request_to_uds_mets_server(self, request: Request) -> Dict """Forward mets-server-request A processor calls a mets related method like add_file with ClientSideOcrdMets. This sends - a request to this endpoint. This request contains all infomation neccessary to make a call + a request to this endpoint. This request contains all information necessary to make a call to the uds-mets-server. This information is used by `MetsServerProxy` to make a the call to the local (local for the processing-server) reachable the uds-mets-server. """ @@ -574,26 +584,20 @@ async def _cancel_cached_dependent_jobs(self, workspace_key: str, job_id: str) - ) async def _consume_cached_jobs_of_workspace( - self, workspace_key: str, mets_server_url: str + self, workspace_key: str, mets_server_url: str, path_to_mets: str ) -> List[PYJobInput]: - - # Check whether the internal queue for the workspace key still exists - if workspace_key not in self.cache_processing_requests.processing_requests: - self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") - return [] - # decrease the internal cache counter by 1 request_counter = self.cache_processing_requests.update_request_counter( workspace_key=workspace_key, by_value=-1 ) self.log.debug(f"Internal processing job cache counter value: {request_counter}") - if not len(self.cache_processing_requests.processing_requests[workspace_key]): + if (workspace_key not in self.cache_processing_requests.processing_requests or + not len(self.cache_processing_requests.processing_requests[workspace_key])): if request_counter <= 0: # Shut down the Mets Server for the workspace_key since no # more internal callbacks are expected for that workspace self.log.debug(f"Stopping the mets server: {mets_server_url}") - - self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url) + self.deployer.stop_uds_mets_server(mets_server_url=mets_server_url, path_to_mets=path_to_mets) try: # The queue is empty - delete it @@ -609,6 +613,10 @@ async def _consume_cached_jobs_of_workspace( else: self.log.debug(f"Internal request cache is empty but waiting for {request_counter} result callbacks.") return [] + # Check whether the internal queue for the workspace key still exists + if workspace_key not in self.cache_processing_requests.processing_requests: + self.log.debug(f"No internal queue available for workspace with key: {workspace_key}") + return [] consumed_requests = await self.cache_processing_requests.consume_cached_requests(workspace_key=workspace_key) return consumed_requests @@ -643,7 +651,7 @@ async def remove_job_from_request_cache(self, result_message: PYResultMessage): raise_http_exception(self.log, status.HTTP_404_NOT_FOUND, message, error) consumed_cached_jobs = await self._consume_cached_jobs_of_workspace( - 
workspace_key=workspace_key, mets_server_url=mets_server_url + workspace_key=workspace_key, mets_server_url=mets_server_url, path_to_mets=path_to_mets ) await self.push_cached_jobs_to_agents(processing_jobs=consumed_cached_jobs) @@ -817,6 +825,10 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response + async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]: + pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run) + return pids_killed + async def get_workflow_info_simple(self, workflow_job_id) -> Dict[str, JobState]: """ Simplified version of the `get_workflow_info` that returns a single state for the entire workflow. diff --git a/src/ocrd_network/processing_worker.py b/src/ocrd_network/processing_worker.py index a352ea5fde..302100743d 100644 --- a/src/ocrd_network/processing_worker.py +++ b/src/ocrd_network/processing_worker.py @@ -9,12 +9,12 @@ """ from datetime import datetime -from os import getpid +from os import getpid, getppid from pika import BasicProperties from pika.adapters.blocking_connection import BlockingChannel from pika.spec import Basic -from ocrd_utils import getLogger +from ocrd_utils import getLogger, initLogging from .constants import JobState from .database import sync_initiate_database, sync_db_get_workspace, sync_db_update_processing_job, verify_database_uri from .logging_utils import ( @@ -35,14 +35,16 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: + initLogging() self.log = getLogger(f'ocrd_network.processing_worker') log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") try: verify_database_uri(mongodb_addr) - self.log.debug(f'Verified MongoDB URL: {mongodb_addr}') + self.log.info(f'Verified MongoDB URL: {mongodb_addr}') self.rmq_data = verify_and_parse_mq_uri(rabbitmq_addr) + self.log.info(f'Verified RabbitMQ URL: {rabbitmq_addr}') except ValueError as error: msg = f"Failed to parse data, error: {error}" self.log.exception(msg) @@ -61,6 +63,7 @@ def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, # Gets assigned when the `connect_publisher` is called on the worker object # Used to publish OcrdResultMessage type message to the queue with name {processor_name}-result self.rmq_publisher = None + self.log.info(f"Initialized processing worker: {processor_name}") def connect_consumer(self): self.rmq_consumer = connect_rabbitmq_consumer(self.log, self.rmq_data) @@ -240,7 +243,7 @@ def publish_result_to_all(self, processing_message: OcrdProcessingMessage, resul # post the result message (callback to a user defined endpoint) post_to_callback_url(self.log, callback_url, result_message) if internal_callback_url: - self.log.info(f"Publishing result to internal callback url (Processing Server): {callback_url}") + self.log.info(f"Publishing result to internal callback url (Processing Server): {internal_callback_url}") # If the internal callback_url field is set, # post the result message (callback to Processing Server endpoint) post_to_callback_url(self.log, internal_callback_url, result_message) diff --git a/src/ocrd_network/processor_server.py b/src/ocrd_network/processor_server.py index 5aed89d72c..60674afbf6 100644 --- 
a/src/ocrd_network/processor_server.py +++ b/src/ocrd_network/processor_server.py @@ -42,13 +42,13 @@ class ProcessorServer(FastAPI): def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class=None): if not (processor_name or processor_class): raise ValueError("Either 'processor_name' or 'processor_class' must be provided") - initLogging() super().__init__( on_startup=[self.on_startup], on_shutdown=[self.on_shutdown], title=f"Network agent - Processor Server", description="Network agent - Processor Server" ) + initLogging() self.log = getLogger("ocrd_network.processor_server") log_file = get_processor_server_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -69,6 +69,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.processor_name = self.ocrd_tool["executable"] self.add_api_routes_processing() + self.log.info(f"Initialized processor server: {processor_name}") async def on_startup(self): await initiate_database(db_url=self.db_url) diff --git a/src/ocrd_network/rabbitmq_utils/connector.py b/src/ocrd_network/rabbitmq_utils/connector.py index 893d55a219..8fbbc84ab9 100644 --- a/src/ocrd_network/rabbitmq_utils/connector.py +++ b/src/ocrd_network/rabbitmq_utils/connector.py @@ -6,6 +6,7 @@ from typing import Any, Optional, Union from pika import BasicProperties, BlockingConnection, ConnectionParameters, PlainCredentials from pika.adapters.blocking_connection import BlockingChannel +from ocrd_utils import config from .constants import ( DEFAULT_EXCHANGER_NAME, DEFAULT_EXCHANGER_TYPE, @@ -69,8 +70,7 @@ def open_blocking_connection( port=port, virtual_host=vhost, credentials=credentials, - # TODO: The heartbeat should not be disabled (0)! - heartbeat=0 + heartbeat=config.OCRD_NETWORK_RABBITMQ_HEARTBEAT ), ) return blocking_connection diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index b956904d07..919d5b97ce 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -8,7 +8,7 @@ """ from __future__ import annotations from pathlib import Path -from subprocess import Popen, run as subprocess_run +import psutil from time import sleep from typing import Dict, List, Union @@ -30,6 +30,8 @@ def __init__(self, config_path: str) -> None: self.data_hosts: List[DataHost] = parse_hosts_data(ps_config["hosts"]) self.internal_callback_url = ps_config.get("internal_callback_url", None) self.mets_servers: Dict = {} # {"mets_server_url": "mets_server_pid"} + # This is required to store UDS urls that are multiplexed through the TCP proxy and are not preserved anywhere + self.mets_servers_paths: Dict = {} # {"ws_dir_path": "mets_server_url"} self.use_tcp_mets = ps_config.get("use_tcp_mets", False) # TODO: Reconsider this. @@ -146,25 +148,33 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: if is_mets_server_running(mets_server_url=str(mets_server_url)): self.log.debug(f"The UDS mets server for {ws_dir_path} is already started: {mets_server_url}") return mets_server_url + elif Path(mets_server_url).is_socket(): + self.log.warning( + f"The UDS mets server for {ws_dir_path} is not running but the socket file exists: {mets_server_url}." 
+ "Removing to avoid any weird behavior before starting the server.") + Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") - pid = OcrdMetsServer.create_process(mets_server_url=mets_server_url, ws_dir_path=ws_dir_path, log_file=log_file) - self.mets_servers[mets_server_url] = pid + pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), ws_dir_path=str(ws_dir_path), log_file=str(log_file)) + self.mets_servers[str(mets_server_url)] = pid + self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url - def stop_uds_mets_server(self, mets_server_url: str, stop_with_pid: bool = False) -> None: + def stop_uds_mets_server(self, mets_server_url: str, path_to_mets: str) -> None: self.log.info(f"Stopping UDS mets server: {mets_server_url}") - if stop_with_pid: - if Path(mets_server_url) not in self.mets_servers: - message = f"UDS Mets server not found at URL: {mets_server_url}" - self.log.exception(message) - raise Exception(message) - mets_server_pid = self.mets_servers[Path(mets_server_url)] - OcrdMetsServer.kill_process(mets_server_pid=mets_server_pid) - return - # TODO: Reconsider this again - # Not having this sleep here causes connection errors - # on the last request processed by the processing worker. - # Sometimes 3 seconds is enough, sometimes not. - sleep(5) - stop_mets_server(mets_server_url=mets_server_url) + self.log.info(f"Path to the mets file: {path_to_mets}") + self.log.debug(f"mets_server: {self.mets_servers}") + self.log.debug(f"mets_server_paths: {self.mets_servers_paths}") + workspace_path = str(Path(path_to_mets).parent) + mets_server_url_uds = self.mets_servers_paths[workspace_path] + mets_server_pid = self.mets_servers[mets_server_url_uds] + self.log.info(f"Terminating mets server with pid: {mets_server_pid}") + p = psutil.Process(mets_server_pid) + stop_mets_server(self.log, mets_server_url=mets_server_url, ws_dir_path=workspace_path) + if p.is_running(): + p.wait() + self.log.info(f"Terminated mets server with pid: {mets_server_pid}") + else: + self.log.info(f"Mets server with pid: {mets_server_pid} has already terminated.") + del self.mets_servers_paths[workspace_path] + del self.mets_servers[mets_server_url_uds] return diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index b57f3fd235..179a76139d 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -31,7 +31,7 @@ def check_if_locked_pages_for_output_file_grps( self, workspace_key: str, output_file_grps: List[str], page_ids: List[str] ) -> bool: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return False debug_message = f"Caching the received request due to locked output file grp pages." 
for file_group in output_file_grps: @@ -46,46 +46,45 @@ def check_if_locked_pages_for_output_file_grps( def get_locked_pages(self, workspace_key: str) -> Dict[str, List[str]]: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No locked pages available for workspace key: {workspace_key}") + self.log.info(f"No locked pages available for workspace key: {workspace_key}") return {} return self.locked_pages[workspace_key] def lock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") - self.log.debug(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"Creating an entry in the locked pages cache for workspace key: {workspace_key}") self.locked_pages[workspace_key] = {} for file_group in output_file_grps: if file_group not in self.locked_pages[workspace_key]: - self.log.debug(f"Creating an empty list for output file grp: {file_group}") + self.log.info(f"Creating an empty list for output file grp: {file_group}") self.locked_pages[workspace_key][file_group] = [] # The page id list is not empty - only some pages are in the request if page_ids: - self.log.debug(f"Locking pages for '{file_group}': {page_ids}") + self.log.info(f"Locking pages for '{file_group}': {page_ids}") self.locked_pages[workspace_key][file_group].extend(page_ids) - self.log.debug(f"Locked pages of '{file_group}': " - f"{self.locked_pages[workspace_key][file_group]}") + self.log.info(f"Locked pages of '{file_group}': {self.locked_pages[workspace_key][file_group]}") else: # Lock all pages with a single value - self.log.debug(f"Locking pages for '{file_group}': {self.placeholder_all_pages}") + self.log.info(f"Locking pages for '{file_group}': {self.placeholder_all_pages}") self.locked_pages[workspace_key][file_group].append(self.placeholder_all_pages) def unlock_pages(self, workspace_key: str, output_file_grps: List[str], page_ids: List[str]) -> None: if not self.locked_pages.get(workspace_key, None): - self.log.debug(f"No entry found in the locked pages cache for workspace key: {workspace_key}") + self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return for file_group in output_file_grps: if file_group in self.locked_pages[workspace_key]: if page_ids: # Unlock the previously locked pages - self.log.debug(f"Unlocking pages of '{file_group}': {page_ids}") + self.log.info(f"Unlocking pages of '{file_group}': {page_ids}") self.locked_pages[workspace_key][file_group] = \ [x for x in self.locked_pages[workspace_key][file_group] if x not in page_ids] - self.log.debug(f"Remaining locked pages of '{file_group}': " - f"{self.locked_pages[workspace_key][file_group]}") + self.log.info(f"Remaining locked pages of '{file_group}': " + f"{self.locked_pages[workspace_key][file_group]}") else: # Remove the single variable used to indicate all pages are locked - self.log.debug(f"Unlocking all pages for: {file_group}") + self.log.info(f"Unlocking all pages for: {file_group}") self.locked_pages[workspace_key][file_group].remove(self.placeholder_all_pages) @@ -127,11 +126,11 @@ def __print_job_input_debug_message(self, job_input: PYJobInput): debug_message += f", page ids: {job_input.page_id}" debug_message += f", job id: {job_input.job_id}" debug_message += f", job 
depends on: {job_input.depends_on}" - self.log.debug(debug_message) + self.log.info(debug_message) async def consume_cached_requests(self, workspace_key: str) -> List[PYJobInput]: if not self.has_workspace_cached_requests(workspace_key=workspace_key): - self.log.debug(f"No jobs to be consumed for workspace key: {workspace_key}") + self.log.info(f"No jobs to be consumed for workspace key: {workspace_key}") return [] found_consume_requests = [] for current_element in self.processing_requests[workspace_key]: @@ -165,25 +164,27 @@ def update_request_counter(self, workspace_key: str, by_value: int) -> int: # If a record counter of this workspace key does not exist # in the requests counter cache yet, create one and assign 0 if not self.processing_counter.get(workspace_key, None): - self.log.debug(f"Creating an internal request counter for workspace key: {workspace_key}") + self.log.info(f"Creating an internal request counter for workspace key: {workspace_key}") self.processing_counter[workspace_key] = 0 self.processing_counter[workspace_key] = self.processing_counter[workspace_key] + by_value + self.log.info(f"The new request counter of {workspace_key}: {self.processing_counter[workspace_key]}") return self.processing_counter[workspace_key] def cache_request(self, workspace_key: str, data: PYJobInput): # If a record queue of this workspace key does not exist in the requests cache if not self.processing_requests.get(workspace_key, None): - self.log.debug(f"Creating an internal request queue for workspace_key: {workspace_key}") + self.log.info(f"Creating an internal request queue for workspace_key: {workspace_key}") self.processing_requests[workspace_key] = [] self.__print_job_input_debug_message(job_input=data) # Add the processing request to the end of the internal queue + self.log.info(f"Caching a processing request of {workspace_key}: {data.job_id}") self.processing_requests[workspace_key].append(data) async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str) -> List[PYJobInput]: if not self.has_workspace_cached_requests(workspace_key=workspace_key): - self.log.debug(f"No jobs to be cancelled for workspace key: {workspace_key}") + self.log.info(f"No jobs to be cancelled for workspace key: {workspace_key}") return [] - self.log.debug(f"Cancelling jobs dependent on job id: {processing_job_id}") + self.log.info(f"Cancelling jobs dependent on job id: {processing_job_id}") found_cancel_requests = [] for i, current_element in enumerate(self.processing_requests[workspace_key]): if processing_job_id in current_element.depends_on: @@ -192,7 +193,7 @@ async def cancel_dependent_jobs(self, workspace_key: str, processing_job_id: str for cancel_element in found_cancel_requests: try: self.processing_requests[workspace_key].remove(cancel_element) - self.log.debug(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'") + self.log.info(f"For job id: '{processing_job_id}', cancelling job id: '{cancel_element.job_id}'") cancelled_jobs.append(cancel_element) await db_update_processing_job(job_id=cancel_element.job_id, state=JobState.cancelled) # Recursively cancel dependent jobs for the cancelled job @@ -225,9 +226,11 @@ async def sync_is_caching_required(self, job_dependencies: List[str]) -> bool: def has_workspace_cached_requests(self, workspace_key: str) -> bool: if not self.processing_requests.get(workspace_key, None): - self.log.debug(f"In processing requests cache, no workspace key found: {workspace_key}") + self.log.info(f"In processing requests cache, 
no workspace key found: {workspace_key}") return False if not len(self.processing_requests[workspace_key]): - self.log.debug(f"The processing requests cache is empty for workspace key: {workspace_key}") + self.log.info(f"The processing requests cache is empty for workspace key: {workspace_key}") return False + self.log.info(f"The processing requests cache has {len(self.processing_requests[workspace_key])} " + f"entries for workspace key: {workspace_key} ") return True diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 9d8628170c..6e485f261f 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -1,12 +1,18 @@ +import os +import re +import signal +from pathlib import Path +from json import dumps, loads +from urllib.parse import urljoin +from typing import Dict, List, Optional, Union +from time import time + from fastapi import HTTPException, status, UploadFile from fastapi.responses import FileResponse from httpx import AsyncClient, Timeout -from json import dumps, loads from logging import Logger -from pathlib import Path from requests import get as requests_get -from typing import Dict, List, Union -from urllib.parse import urljoin +from requests_unixsocket import sys from ocrd.resolver import Resolver from ocrd.task_sequence import ProcessorTask @@ -241,3 +247,33 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s if group not in available_groups: message = f"Input file group '{group}' of the first processor not found: {input_file_grps}" raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) + + +def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[bool]) -> List[int]: + if minutes_ago == None: + minutes_ago = 90 + if dry_run == None: + dry_run = False + + now = time() + cmdline_pat = r'.*ocrd workspace -U.*server start $' + ret = [] + for procdir in sorted(Path('/proc').glob('*'), key=os.path.getctime): + if not procdir.is_dir(): + continue + cmdline_file = procdir.joinpath('cmdline') + if not cmdline_file.is_file(): + continue + ctime_ago = int((now - procdir.stat().st_ctime) / 60) + if ctime_ago < minutes_ago: + continue + cmdline = cmdline_file.read_text().replace('\x00', ' ') + if re.match(cmdline_pat, cmdline): + pid = int(procdir.name) + ret.append(pid) + print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) + if dry_run: + print(f'[dry_run is active] kill {pid}') + else: + os.kill(pid, signal.SIGTERM) + return ret diff --git a/src/ocrd_network/tcp_to_uds_mets_proxy.py b/src/ocrd_network/tcp_to_uds_mets_proxy.py index 176f4f1442..3f335435ab 100644 --- a/src/ocrd_network/tcp_to_uds_mets_proxy.py +++ b/src/ocrd_network/tcp_to_uds_mets_proxy.py @@ -1,5 +1,5 @@ from requests_unixsocket import Session as requests_unixsocket_session -from .utils import get_uds_path +from .utils import get_uds_path, convert_url_to_uds_format from typing import Dict from ocrd_utils import getLogger @@ -31,9 +31,13 @@ def forward_tcp_request(self, request_body) -> Dict: if method_type not in SUPPORTED_METHOD_TYPES: raise NotImplementedError(f"Method type: {method_type} not recognized") ws_socket_file = str(get_uds_path(ws_dir_path=ws_dir_path)) - ws_unix_socket_url = f'http+unix://{ws_socket_file.replace("/", "%2F")}' + ws_unix_socket_url = convert_url_to_uds_format(ws_socket_file) uds_request_url = f"{ws_unix_socket_url}/{request_url}" + self.log.info(f"Forwarding TCP 
mets server request to UDS url: {uds_request_url}") + self.log.info(f"Forwarding method type {method_type}, request data: {request_data}, " + f"expected response type: {response_type}") + if not request_data: response = self.session.request(method_type, uds_request_url) elif "params" in request_data: @@ -45,12 +49,11 @@ def forward_tcp_request(self, request_body) -> Dict: else: raise ValueError("Expecting request_data to be empty or containing single key: params," f"form, or class but not {request_data.keys}") - + if response_type == "empty": + return {} if not response: self.log.error(f"Uds-Mets-Server gives unexpected error. Response: {response.__dict__}") return {"error": response.text} - elif response_type == "empty": - return {} elif response_type == "text": return {"text": response.text} elif response_type == "class" or response_type == "dict": diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index a2f563de43..5abe2104fd 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -4,6 +4,7 @@ from functools import wraps from hashlib import md5 from json import loads +from logging import Logger from pathlib import Path from re import compile as re_compile, split as re_split from requests import get as requests_get, Session as Session_TCP @@ -151,22 +152,25 @@ def is_mets_server_running(mets_server_url: str, ws_dir_path: str = None) -> boo return False -def stop_mets_server(mets_server_url: str, ws_dir_path: str = None) -> bool: +def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> bool: protocol = "tcp" if (mets_server_url.startswith("http://") or mets_server_url.startswith("https://")) else "uds" - session = Session_TCP() if protocol == "tcp" else Session_UDS() - if protocol == "uds": - mets_server_url = convert_url_to_uds_format(mets_server_url) - try: - if 'tcp_mets' in mets_server_url: - if not ws_dir_path: - return False - response = session.post(url=f"{mets_server_url}", json=MpxReq.stop(ws_dir_path)) - else: - response = session.delete(url=f"{mets_server_url}/") - except Exception: - return False - return response.status_code == 200 - + # If the mets server URL is the proxy endpoint + if protocol == "tcp" and "tcp_mets" in mets_server_url: + # Convert the mets server url to UDS format + ws_socket_file = str(get_uds_path(ws_dir_path)) + mets_server_url = convert_url_to_uds_format(ws_socket_file) + protocol = "uds" + if protocol == "tcp": + request_json = MpxReq.stop(ws_dir_path) + logger.info(f"Sending POST request to: {mets_server_url}, request_json: {request_json}") + response = Session_TCP().post(url=f"{mets_server_url}", json=request_json) + return response.status_code == 200 + elif protocol == "uds": + logger.info(f"Sending DELETE request to: {mets_server_url}/") + response = Session_UDS().delete(url=f"{mets_server_url}/") + return response.status_code == 200 + else: + ValueError(f"Unexpected protocol type: {protocol}") def get_uds_path(ws_dir_path: str) -> Path: return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock") diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 36399870e2..c5f1e16679 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -231,7 +231,7 @@ def _ocrd_download_timeout_parser(val): config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP", description="How many seconds to sleep before trying again.", parser=int, - default=(True, 30)) + default=(True, 10)) config.add("OCRD_NETWORK_CLIENT_POLLING_TIMEOUT", description="Timeout for a blocking 
ocrd network client (in seconds).", @@ -247,9 +247,19 @@ def _ocrd_download_timeout_parser(val): default=(True, '')) config.add("OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", - description="Number of attempts for a RabbitMQ client to connect before failing.", + description="Number of attempts for a RabbitMQ client to connect before failing.", + parser=int, + default=(True, 3)) + +config.add( + name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", + description=""" + Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeat. + """, parser=int, - default=(True, 3)) + default=(True, 0) +) config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR", description="The root directory where all mets server related socket files are created", diff --git a/tests/model/test_exif.py b/tests/model/test_exif.py index f6771fb8ee..18c5e4c467 100644 --- a/tests/model/test_exif.py +++ b/tests/model/test_exif.py @@ -24,7 +24,13 @@ ('leptonica_samples/data/OCR-D-IMG/OCR-D-IMG_1555_007.jpg', 944, 1472, 1, 1, 1, 'inches', 'RGB', None), ('kant_aufklaerung_1784-jp2/data/OCR-D-IMG/INPUT_0020.jp2', - 1457, 2084, 1, 1, 1, 'inches', 'RGB', None) + 1457, 2084, 1, 1, 1, 'inches', 'RGB', None), + # tolerate multi-frame TIFF: + ('gutachten/data/IMG/IMG_1.tif', + 2088, 2634, 300, 300, 300, 'inches', 'RGB', 'raw'), + # multi-frame TIFF with metric pixel density (is actually YCBCR not RGB but Pillow thinks otherwise...) + ('indian-ferns/data/OCR-D-IMG/0004.tif', + 2626, 3620, 28, 28, 28, 'cm', 'RGB', 'jpeg'), ]) def test_ocrd_exif(path, width, height, xResolution, yResolution, resolution, resolutionUnit, photometricInterpretation, compression): """Check EXIF attributes for different input formats diff --git a/tests/network/config.py b/tests/network/config.py index e22cc6ce9d..611ad63821 100644 --- a/tests/network/config.py +++ b/tests/network/config.py @@ -89,11 +89,19 @@ test_config.add( name="OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", + description="Number of attempts for a RabbitMQ client to connect before failing", + parser=int, + default=(True, 3) +) + +test_config.add( + name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", description=""" - Number of attempts for a RabbitMQ client to connect before failing + Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value + proposed by broker. Use 0 to deactivate heartbeat. 
""", parser=int, - default=(True, 3) + default=(True, 0) ) test_config.add( diff --git a/tests/network/test_modules_mets_server_proxy.py b/tests/network/test_modules_mets_server_proxy.py index 8b8c0d35f7..f19d7e415e 100644 --- a/tests/network/test_modules_mets_server_proxy.py +++ b/tests/network/test_modules_mets_server_proxy.py @@ -119,7 +119,7 @@ def test_find_files(start_uds_mets_server): {"file_grp": test_file_group} ) response_dict = MetsServerProxy().forward_tcp_request(request_body=request_body) - assert len(response_dict["files"]) == 3, "Expected to find exatly 3 matching files" + assert len(response_dict["files"]) == 3, "Expected to find exactly 3 matching files" request_body = MpxReq.find_files( TEST_WORKSPACE_DIR, {"file_grp": test_non_existing_file_group} diff --git a/tests/test_resolver.py b/tests/test_resolver.py index c2575b6086..97d2ee6658 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -118,7 +118,7 @@ def test_workspace_from_url_kant_with_resources(mock_request, tmp_path): @patch.object(Session, "get") def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path): """ - Fail with clobber_mets=False, succeeed with clobber_mets=True + Fail with clobber_mets=False, succeed with clobber_mets=True """ # arrange diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index 653167e10a..286f6ea6b0 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -80,7 +80,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): assert mgr.userdir == tmp_path -def test_resources_manager_config_explicite(tmp_path): +def test_resources_manager_config_explicit(tmp_path): # act from ocrd.resource_manager import OcrdResourceManager From 358d40630a948d2ad1a35af94ba95fb5b6ab74ef Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 8 Jan 2025 19:56:15 +0100 Subject: [PATCH 185/191] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bacd5aebe..b3ab85de32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
 ## Unreleased
 
+Changed:
+
+  - Merge v2 master into new-procesor-api
+
+
 ## [3.0.0b7] - 2024-11-12
 
 Fixed:

From 75ce41507fa3d1259ce7d3c17ef2e41fce00dae1 Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 19:56:41 +0100
Subject: [PATCH 186/191] :memo: changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b3ab85de32..f49b954334 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,9 @@ Changed:
 
   - Merge v2 master into new-procesor-api
 
+Fixed:
+
+  - `ocrd --help` output was broken for multiline config options, bertsky/core#25
 
 ## [3.0.0b7] - 2024-11-12
 

From ff2a73bd04dd8164c95c2931d6ee8c5eb528dadc Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 19:57:54 +0100
Subject: [PATCH 187/191] :memo: changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f49b954334..8fcb0c54da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ Changed:
 Fixed:
 
   - `ocrd --help` output was broken for multiline config options, bertsky/core#25
+  - Call `initLogging` before instantiating processors in `ocrd_cli_wrap_processor`, bertsky/core#24, #1296
 
 ## [3.0.0b7] - 2024-11-12
 

From e59222abe5735b99eca65d46c1a2b4a1ba3d5ec4 Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 20:06:21 +0100
Subject: [PATCH 188/191] :memo: changelog

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8fcb0c54da..fc9170522b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,11 +8,19 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 Changed:
 
   - Merge v2 master into new-procesor-api
+  - PAGE API: Update to latest generateDS 2.44.1, bertsky/core#21
 
 Fixed:
 
   - `ocrd --help` output was broken for multiline config options, bertsky/core#25
   - Call `initLogging` before instantiating processors in `ocrd_cli_wrap_processor`, bertsky/core#24, #1296
+  - PAGE API: Fully reversible mapping from/to XML element/generateDS instances, bertsky/core#21
+
+Added:
+
+  - `ocrd-filter` processor to remove segments based on XPath expressions, bertsky/core#21
+  - XPath function `pc:pixelarea` for the number of pixels of the bounding box (or sum area on node sets), bertsky/core#21
+  - XPath function `pc:textequiv` for the first TextEquiv unicode string (or concatenated string on node sets), bertsky/core#21
 
 ## [3.0.0b7] - 2024-11-12
 

From 68786a671a5615ae5559ca00a0a307df30ae25ef Mon Sep 17 00:00:00 2001
From: kba
Date: Wed, 8 Jan 2025 20:43:46 +0100
Subject: [PATCH 189/191] remove 3.8 breaking typing hints

---
 src/ocrd/cli/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py
index 6f37858ecf..794538752d 100644
--- a/src/ocrd/cli/__init__.py
+++ b/src/ocrd/cli/__init__.py
@@ -12,7 +12,7 @@
 
 # pylint: disable=wrong-import-position
 
-def command_with_replaced_help(*replacements: tuple[str, str]):
+def command_with_replaced_help(*replacements):
 
     class CommandWithReplacedHelp(click.Command):
         def get_help(self, ctx):

From 5d69e497a2e269ebaee0da7caf43a2035cf80068 Mon Sep 17 00:00:00 2001
From: kba
Date: Thu, 9 Jan 2025 11:30:22 +0100
Subject: [PATCH 190/191] :package: :fire: v3.0.0

---
 CHANGELOG.md | 3 +++
 VERSION      | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc9170522b..c3ad521248 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 ## Unreleased
 
+## [3.0.0] - 2025-01-09
+
 Changed:
 
@@ -2412,6 +2414,7 @@ Fixed
 
 Initial Release
 
+[3.0.0]: ../../compare/v3.0.0..v3.0.0b7
 [3.0.0b7]: ../../compare/v3.0.0b7..v3.0.0b6
 [3.0.0b6]: ../../compare/v3.0.0b6..v3.0.0b5
 [3.0.0b5]: ../../compare/v3.0.0b5..v3.0.0b4
diff --git a/VERSION b/VERSION
index 1129dfd443..4a36342fca 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.0.0b7
+3.0.0

From 92fac9b1ebf0753f8d74ce7289fcfdf2cd8ef59e Mon Sep 17 00:00:00 2001
From: kba
Date: Thu, 9 Jan 2025 15:11:31 +0100
Subject: [PATCH 191/191] CI: Use token authentication for pypi

---
 .github/workflows/publish-pypi.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml
index e811c958ab..1b0e85e60a 100644
--- a/.github/workflows/publish-pypi.yml
+++ b/.github/workflows/publish-pypi.yml
@@ -26,6 +26,6 @@ jobs:
           pip install -r requirements.txt
       - name: Build and publish
         env:
-          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
         run: make pypi
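
Background for the final patch above: PyPI API tokens always use the literal username `__token__` with the token value itself as the password, which is exactly what the workflow now passes to twine through the `TWINE_USERNAME` and `TWINE_PASSWORD` environment variables. As a rough sketch only — assuming the `make pypi` target wraps `twine upload`, which the patch itself does not show — the equivalent manual upload would be:

    # hedged sketch: pypi-<api-token> stands for a real token taken from a secret store
    TWINE_USERNAME=__token__ TWINE_PASSWORD=pypi-<api-token> twine upload dist/*

The token value stays out of the repository and is injected in CI from the `PYPI_TOKEN` repository secret.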