From 0c2a81869b6952a4dd12b4f8ada24c82d970f821 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 8 Apr 2025 20:14:22 +0200 Subject: [PATCH 01/13] :package: 0.0.2, migrate to OCR-D v3 --- docstruct/config.py | 4 -- docstruct/ocrd-tool.json | 4 +- docstruct/proc.py | 103 +++++++++++++++++++-------------------- requirements.txt | 2 +- 4 files changed, 55 insertions(+), 58 deletions(-) delete mode 100644 docstruct/config.py diff --git a/docstruct/config.py b/docstruct/config.py deleted file mode 100644 index 01e0b23..0000000 --- a/docstruct/config.py +++ /dev/null @@ -1,4 +0,0 @@ -import json -from pkg_resources import resource_string - -OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) diff --git a/docstruct/ocrd-tool.json b/docstruct/ocrd-tool.json index 32abf25..7e3dc14 100644 --- a/docstruct/ocrd-tool.json +++ b/docstruct/ocrd-tool.json @@ -1,5 +1,5 @@ { - "version": "0.0.1", + "version": "0.0.2", "git_url": "https://github.com/bertsky/docstruct", "tools": { "ocrd-docstruct": { @@ -7,6 +7,8 @@ "categories": ["Layout analysis"], "description": "Parsing page-level text regions with headings and reading order, create a dummy logical structMap", "steps": ["layout/analysis"], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 0, "parameters": { "mode": { "type": "string", diff --git a/docstruct/proc.py b/docstruct/proc.py index ae78b9e..442a581 100644 --- a/docstruct/proc.py +++ b/docstruct/proc.py @@ -1,19 +1,18 @@ -import click +from typing import Optional, get_args + from lxml import etree as ET +import click -from ocrd import Processor +from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, xywh_from_points, xywh_from_bbox, bbox_from_xywh ) from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from ocrd_models.ocrd_mets import OcrdMets - +from 
ocrd_models.ocrd_file import OcrdFileType +from ocrd_models.ocrd_page import OcrdPage from ocrd_models.ocrd_mets import OcrdMets from ocrd_models.constants import ( NAMESPACES as NS, @@ -22,39 +21,32 @@ TAG_METS_STRUCTMAP, ) -from .config import OCRD_TOOL - # should go into ocrd_models.constants: TAG_METS_STRUCTLINK = '{%s}structLink' % NS['mets'] TAG_METS_SMLINK = '{%s}smLink' % NS['mets'] TAG_METS_AREA = '{%s}area' % NS['mets'] TAG_METS_SEQ = '{%s}seq' % NS['mets'] -TOOL = 'ocrd-docstruct' class OcrdDocStruct(Processor): + max_workers = 1 - def __init__(self, *args, **kwargs): - kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] - kwargs['version'] = OCRD_TOOL['version'] - super().__init__(*args, **kwargs) - self.last_result = [] - self.log_links = {} - self.first = None + @property + def executable(self): + return 'ocrd-docstruct' - def create_logmap_smlink(self): - LOG = getLogger('OcrdDocStruct') - el_root = self.workspace.mets._tree.getroot() + def create_logmap_smlink(self, mets: OcrdMets): + el_root = mets._tree.getroot() self.log = el_root.find('mets:structMap[@TYPE="LOGICAL"]', NS) if self.log is None: self.log = ET.SubElement(el_root, TAG_METS_STRUCTMAP) self.log.set('TYPE', 'LOGICAL') - LOG.info('mets:structMap LOGICAL created') + self.logger.info('mets:structMap LOGICAL created') else: - LOG.warning('mets:structMap LOGICAL already exists, adding to it') + self.logger.warning('mets:structMap LOGICAL already exists, adding to it') self.log_map = {div.get('ID'): div for div in self.log.xpath('.//mets:div', namespaces=NS)} self.log_ids = [id_ for id_ in self.log_map.keys() if id_ and id_.startswith("LOG_")] - self.phy_ids = self.workspace.mets.physical_pages + self.phy_ids = mets.physical_pages self.link = el_root.find(TAG_METS_STRUCTLINK) if self.link is None and self.parameter['mode'] != 'enmap': self.link = ET.SubElement(el_root, TAG_METS_STRUCTLINK) @@ -65,36 +57,44 @@ def create_logmap_smlink(self): smlink_log = smlink.get('{' + NS['xlink'] + '}from') 
self.link_map.setdefault(smlink_phy, list()).append(smlink_log) - def process(self): + def process_workspace(self, workspace: Workspace) -> None: """ """ - LOG = getLogger('OcrdDocStruct') - assert_file_grp_cardinality(self.input_file_grp, 1) + self.create_logmap_smlink(workspace.mets) + self.results = [] + super().process_workspace(workspace) + self.write_to_mets() + + def process_page_file(self, input_file : OcrdFileType) -> None: + assert isinstance(input_file, get_args(OcrdFileType)) + page_id = input_file.pageId + self._base_logger.info("processing page %s", page_id) + self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + input_pcgts = page_from_file(input_file) + assert isinstance(input_pcgts, OcrdPage) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") + return + mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink # FIXME: more parameters (what kind of region types, geometric rules etc) - self.create_logmap_smlink() - - results = [] - for input_file in self.input_files: - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s", page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - if page.get_type() in ['front-cover', 'back-cover', 'title', 'blank']: - LOG.info("skipping page type %s", page.get_type()) - continue - if page.get_type() in ['table-of-contents', 'index']: - # FIXME use directly - LOG.info("skipping page type %s", page.get_type()) - results.extend(self.extract_text(page, input_file)) - self.write_to_mets(results) + page = input_pcgts.get_Page() + if page.get_type() in ['front-cover', 'back-cover', 'title', 'blank']: + self.logger.info("skipping page type %s", page.get_type()) + return + if page.get_type() in ['table-of-contents', 'index']: + # FIXME use directly + self.logger.info("skipping page type %s", page.get_type()) + return + 
self.results.extend(self.extract_text(page, input_file)) def extract_text(self, page, input_file): """ get text regions in reading order, put them into a hierarchy (via heuristic rules) """ - LOG = getLogger('OcrdDocStruct') target = self.parameter['type'] result = [] # FIXME: what about non-text regions (tables, images)? @@ -119,7 +119,7 @@ def extract_text(self, page, input_file): region_xywh = xywh_from_points(region.get_Coords().points) region_text = region.get_TextEquiv() if not region_text: - LOG.warning("skipping empty text region %s", region.id) + self.logger.warning("skipping empty text region %s", region.id) continue region_text = region_text[0].Unicode # FIXME: textual cues @@ -131,8 +131,7 @@ def extract_text(self, page, input_file): result.append([input_file, region.id, region_xywh, 'text', '']) return result - def write_to_mets(self, results): - LOG = getLogger('OcrdDocStruct') + def write_to_mets(self): mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink log_ids = sorted(int(id_[4:]) for id_ in self.log_ids if id_[4:].isnumeric()) @@ -179,13 +178,13 @@ def add_area(parent, file_id, region_id): div = None last_type = None last_page = None - for input_file, region_id, region_xywh, region_type, region_text in results: + for input_file, region_id, region_xywh, region_type, region_text in self.results: page_id = input_file.pageId if region_type == 'text': if div is None: - LOG.warning("%s: skipping region '%s' prior to first heading", page_id, region_id) + self.logger.warning("%s: skipping region '%s' prior to first heading", page_id, region_id) continue - LOG.info("continuing with text region %s on page %s", region_id, page_id) + self.logger.info("continuing with text region %s on page %s", region_id, page_id) if mode == 'enmap': # add to current div add_area(div, input_file.ID, region_id) @@ -199,7 +198,7 @@ def add_area(parent, file_id, region_id): loglist = self.link_map.get(page_id, []) if len(loglist): log = 
self.log_map[loglist[-1]] - LOG.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) + self.logger.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) else: # get deepest embedded, still non-structural existing div log = next([log for log in reversed(self.log.iterdescendants(TAG_METS_DIV)) @@ -210,7 +209,7 @@ def add_area(parent, file_id, region_id): 'volume', 'monograph', # 'chapter', 'letter', 'fascicle', 'fragment', 'manuscript', 'bundle', ]], self.log) - LOG.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) + self.logger.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) div = log div_type = div.get('TYPE').lower() if (div_type in [ @@ -230,7 +229,7 @@ def add_area(parent, file_id, region_id): else: # coordination div = add_div(div.getparent(), region_type, region_text) - LOG.info("continuing with %s region %s on page %s", region_type, region_id, page_id) + self.logger.info("continuing with %s region %s on page %s", region_type, region_id, page_id) if mode == 'enmap': # add to new div add_area(div, input_file.ID, region_id) diff --git a/requirements.txt b/requirements.txt index 121fdae..1aa85af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -ocrd >= 2.38.0 +ocrd >= 3.3.0 lxml click From 9ec9e74c3faae9c8973b511a36443ecb80ad64ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 01:42:11 +0200 Subject: [PATCH 02/13] =?UTF-8?q?switch=20`setup.py`=20=E2=86=92=20`pyproj?= =?UTF-8?q?ect.toml`=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (including `ocrd-tool.json` based versioning) --- Makefile | 17 +++++++++++++---- pyproject.toml | 45 +++++++++++++++++++++++++++++++++++++++++++++ setup.py | 35 ----------------------------------- 3 files changed, 58 insertions(+), 39 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.py 
diff --git a/Makefile b/Makefile index eabe93b..a761d14 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,11 @@ help: @echo @echo " Targets" @echo - @echo " deps Install only Python deps via pip" - @echo " install Install full Python package via pip" - @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " deps Install only Python deps via pip" + @echo " install Install full Python package via pip" + @echo " install-dev Install in editable mode" + @echo " build Build binary and source Python package" + @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" # Install Python deps via pip deps: @@ -21,6 +23,13 @@ deps: install: $(PIP) install . +install-dev: + $(PIP) install -e . + +build: + $(PIP) install build wheel + $(PYTHON) -m build . + docker: docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ @@ -28,4 +37,4 @@ docker: --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -.PHONY: help deps install docker +.PHONY: help deps install install-dev build docker diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b9d2025 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] + +[project] +name = "docstruct" +authors = [ + {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, +] +description = "Document structure detection from PAGE to METS" +readme = "README.md" +license.text = "Apache-2.0" +requires-python = ">=3.8" + +dynamic = ["version", "dependencies"] + +# https://pypi.org/classifiers/ +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Intended Audience :: Other Audience", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing", +] + 
+[project.scripts] +ocrd-docstruct = "docstruct.proc:cli" + +[project.urls] +Homepage = "https://github.com/bertsky/docstruct" +Repository = "https://github.com/bertsky/docstruct.git" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} +optional-dependencies.test = {file = ["requirements-test.txt"]} + +[tool.setuptools] +packages = ["docstruct"] +package-data = {"*" = ["ocrd-tool.json"]} + +[tool.coverage.run] +branch = true +source = ["docstruct"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 71cc8b7..0000000 --- a/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Installs: - - - ocrd-docstruct -""" -import codecs - -import json -from setuptools import setup, find_packages - -with open('./ocrd-tool.json', 'r') as f: - version = json.load(f)['version'] - -setup( - name='docstruct', - version=version, - description='Document structure detection from PAGE to METS', - long_description=codecs.open('README.md', encoding='utf-8').read(), - long_description_content_type='text/markdown', - author='Robert Sachunsky', - author_email='sachunsky@informatik.uni-leipzig.de', - url='https://github.com/bertsky/docstruct', - license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), - install_requires=open('requirements.txt').read().split('\n'), - package_data={ - '': ['*.json', '*.yml', '*.yaml'], - }, - entry_points={ - 'console_scripts': [ - 'ocrd-docstruct=docstruct.proc:cli', - ] - }, -) From 7846234ce6a7e2cdba826fb039909212aef00e91 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 01:42:52 +0200 Subject: [PATCH 03/13] =?UTF-8?q?update=20Docker=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - labels and variables conforming to spec - preinstall `ocrd-all-tool.json` - base stage version --- Dockerfile | 41 +++++++++++++++++++++++++++++++---------- Makefile | 2 +- 2 files changed, 32 insertions(+), 11 deletions(-) diff 
--git a/Dockerfile b/Dockerfile index c87f290..1a8a6f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,20 +3,41 @@ FROM $DOCKER_BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://ocr-d.de/kontakt" \ + maintainer="https://ocr-d.de/en/contact" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/bertsky/docstruct" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="docstruct" \ + org.opencontainers.image.description="Document structure detection from PAGE to METS" \ + org.opencontainers.image.source="https://github.com/bertsky/docstruct" \ + org.opencontainers.image.documentation="https://github.com/bertsky/docstruct/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.base.name=ocrd/core + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONIOENCODING=utf8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/docstruct -COPY setup.py . -COPY docstruct/ocrd-tool.json . -COPY docstruct ./docstruct -COPY requirements.txt . -COPY README.md . -COPY Makefile . -RUN make install -RUN rm -rf /build/docstruct +COPY . . +COPY ocrd-tool.json . 
+# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# install everything and reduce image size +RUN make install && rm -rf /build/docstruct +# smoke test +RUN ocrd-docstruct -h WORKDIR /data VOLUME ["/data"] diff --git a/Makefile b/Makefile index a761d14..3fde9f3 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/docstruct help: From b220be60e8a456e67f8795ff0606ac696a6e8802 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 18:00:50 +0200 Subject: [PATCH 04/13] fix search for top-level mets:div for existing mets:structMap --- docstruct/proc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docstruct/proc.py b/docstruct/proc.py index 442a581..f99cb9f 100644 --- a/docstruct/proc.py +++ b/docstruct/proc.py @@ -201,14 +201,14 @@ def add_area(parent, file_id, region_id): self.logger.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) else: # get deepest embedded, still non-structural existing div - log = next([log for log in reversed(self.log.iterdescendants(TAG_METS_DIV)) + log = next((log for log in reversed(list(self.log.iterdescendants(TAG_METS_DIV))) if log.get('TYPE').lower() in [ # 'serial', 'multivolume_work', 'newspaper', 'issue', # 'month', 'year', 'part', 'folder', 'map', 'illustration', 'additional', 'volume', 'monograph', # 'chapter', 'letter', 'fascicle', 'fragment', 'manuscript', 'bundle', - ]], self.log) + ]), self.log) self.logger.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) div = log div_type = div.get('TYPE').lower() From 2203a6da14ebfd92a321501d1658e5751844743f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 18:04:15 +0200 Subject: [PATCH 05/13] fix 
lxml boolean element checks (now interpreted as list) --- docstruct/proc.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docstruct/proc.py b/docstruct/proc.py index f99cb9f..8fa15fb 100644 --- a/docstruct/proc.py +++ b/docstruct/proc.py @@ -154,19 +154,17 @@ def add_div(parent, div_type, text): def add_link(page_id, div_id): # add mets:smLink entry to mets:structLink (for dfg representation) link = ET.SubElement(self.link, TAG_METS_SMLINK) - link.set('{' + NS['xlink'] + '}to', page_id) link.set('{' + NS['xlink'] + '}from', div_id) + link.set('{' + NS['xlink'] + '}to', page_id) self.link_map.setdefault(page_id, []).append(div_id) return link def add_area(parent, file_id, region_id): # add mets:fptr/mets:area entry to mets:div (for enmap representation) - fptr = parent.find(TAG_METS_FPTR) - if fptr is None: + if (fptr := parent.find(TAG_METS_FPTR)) is None: fptr = ET.SubElement(parent, TAG_METS_FPTR) - if fptr.find(TAG_METS_SEQ): - fptr = fptr.find(TAG_METS_SEQ) - elif fptr.find(TAG_METS_AREA): - area = fptr.find(TAG_METS_AREA) + if (seq := fptr.find(TAG_METS_SEQ)) is not None: + fptr = seq + elif (area := fptr.find(TAG_METS_AREA)) is not None: fptr.remove(area) fptr = ET.SubElement(fptr, TAG_METS_SEQ) fptr.append(area) From 6a02fbf08ff92e9b39230eb758769549338707e5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 18:05:11 +0200 Subject: [PATCH 06/13] use UUID for new mets:div, no assumptions on @ID of existing ones --- docstruct/proc.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/docstruct/proc.py b/docstruct/proc.py index 8fa15fb..937af57 100644 --- a/docstruct/proc.py +++ b/docstruct/proc.py @@ -1,4 +1,5 @@ from typing import Optional, get_args +import uuid from lxml import etree as ET import click @@ -37,25 +38,26 @@ def executable(self): def create_logmap_smlink(self, mets: OcrdMets): el_root = mets._tree.getroot() - self.log = 
el_root.find('mets:structMap[@TYPE="LOGICAL"]', NS) - if self.log is None: - self.log = ET.SubElement(el_root, TAG_METS_STRUCTMAP) - self.log.set('TYPE', 'LOGICAL') + if (structmap := el_root.find(TAG_METS_STRUCTMAP + '[@TYPE="LOGICAL"]')) is None: + structmap = ET.SubElement(el_root, TAG_METS_STRUCTMAP) + structmap.set('TYPE', 'LOGICAL') self.logger.info('mets:structMap LOGICAL created') else: self.logger.warning('mets:structMap LOGICAL already exists, adding to it') - self.log_map = {div.get('ID'): div for div in self.log.xpath('.//mets:div', namespaces=NS)} - self.log_ids = [id_ for id_ in self.log_map.keys() if id_ and id_.startswith("LOG_")] + self.log = structmap + divs = list(self.log.iterdescendants(TAG_METS_DIV)) + self.log_map = {div.get('ID'): div for div in divs} + self.log_ids = [div.get('ID') for div in divs] self.phy_ids = mets.physical_pages - self.link = el_root.find(TAG_METS_STRUCTLINK) - if self.link is None and self.parameter['mode'] != 'enmap': - self.link = ET.SubElement(el_root, TAG_METS_STRUCTLINK) self.link_map = dict() - if self.link is not None: + if (structlink := el_root.find(TAG_METS_STRUCTLINK)) is not None: + self.link = structlink for smlink in self.link.findall(TAG_METS_SMLINK): smlink_phy = smlink.get('{' + NS['xlink'] + '}to') smlink_log = smlink.get('{' + NS['xlink'] + '}from') self.link_map.setdefault(smlink_phy, list()).append(smlink_log) + elif self.parameter['mode'] != 'enmap': + self.link = ET.SubElement(el_root, TAG_METS_STRUCTLINK) def process_workspace(self, workspace: Workspace) -> None: """ @@ -133,16 +135,8 @@ def extract_text(self, page, input_file): def write_to_mets(self): mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink - log_ids = sorted(int(id_[4:]) for id_ in self.log_ids - if id_[4:].isnumeric()) - if log_ids: - last_id = log_ids[-1] - else: - last_id = 0 def add_div(parent, div_type, text): - nonlocal last_id - last_id += 1 - div_id = "LOG_" + str(last_id) + div_id = str(uuid.uuid4()) 
div = ET.SubElement(parent, TAG_METS_DIV) div.set('TYPE', div_type) div.set('ID', div_id) From 954d75ce909780be92ef5145cec57818712a0d2c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 18:09:18 +0200 Subject: [PATCH 07/13] add basic tests via Pytest --- .gitignore | 3 ++ .gitmodules | 3 ++ Makefile | 47 +++++++++++++++++--- repo/assets | 1 + requirements-test.txt | 3 ++ tests/__init__.py | 0 tests/conftest.py | 100 ++++++++++++++++++++++++++++++++++++++++++ tests/test_all.py | 72 ++++++++++++++++++++++++++++++ 8 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 .gitmodules create mode 160000 repo/assets create mode 100644 requirements-test.txt create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_all.py diff --git a/.gitignore b/.gitignore index 4074f70..c14e1ad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +tests/assets/ +/*.zip + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5b24fbb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "repo/assets"] + path = repo/assets + url = https://github.com/OCR-D/assets diff --git a/Makefile b/Makefile index 3fde9f3..2157ae3 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS ?= "-vv" DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/docstruct @@ -9,11 +10,20 @@ help: @echo @echo " Targets" @echo - @echo " deps Install only Python deps via pip" - @echo " install Install full Python package via pip" - @echo " install-dev Install in editable mode" - @echo " build Build binary and source Python package" - @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " deps Install only Python deps via pip" + @echo " install Install full Python package via pip" + @echo " install-dev Install in editable mode" + @echo " build Build 
binary and source Python package" + @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " test Run tests via Pytest" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" + @echo " tests/assets Copy to ./tests/assets" + @echo "" + @echo " Variables" + @echo "" + @echo " DOCKER_TAG Docker container tag ($(DOCKER_TAG))" + @echo " PYTEST_ARGS Additional runtime options for pytest ($(PYTEST_ARGS))" + @echo " (See --help, esp. custom option --workspace)" # Install Python deps via pip deps: @@ -30,6 +40,33 @@ build: $(PIP) install build wheel $(PYTHON) -m build . +# Run test +test: tests/assets + $(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) + +# +# Assets +# + +# Update OCR-D/assets submodule +.PHONY: repos always-update tests/assets +repo/assets: always-update + git submodule sync --recursive $@ + if git submodule status --recursive $@ | grep -qv '^ '; then \ + git submodule update --init --recursive $@ && \ + touch $@; \ + fi + +benner_herrnhuterey04_1748.ocrd.zip: + wget https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/$@ + +# Setup test assets +tests/assets: benner_herrnhuterey04_1748.ocrd.zip +tests/assets: repo/assets + mkdir -p $@ + cp -a $ 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + directory = str(tmpdir) + resolver = Resolver() + url = WORKSPACES[asset] + workspace = resolver.workspace_from_url(url, dst_dir=directory) # download=True + workspace.name = asset # for debugging + # determine GT file group and download PAGE files + gtGrp = None + for file in workspace.find_files(mimetype=MIMETYPE_PAGE): + if file.url.startswith("file:/"): + # ignore broken and irrelevant groups + workspace.remove_file(file.ID, force=True) + elif 'GT' in file.fileGrp and (gtGrp or file.fileGrp) == file.fileGrp: + gtGrp = file.fileGrp + workspace.download_file(file) + yield workspace, gtGrp + disableLogging() + +def pytest_addoption(parser): + parser.addoption("--workspace", + action="append", + 
choices=list(WORKSPACES) + ["all"], + help="workspace(s) to run on (set 'all' to use all)" + ) + +@pytest.hookimpl +def pytest_generate_tests(metafunc): + if "asset" in metafunc.fixturenames: + ws = metafunc.config.getoption("workspace") + if ws == ['all']: + ws = list(WORKSPACES) + elif not ws: + ws = ["aufklaerung"] # default + metafunc.parametrize("asset", ws) + +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@pytest.fixture(params=CONFIGS) +def processor_kwargs(request, workspace): + config.OCRD_DOWNLOAD_INPUT = False # only pre-downloaded pages + workspace, gt_grp = workspace + config.OCRD_MISSING_OUTPUT = "ABORT" # --debug + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + #print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + #print("enabled page-parallel processing") + def _start_mets_server(*args, **kwargs): + #print("running with METS server") + server = OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + # instantiate client-side workspace + asset = workspace.name + workspace = Workspace(workspace.resolver, workspace.directory, + mets_server_url='mets.sock', + mets_basename=os.path.basename(workspace.mets_target)) + workspace.name = asset + yield {'workspace': workspace, 'input_file_grp': gt_grp, 'mets_server_url': 'mets.sock'} + process.terminate() + else: + yield {'workspace': workspace, 'input_file_grp': gt_grp} + config.reset_defaults() diff --git a/tests/test_all.py b/tests/test_all.py new file mode 100644 index 0000000..2c93230 --- /dev/null +++ b/tests/test_all.py @@ -0,0 +1,72 @@ +# pylint: disable=import-error + +import os +import pytest + +from ocrd import Workspace, run_processor +from ocrd_models.constants import NAMESPACES as NS + +from docstruct.proc import OcrdDocStruct + +# from ocrd_pagetopdf +def 
get_structure(mets): + metsroot = mets._tree.getroot() + structlink = next(metsroot.iterfind('.//mets:structLink', NS), None) + smlinks = {link.get('{http://www.w3.org/1999/xlink}from'): + link.get('{http://www.w3.org/1999/xlink}to') + for link in reversed(structlink.findall('./mets:smLink', NS) + if structlink else [])} + phymap = next(structmap for structmap in metsroot.iterfind('.//mets:structMap', NS) + if structmap.get('TYPE') == 'PHYSICAL') + topdiv = next(phymap.iterfind('./mets:div', NS)) + pages = {page.get('ID'): page.get('ORDER') or order + for order, page in enumerate(topdiv.findall('./mets:div', NS)) + if page.get('TYPE') == "page"} + logmap = next((structmap for structmap in metsroot.iterfind('.//mets:structMap', NS) + if structmap.get('TYPE') == 'LOGICAL'), None) + if logmap is None: + return None + topdiv = next(logmap.iterfind('./mets:div', NS), None) + if topdiv is None: + return None + # descend to deepest ADM + while (topdiv.get('ADMID') is None and + (div := topdiv.find('./mets:div', NS)) is not None): + topdiv = div + # we want to dive into multivolume_work, periodical, newspaper, year, month... + # we are looking for issue, volume, monograph, lecture, dossier, act, judgement, study, paper, *_thesis, report, register, file, fragment, manuscript... 
+ while ((div := topdiv.find('./mets:div', NS)) is not None and + div.get('ADMID') is not None): + topdiv = div + #for div in topdiv.iterdescendants('{%s}div' % NS['mets']): + # recursive: + def find_depth(div, depth=0): + div_id = div.get('ID', div.getparent().get('ID')) + return { + 'label': div.get('LABEL') or div.get('ORDERLABEL') or '', + 'type': div.get('TYPE') or '', + 'id': div_id, + 'page': pages.get(smlinks.get(div_id, ''), ''), + 'depth': depth, + 'subs': [find_depth(subdiv, depth+1) + for subdiv in div.findall('./mets:div', NS)] + } + struct = find_depth(topdiv) + return struct + +def test_docstruct(processor_kwargs): + ws = processor_kwargs['workspace'] + input_file_grp = processor_kwargs['input_file_grp'] + if not input_file_grp: + pytest.skip(f"workspace asset {ws.name} has no PAGE GT fileGrp") + offline_ws = Workspace(ws.resolver, ws.directory, mets_basename=os.path.basename(ws.mets_target)) + structure_old = get_structure(offline_ws.mets) + run_processor(OcrdDocStruct, + output_file_grp="", # as long as core#1321 is open, we must pass something here + parameter=dict(mode="enmap"), + **processor_kwargs, + ) + ws.save_mets() + offline_ws.reload_mets() + structure_new = get_structure(offline_ws.mets) + assert structure_old != structure_new From 118336d3b51d8d9563043dc0c05be196ab0f54b2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 18:33:58 +0200 Subject: [PATCH 08/13] make work in the presence of a METS Server --- docstruct/proc.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/docstruct/proc.py b/docstruct/proc.py index 937af57..19b6bc5 100644 --- a/docstruct/proc.py +++ b/docstruct/proc.py @@ -1,3 +1,4 @@ +import os from typing import Optional, get_args import uuid @@ -15,6 +16,7 @@ from ocrd_models.ocrd_file import OcrdFileType from ocrd_models.ocrd_page import OcrdPage from ocrd_models.ocrd_mets import OcrdMets +from ocrd.mets_server import ClientSideOcrdMets from 
ocrd_models.constants import ( NAMESPACES as NS, TAG_METS_DIV, @@ -58,14 +60,38 @@ def create_logmap_smlink(self, mets: OcrdMets): self.link_map.setdefault(smlink_phy, list()).append(smlink_log) elif self.parameter['mode'] != 'enmap': self.link = ET.SubElement(el_root, TAG_METS_STRUCTLINK) + else: + self.link = None + + def reset(self): + del self.log + del self.log_map + del self.log_ids + del self.phy_ids + del self.link_map + del self.link + del self.results def process_workspace(self, workspace: Workspace) -> None: """ """ - self.create_logmap_smlink(workspace.mets) + if isinstance(workspace.mets, ClientSideOcrdMets): + # serialise and write METS to disk + # (in-memory changes could come from prior processing step) + workspace.save_mets() + # instantiate (read and parse) METS from disk (read-only, metadata are constant) + ws = Workspace(workspace.resolver, workspace.directory, + mets_basename=os.path.basename(workspace.mets_target)) + else: + ws = workspace + self.create_logmap_smlink(ws.mets) + self.results = [] super().process_workspace(workspace) - self.write_to_mets() + self.update_mets() + self.reset() + ws.save_mets() + if isinstance(workspace.mets, ClientSideOcrdMets): + workspace.reload_mets() def process_page_file(self, input_file : OcrdFileType) -> None: assert isinstance(input_file, get_args(OcrdFileType)) @@ -133,7 +159,7 @@ def extract_text(self, page, input_file): result.append([input_file, region.id, region_xywh, 'text', '']) return result - def write_to_mets(self): + def update_mets(self): mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink def add_div(parent, div_type, text): div_id = str(uuid.uuid4()) From a8bc99d9484f45d9fc3c382b6534017dea4459df Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 19:02:37 +0200 Subject: [PATCH 09/13] fix tests --- Makefile | 2 +- tests/conftest.py | 1 - tests/test_all.py | 15 ++++++--------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile 
index 2157ae3..82c884a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 -PYTEST_ARGS ?= "-vv" +PYTEST_ARGS ?= "-vv --workspace=all" DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/docstruct diff --git a/tests/conftest.py b/tests/conftest.py index 7ed5b17..01c7e12 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,6 @@ WORKSPACES = { "manifesto": assets.path_to('communist_manifesto/data/mets.xml'), "aufklaerung": assets.path_to('kant_aufklaerung_1784/data/mets.xml'), - "sbb": assets.path_to('SBB0000F29300010000/data/mets.xml'), "herrnhuterey04": assets.path_to('benner_herrnhuterey04_1748.ocrd/mets.xml'), } diff --git a/tests/test_all.py b/tests/test_all.py index 2c93230..5e802b4 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -11,23 +11,20 @@ # from ocrd_pagetopdf def get_structure(mets): metsroot = mets._tree.getroot() - structlink = next(metsroot.iterfind('.//mets:structLink', NS), None) + structlink = metsroot.find('mets:structLink', NS) smlinks = {link.get('{http://www.w3.org/1999/xlink}from'): link.get('{http://www.w3.org/1999/xlink}to') for link in reversed(structlink.findall('./mets:smLink', NS) - if structlink else [])} - phymap = next(structmap for structmap in metsroot.iterfind('.//mets:structMap', NS) - if structmap.get('TYPE') == 'PHYSICAL') + if structlink is not None else [])} + phymap = metsroot.find('mets:structMap[@TYPE="PHYSICAL"]', NS) topdiv = next(phymap.iterfind('./mets:div', NS)) pages = {page.get('ID'): page.get('ORDER') or order for order, page in enumerate(topdiv.findall('./mets:div', NS)) if page.get('TYPE') == "page"} - logmap = next((structmap for structmap in metsroot.iterfind('.//mets:structMap', NS) - if structmap.get('TYPE') == 'LOGICAL'), None) + logmap = metsroot.find('mets:structMap[@TYPE="LOGICAL"]', NS) if logmap is None: return None - topdiv = next(logmap.iterfind('./mets:div', NS), None) - if topdiv is None: + if (topdiv := 
logmap.find('./mets:div', NS)) is None: return None # descend to deepest ADM while (topdiv.get('ADMID') is None and @@ -58,7 +55,7 @@ def test_docstruct(processor_kwargs): ws = processor_kwargs['workspace'] input_file_grp = processor_kwargs['input_file_grp'] if not input_file_grp: - pytest.skip(f"workspace asset {ws.name} has no PAGE GT fileGrp") + pytest.skip(f"workspace asset '{ws.name}' has no PAGE GT fileGrp") offline_ws = Workspace(ws.resolver, ws.directory, mets_basename=os.path.basename(ws.mets_target)) structure_old = get_structure(offline_ws.mets) run_processor(OcrdDocStruct, From 4e27195a677b26824049755dc83328baa0949b3d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 19:09:53 +0200 Subject: [PATCH 10/13] test both mode=enmap and mode=dfg via subtests --- tests/test_all.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/test_all.py b/tests/test_all.py index 5e802b4..0578cea 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -51,19 +51,25 @@ def find_depth(div, depth=0): struct = find_depth(topdiv) return struct -def test_docstruct(processor_kwargs): +def test_docstruct(processor_kwargs, subtests): ws = processor_kwargs['workspace'] input_file_grp = processor_kwargs['input_file_grp'] if not input_file_grp: pytest.skip(f"workspace asset '{ws.name}' has no PAGE GT fileGrp") + # for tests w/ METS Server, retrieve a new OcrdMets directly from the file offline_ws = Workspace(ws.resolver, ws.directory, mets_basename=os.path.basename(ws.mets_target)) structure_old = get_structure(offline_ws.mets) - run_processor(OcrdDocStruct, - output_file_grp="", # as long as core#1321 is open, we must something here - parameter=dict(mode="enmap"), - **processor_kwargs, - ) - ws.save_mets() - offline_ws.reload_mets() - structure_new = get_structure(offline_ws.mets) - assert structure_old != structure_new + for mode in ['enmap', 'dfg']: + with subtests.test(mode=mode): + run_processor(OcrdDocStruct, 
+ output_file_grp="", # as long as core#1321 is open, we must something here + parameter=dict(mode=mode), + **processor_kwargs, + ) + ws.save_mets() + offline_ws.reload_mets() + structure_new = get_structure(offline_ws.mets) + assert structure_old != structure_new + # reset mets.xml to previous state + offline_ws.save_mets() + From 377cf48d0a7a2c597a63160dd8406cb9eb609289 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 19:29:47 +0200 Subject: [PATCH 11/13] fix uuid generation: xs:ID needs alphabetical prefix --- docstruct/proc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docstruct/proc.py b/docstruct/proc.py index 19b6bc5..4e30f8b 100644 --- a/docstruct/proc.py +++ b/docstruct/proc.py @@ -162,7 +162,7 @@ def extract_text(self, page, input_file): def update_mets(self): mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink def add_div(parent, div_type, text): - div_id = str(uuid.uuid4()) + div_id = 'uuid-' + str(uuid.uuid4()) div = ET.SubElement(parent, TAG_METS_DIV) div.set('TYPE', div_type) div.set('ID', div_id) From 345c990f5108ef88be649e52487af1054b716a22 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 19:58:10 +0200 Subject: [PATCH 12/13] fix subtest cleanup, add more asserts and METS XSD validation --- tests/test_all.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_all.py b/tests/test_all.py index 0578cea..6266570 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1,10 +1,12 @@ # pylint: disable=import-error import os +from pathlib import Path import pytest from ocrd import Workspace, run_processor from ocrd_models.constants import NAMESPACES as NS +from ocrd_validators.xsd_mets_validator import XsdMetsValidator from docstruct.proc import OcrdDocStruct @@ -59,6 +61,7 @@ def test_docstruct(processor_kwargs, subtests): # for tests w/ METS Server, retrieve a new OcrdMets directly from the file offline_ws = 
Workspace(ws.resolver, ws.directory, mets_basename=os.path.basename(ws.mets_target)) structure_old = get_structure(offline_ws.mets) + mets_old = offline_ws.mets.to_xml(xmllint=True).decode('utf-8') for mode in ['enmap', 'dfg']: with subtests.test(mode=mode): run_processor(OcrdDocStruct, @@ -70,6 +73,12 @@ def test_docstruct(processor_kwargs, subtests): offline_ws.reload_mets() structure_new = get_structure(offline_ws.mets) assert structure_old != structure_new - # reset mets.xml to previous state - offline_ws.save_mets() - + if structure_old: + assert structure_old['id'] == structure_new['id'] + assert len(structure_new['subs']) > 0 + report = XsdMetsValidator.validate(Path(ws.mets_target)) + assert not report.errors + # reset METS to previous state + with open(ws.mets_target, 'w') as mets_file: + mets_file.write(mets_old) + ws.reload_mets() From 867cd017aaf9f5b62e0f199182460d6b2eb637ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 9 Apr 2025 20:02:13 +0200 Subject: [PATCH 13/13] add coverage test, add CI --- .github/workflows/test-python.yml | 34 +++++++++++++++++++++++++++++++ Makefile | 10 ++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-python.yml diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml new file mode 100644 index 0000000..81ea755 --- /dev/null +++ b/.github/workflows/test-python.yml @@ -0,0 +1,34 @@ +name: Pytest CI + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + make install-dev + make deps-test + - name: Test with pytest + run: make coverage PYTEST_ARGS="-vv --workspace=all --junitxml=pytest.xml" + - name: 
Get coverage results + run: | + coverage report --format=markdown >> $GITHUB_STEP_SUMMARY + coverage xml + - name: Store coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-report_${{ matrix.python-version }} + path: pytest.xml diff --git a/Makefile b/Makefile index 82c884a..fbf845f 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,9 @@ help: deps: $(PIP) install -r requirements.txt +deps-test: + $(PIP) install -r requirements-test.txt + # Install Python package via pip install: $(PIP) install . @@ -44,6 +47,11 @@ build: test: tests/assets $(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) +coverage: + coverage erase + $(MAKE) test PYTHON="coverage run" + coverage report -m + # # Assets # @@ -74,4 +82,4 @@ docker: --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG) . -.PHONY: help deps install install-dev build docker +.PHONY: help deps deps-test install install-dev build test coverage docker