diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml new file mode 100644 index 0000000..81ea755 --- /dev/null +++ b/.github/workflows/test-python.yml @@ -0,0 +1,34 @@ +name: Pytest CI + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + make install-dev + make deps-test + - name: Test with pytest + run: make coverage PYTEST_ARGS="-vv --workspace=all --junitxml=pytest.xml" + - name: Get coverage results + run: | + coverage report --format=markdown >> $GITHUB_STEP_SUMMARY + coverage xml + - name: Store coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-report_${{ matrix.python-version }} + path: pytest.xml diff --git a/.gitignore b/.gitignore index 4074f70..c14e1ad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +tests/assets/ +/*.zip + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5b24fbb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "repo/assets"] + path = repo/assets + url = https://github.com/OCR-D/assets diff --git a/Dockerfile b/Dockerfile index c87f290..1a8a6f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,20 +3,41 @@ FROM $DOCKER_BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://ocr-d.de/kontakt" \ + maintainer="https://ocr-d.de/en/contact" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/bertsky/docstruct" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + 
org.opencontainers.image.title="docstruct" \ +    org.opencontainers.image.description="Document structure detection from PAGE to METS" \ +    org.opencontainers.image.source="https://github.com/bertsky/docstruct" \ +    org.opencontainers.image.documentation="https://github.com/bertsky/docstruct/blob/${VCS_REF}/README.md" \ +    org.opencontainers.image.revision=$VCS_REF \ +    org.opencontainers.image.created=$BUILD_DATE \ +    org.opencontainers.image.base.name=ocrd/core + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONIOENCODING=utf8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/docstruct -COPY setup.py . -COPY docstruct/ocrd-tool.json . -COPY docstruct ./docstruct -COPY requirements.txt . -COPY README.md . -COPY Makefile . -RUN make install -RUN rm -rf /build/docstruct +COPY . . +COPY ocrd-tool.json . 
+# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# install everything and reduce image size +RUN make install && rm -rf /build/docstruct +# smoke test +RUN ocrd-docstruct -h WORKDIR /data VOLUME ["/data"] diff --git a/Makefile b/Makefile index eabe93b..fbf845f 100644 --- a/Makefile +++ b/Makefile @@ -1,26 +1,80 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS ?= "-vv --workspace=all" -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/docstruct help: @echo @echo " Targets" @echo - @echo " deps Install only Python deps via pip" - @echo " install Install full Python package via pip" - @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " deps Install only Python deps via pip" + @echo " install Install full Python package via pip" + @echo " install-dev Install in editable mode" + @echo " build Build binary and source Python package" + @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " test Run tests via Pytest" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" + @echo " tests/assets Copy to ./tests/assets" + @echo "" + @echo " Variables" + @echo "" + @echo " DOCKER_TAG Docker container tag ($(DOCKER_TAG))" + @echo " PYTEST_ARGS Additional runtime options for pytest ($(PYTEST_ARGS))" + @echo " (See --help, esp. custom option --workspace)" # Install Python deps via pip deps: $(PIP) install -r requirements.txt +deps-test: + $(PIP) install -r requirements-test.txt + # Install Python package via pip install: $(PIP) install . +install-dev: + $(PIP) install -e . + +build: + $(PIP) install build wheel + $(PYTHON) -m build . 
+ +# Run test +test: tests/assets + $(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) + +coverage: + coverage erase + $(MAKE) test PYTHON="coverage run" + coverage report -m + +# +# Assets +# + +# Update OCR-D/assets submodule +.PHONY: repos always-update tests/assets +repo/assets: always-update + git submodule sync --recursive $@ + if git submodule status --recursive $@ | grep -qv '^ '; then \ + git submodule update --init --recursive $@ && \ + touch $@; \ + fi + +benner_herrnhuterey04_1748.ocrd.zip: + wget https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/$@ + +# Setup test assets +tests/assets: benner_herrnhuterey04_1748.ocrd.zip +tests/assets: repo/assets + mkdir -p $@ + cp -a $ None: """ """ - LOG = getLogger('OcrdDocStruct') - assert_file_grp_cardinality(self.input_file_grp, 1) + if isinstance(workspace.mets, ClientSideOcrdMets): + # serialise and write METS to disk + # (in-memory changes could come from prio processing step) + workspace.save_mets() + # instantiate (read and parse) METS from disk (read-only, metadata are constant) + ws = Workspace(workspace.resolver, workspace.directory, + mets_basename=os.path.basename(workspace.mets_target)) + else: + ws = workspace + self.create_logmap_smlink(ws.mets) + self.results = [] + super().process_workspace(workspace) + self.update_mets() + self.reset() + ws.save_mets() + if isinstance(workspace.mets, ClientSideOcrdMets): + workspace.reload_mets() + + def process_page_file(self, input_file : OcrdFileType) -> None: + assert isinstance(input_file, get_args(OcrdFileType)) + page_id = input_file.pageId + self._base_logger.info("processing page %s", page_id) + self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + input_pcgts = page_from_file(input_file) + assert isinstance(input_pcgts, OcrdPage) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") + return + mode = 
self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink # FIXME: more parameters (what kind of region types, geometric rules etc) - self.create_logmap_smlink() - - results = [] - for input_file in self.input_files: - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s", page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - if page.get_type() in ['front-cover', 'back-cover', 'title', 'blank']: - LOG.info("skipping page type %s", page.get_type()) - continue - if page.get_type() in ['table-of-contents', 'index']: - # FIXME use directly - LOG.info("skipping page type %s", page.get_type()) - results.extend(self.extract_text(page, input_file)) - self.write_to_mets(results) + page = input_pcgts.get_Page() + if page.get_type() in ['front-cover', 'back-cover', 'title', 'blank']: + self.logger.info("skipping page type %s", page.get_type()) + return + if page.get_type() in ['table-of-contents', 'index']: + # FIXME use directly + self.logger.info("skipping page type %s", page.get_type()) + return + self.results.extend(self.extract_text(page, input_file)) def extract_text(self, page, input_file): """ get text regions in reading order, put them into a hierarchy (via heuristic rules) """ - LOG = getLogger('OcrdDocStruct') target = self.parameter['type'] result = [] # FIXME: what about non-text regions (tables, images)? 
@@ -119,7 +147,7 @@ def extract_text(self, page, input_file): region_xywh = xywh_from_points(region.get_Coords().points) region_text = region.get_TextEquiv() if not region_text: - LOG.warning("skipping empty text region %s", region.id) + self.logger.warning("skipping empty text region %s", region.id) continue region_text = region_text[0].Unicode # FIXME: textual cues @@ -131,19 +159,10 @@ def extract_text(self, page, input_file): result.append([input_file, region.id, region_xywh, 'text', '']) return result - def write_to_mets(self, results): - LOG = getLogger('OcrdDocStruct') + def update_mets(self): mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink - log_ids = sorted(int(id_[4:]) for id_ in self.log_ids - if id_[4:].isnumeric()) - if log_ids: - last_id = log_ids[-1] - else: - last_id = 0 def add_div(parent, div_type, text): - nonlocal last_id - last_id += 1 - div_id = "LOG_" + str(last_id) + div_id = 'uuid-' + str(uuid.uuid4()) div = ET.SubElement(parent, TAG_METS_DIV) div.set('TYPE', div_type) div.set('ID', div_id) @@ -155,19 +174,17 @@ def add_div(parent, div_type, text): def add_link(page_id, div_id): # add mets:smLink entry to mets:structLink (for dfg representation) link = ET.SubElement(self.link, TAG_METS_SMLINK) - link.set('{' + NS['xlink'] + '}to', page_id) link.set('{' + NS['xlink'] + '}from', div_id) + link.set('{' + NS['xlink'] + '}to', page_id) self.link_map.setdefault(page_id, []).append(div_id) return link def add_area(parent, file_id, region_id): # add mets:fptr/mets:area entry to mets:div (for enmap representation) - fptr = parent.find(TAG_METS_FPTR) - if fptr is None: + if (fptr := parent.find(TAG_METS_FPTR)) is None: fptr = ET.SubElement(parent, TAG_METS_FPTR) - if fptr.find(TAG_METS_SEQ): - fptr = fptr.find(TAG_METS_SEQ) - elif fptr.find(TAG_METS_AREA): - area = fptr.find(TAG_METS_AREA) + if (seq := fptr.find(TAG_METS_SEQ)) is not None: + fptr = seq + elif (area := fptr.find(TAG_METS_AREA)) is not None: fptr.remove(area) 
fptr = ET.SubElement(fptr, TAG_METS_SEQ) fptr.append(area) @@ -179,13 +196,13 @@ def add_area(parent, file_id, region_id): div = None last_type = None last_page = None - for input_file, region_id, region_xywh, region_type, region_text in results: + for input_file, region_id, region_xywh, region_type, region_text in self.results: page_id = input_file.pageId if region_type == 'text': if div is None: - LOG.warning("%s: skipping region '%s' prior to first heading", page_id, region_id) + self.logger.warning("%s: skipping region '%s' prior to first heading", page_id, region_id) continue - LOG.info("continuing with text region %s on page %s", region_id, page_id) + self.logger.info("continuing with text region %s on page %s", region_id, page_id) if mode == 'enmap': # add to current div add_area(div, input_file.ID, region_id) @@ -199,18 +216,18 @@ def add_area(parent, file_id, region_id): loglist = self.link_map.get(page_id, []) if len(loglist): log = self.log_map[loglist[-1]] - LOG.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) + self.logger.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) else: # get deepest embedded, still non-structural existing div - log = next([log for log in reversed(self.log.iterdescendants(TAG_METS_DIV)) + log = next((log for log in reversed(list(self.log.iterdescendants(TAG_METS_DIV))) if log.get('TYPE').lower() in [ # 'serial', 'multivolume_work', 'newspaper', 'issue', # 'month', 'year', 'part', 'folder', 'map', 'illustration', 'additional', 'volume', 'monograph', # 'chapter', 'letter', 'fascicle', 'fragment', 'manuscript', 'bundle', - ]], self.log) - LOG.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) + ]), self.log) + self.logger.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) div = log div_type = div.get('TYPE').lower() if (div_type in [ @@ -230,7 +247,7 @@ def add_area(parent, file_id, region_id): 
else: # coordination div = add_div(div.getparent(), region_type, region_text) - LOG.info("continuing with %s region %s on page %s", region_type, region_id, page_id) + self.logger.info("continuing with %s region %s on page %s", region_type, region_id, page_id) if mode == 'enmap': # add to new div add_area(div, input_file.ID, region_id) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b9d2025 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] + +[project] +name = "docstruct" +authors = [ + {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, +] +description = "Document structure detection from PAGE to METS" +readme = "README.md" +license.text = "Apache-2.0" +requires-python = ">=3.8" + +dynamic = ["version", "dependencies"] + +# https://pypi.org/classifiers/ +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Intended Audience :: Other Audience", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing", +] + +[project.scripts] +ocrd-docstruct = "docstruct.proc:cli" + +[project.urls] +Homepage = "https://github.com/bertsky/docstruct" +Repository = "https://github.com/bertsky/docstruct.git" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} +optional-dependencies.test = {file = ["requirements-test.txt"]} + +[tool.setuptools] +packages = ["docstruct"] +package-data = {"*" = ["ocrd-tool.json"]} + +[tool.coverage.run] +branch = true +source = ["docstruct"] diff --git a/repo/assets b/repo/assets new file mode 160000 index 0000000..d004ab7 --- /dev/null +++ b/repo/assets @@ -0,0 +1 @@ +Subproject commit d004ab7211fb3d57801e783b184a7ded1e2f5e4b diff --git a/requirements-test.txt b/requirements-test.txt new file mode 
100644 index 0000000..934c793 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,3 @@ +pytest +pytest-subtests +coverage diff --git a/requirements.txt b/requirements.txt index 121fdae..1aa85af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -ocrd >= 2.38.0 +ocrd >= 3.3.0 lxml click diff --git a/setup.py b/setup.py deleted file mode 100644 index 71cc8b7..0000000 --- a/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Installs: - - - ocrd-docstruct -""" -import codecs - -import json -from setuptools import setup, find_packages - -with open('./ocrd-tool.json', 'r') as f: - version = json.load(f)['version'] - -setup( - name='docstruct', - version=version, - description='Document structure detection from PAGE to METS', - long_description=codecs.open('README.md', encoding='utf-8').read(), - long_description_content_type='text/markdown', - author='Robert Sachunsky', - author_email='sachunsky@informatik.uni-leipzig.de', - url='https://github.com/bertsky/docstruct', - license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), - install_requires=open('requirements.txt').read().split('\n'), - package_data={ - '': ['*.json', '*.yml', '*.yaml'], - }, - entry_points={ - 'console_scripts': [ - 'ocrd-docstruct=docstruct.proc:cli', - ] - }, -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..01c7e12 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,99 @@ +# pylint: disable=unused-import + +from multiprocessing import Process +from time import sleep +from random import seed, sample +import os +import pytest + +from ocrd import Resolver, Workspace, OcrdMetsServer +from ocrd_utils import ( + pushd_popd, + disableLogging, + initLogging, + setOverrideLogLevel, + config, + MIMETYPE_PAGE +) + +from .assets import assets + +WORKSPACES = { + "manifesto": 
assets.path_to('communist_manifesto/data/mets.xml'), + "aufklaerung": assets.path_to('kant_aufklaerung_1784/data/mets.xml'), + "herrnhuterey04": assets.path_to('benner_herrnhuterey04_1748.ocrd/mets.xml'), +} + +#@pytest.fixture(params=WORKSPACES.keys()) +@pytest.fixture +def workspace(tmpdir, pytestconfig, asset): + initLogging() + if pytestconfig.getoption('verbose') > 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + directory = str(tmpdir) + resolver = Resolver() + url = WORKSPACES[asset] + workspace = resolver.workspace_from_url(url, dst_dir=directory) # download=True + workspace.name = asset # for debugging + # determine GT file group and download PAGE files + gtGrp = None + for file in workspace.find_files(mimetype=MIMETYPE_PAGE): + if file.url.startswith("file:/"): + # ignore broken and irrelevant groups + workspace.remove_file(file.ID, force=True) + elif 'GT' in file.fileGrp and (gtGrp or file.fileGrp) == file.fileGrp: + gtGrp = file.fileGrp + workspace.download_file(file) + yield workspace, gtGrp + disableLogging() + +def pytest_addoption(parser): + parser.addoption("--workspace", + action="append", + choices=list(WORKSPACES) + ["all"], + help="workspace(s) to run on (set 'all' to use all)" + ) + +@pytest.hookimpl +def pytest_generate_tests(metafunc): + if "asset" in metafunc.fixturenames: + ws = metafunc.config.getoption("workspace") + if ws == ['all']: + ws = list(WORKSPACES) + elif not ws: + ws = ["aufklaerung"] # default + metafunc.parametrize("asset", ws) + +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@pytest.fixture(params=CONFIGS) +def processor_kwargs(request, workspace): + config.OCRD_DOWNLOAD_INPUT = False # only pre-downloaded pages + workspace, gt_grp = workspace + config.OCRD_MISSING_OUTPUT = "ABORT" # --debug + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + #print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + #print("enabled 
page-parallel processing") + def _start_mets_server(*args, **kwargs): + #print("running with METS server") + server = OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + # instantiate client-side workspace + asset = workspace.name + workspace = Workspace(workspace.resolver, workspace.directory, + mets_server_url='mets.sock', + mets_basename=os.path.basename(workspace.mets_target)) + workspace.name = asset + yield {'workspace': workspace, 'input_file_grp': gt_grp, 'mets_server_url': 'mets.sock'} + process.terminate() + else: + yield {'workspace': workspace, 'input_file_grp': gt_grp} + config.reset_defaults() diff --git a/tests/test_all.py b/tests/test_all.py new file mode 100644 index 0000000..6266570 --- /dev/null +++ b/tests/test_all.py @@ -0,0 +1,84 @@ +# pylint: disable=import-error + +import os +from pathlib import Path +import pytest + +from ocrd import Workspace, run_processor +from ocrd_models.constants import NAMESPACES as NS +from ocrd_validators.xsd_mets_validator import XsdMetsValidator + +from docstruct.proc import OcrdDocStruct + +# from ocrd_pagetopdf +def get_structure(mets): + metsroot = mets._tree.getroot() + structlink = metsroot.find('mets:structLink', NS) + smlinks = {link.get('{http://www.w3.org/1999/xlink}from'): + link.get('{http://www.w3.org/1999/xlink}to') + for link in reversed(structlink.findall('./mets:smLink', NS) + if structlink is not None else [])} + phymap = metsroot.find('mets:structMap[@TYPE="PHYSICAL"]', NS) + topdiv = next(phymap.iterfind('./mets:div', NS)) + pages = {page.get('ID'): page.get('ORDER') or order + for order, page in enumerate(topdiv.findall('./mets:div', NS)) + if page.get('TYPE') == "page"} + logmap = metsroot.find('mets:structMap[@TYPE="LOGICAL"]', NS) + if logmap is None: + return None + if (topdiv := logmap.find('./mets:div', NS)) is None: + return None + # descend to deepest ADM 
+ while (topdiv.get('ADMID') is None and + (div := topdiv.find('./mets:div', NS)) is not None): + topdiv = div + # we want to dive into multivolume_work, periodical, newspaper, year, month... + # we are looking for issue, volume, monograph, lecture, dossier, act, judgement, study, paper, *_thesis, report, register, file, fragment, manuscript... + while ((div := topdiv.find('./mets:div', NS)) is not None and + div.get('ADMID') is not None): + topdiv = div + #for div in topdiv.iterdescendants('{%s}div' % NS['mets']): + # recursive: + def find_depth(div, depth=0): + div_id = div.get('ID', div.getparent().get('ID')) + return { + 'label': div.get('LABEL') or div.get('ORDERLABEL') or '', + 'type': div.get('TYPE') or '', + 'id': div_id, + 'page': pages.get(smlinks.get(div_id, ''), ''), + 'depth': depth, + 'subs': [find_depth(subdiv, depth+1) + for subdiv in div.findall('./mets:div', NS)] + } + struct = find_depth(topdiv) + return struct + +def test_docstruct(processor_kwargs, subtests): + ws = processor_kwargs['workspace'] + input_file_grp = processor_kwargs['input_file_grp'] + if not input_file_grp: + pytest.skip(f"workspace asset '{ws.name}' has no PAGE GT fileGrp") + # for tests w/ METS Server, retrieve a new OcrdMets directly from the file + offline_ws = Workspace(ws.resolver, ws.directory, mets_basename=os.path.basename(ws.mets_target)) + structure_old = get_structure(offline_ws.mets) + mets_old = offline_ws.mets.to_xml(xmllint=True).decode('utf-8') + for mode in ['enmap', 'dfg']: + with subtests.test(mode=mode): + run_processor(OcrdDocStruct, + output_file_grp="", # as long as core#1321 is open, we must pass something here + parameter=dict(mode=mode), + **processor_kwargs, + ) + ws.save_mets() + offline_ws.reload_mets() + structure_new = get_structure(offline_ws.mets) + assert structure_old != structure_new + if structure_old: + assert structure_old['id'] == structure_new['id'] + assert len(structure_new['subs']) > 0 + report = 
XsdMetsValidator.validate(Path(ws.mets_target)) + assert not report.errors + # reset METS to previous state + with open(ws.mets_target, 'w') as mets_file: + mets_file.write(mets_old) + ws.reload_mets()