diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml new file mode 100644 index 0000000..81ea755 --- /dev/null +++ b/.github/workflows/test-python.yml @@ -0,0 +1,34 @@ +name: Pytest CI + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + make install-dev + make deps-test + - name: Test with pytest + run: make coverage PYTEST_ARGS="-vv --workspace=all --junitxml=pytest.xml" + - name: Get coverage results + run: | + coverage report --format=markdown >> $GITHUB_STEP_SUMMARY + coverage xml + - name: Store coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-report_${{ matrix.python-version }} + path: pytest.xml diff --git a/.gitignore b/.gitignore index 4074f70..c14e1ad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +tests/assets/ +/*.zip + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5b24fbb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "repo/assets"] + path = repo/assets + url = https://github.com/OCR-D/assets diff --git a/Dockerfile b/Dockerfile index c87f290..1a8a6f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,20 +3,41 @@ FROM $DOCKER_BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://ocr-d.de/kontakt" \ + maintainer="https://ocr-d.de/en/contact" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/bertsky/docstruct" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + 
org.opencontainers.image.title="docstruct" \ +    org.opencontainers.image.description="Document structure detection from PAGE to METS" \ +    org.opencontainers.image.source="https://github.com/bertsky/docstruct" \ +    org.opencontainers.image.documentation="https://github.com/bertsky/docstruct/blob/${VCS_REF}/README.md" \ +    org.opencontainers.image.revision=$VCS_REF \ +    org.opencontainers.image.created=$BUILD_DATE \ +    org.opencontainers.image.base.name=ocrd/core + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONIOENCODING=utf8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# avoid HOME/.local/share (hard to predict USER here) +# so let XDG_DATA_HOME coincide with fixed system location +# (can still be overridden by derived stages) +ENV XDG_DATA_HOME /usr/local/share +# avoid the need for an extra volume for persistent resource user db +# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml) +ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources WORKDIR /build/docstruct -COPY setup.py . -COPY docstruct/ocrd-tool.json . -COPY docstruct ./docstruct -COPY requirements.txt . -COPY README.md . -COPY Makefile . -RUN make install -RUN rm -rf /build/docstruct +COPY . . +COPY ocrd-tool.json . 
+# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# install everything and reduce image size +RUN make install && rm -rf /build/docstruct +# smoke test +RUN ocrd-docstruct -h WORKDIR /data VOLUME ["/data"] diff --git a/Makefile b/Makefile index eabe93b..fbf845f 100644 --- a/Makefile +++ b/Makefile @@ -1,26 +1,80 @@ PYTHON = python3 PIP = pip3 PYTHONIOENCODING=utf8 +PYTEST_ARGS ?= "-vv --workspace=all" -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0 +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0 DOCKER_TAG = ocrd/docstruct help: @echo @echo " Targets" @echo - @echo " deps Install only Python deps via pip" - @echo " install Install full Python package via pip" - @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " deps Install only Python deps via pip" + @echo " install Install full Python package via pip" + @echo " install-dev Install in editable mode" + @echo " build Build binary and source Python package" + @echo " docker Build a Docker image $(DOCKER_TAG) from $(DOCKER_BASE_IMAGE)" + @echo " test Run tests via Pytest" + @echo " repo/assets Clone OCR-D/assets to ./repo/assets" + @echo " tests/assets Copy to ./tests/assets" + @echo "" + @echo " Variables" + @echo "" + @echo " DOCKER_TAG Docker container tag ($(DOCKER_TAG))" + @echo " PYTEST_ARGS Additional runtime options for pytest ($(PYTEST_ARGS))" + @echo " (See --help, esp. custom option --workspace)" # Install Python deps via pip deps: $(PIP) install -r requirements.txt +deps-test: + $(PIP) install -r requirements-test.txt + # Install Python package via pip install: $(PIP) install . +install-dev: + $(PIP) install -e . + +build: + $(PIP) install build wheel + $(PYTHON) -m build . 
+ +# Run test +test: tests/assets + $(PYTHON) -m pytest tests --durations=0 $(PYTEST_ARGS) + +coverage: + coverage erase + $(MAKE) test PYTHON="coverage run" + coverage report -m + +# +# Assets +# + +# Update OCR-D/assets submodule +.PHONY: repos always-update tests/assets +repo/assets: always-update + git submodule sync --recursive $@ + if git submodule status --recursive $@ | grep -qv '^ '; then \ + git submodule update --init --recursive $@ && \ + touch $@; \ + fi + +benner_herrnhuterey04_1748.ocrd.zip: + wget https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/$@ + +# Setup test assets +tests/assets: benner_herrnhuterey04_1748.ocrd.zip +tests/assets: repo/assets + mkdir -p $@ + cp -a $ None: """ """ - LOG = getLogger('OcrdDocStruct') - assert_file_grp_cardinality(self.input_file_grp, 1) + if isinstance(workspace.mets, ClientSideOcrdMets): + # serialise and write METS to disk + # (in-memory changes could come from prio processing step) + workspace.save_mets() + # instantiate (read and parse) METS from disk (read-only, metadata are constant) + ws = Workspace(workspace.resolver, workspace.directory, + mets_basename=os.path.basename(workspace.mets_target)) + else: + ws = workspace + self.create_logmap_smlink(ws.mets) + self.results = [] + super().process_workspace(workspace) + self.update_mets() + self.reset() + ws.save_mets() + if isinstance(workspace.mets, ClientSideOcrdMets): + workspace.reload_mets() + + def process_page_file(self, input_file : OcrdFileType) -> None: + assert isinstance(input_file, get_args(OcrdFileType)) + page_id = input_file.pageId + self._base_logger.info("processing page %s", page_id) + self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + input_pcgts = page_from_file(input_file) + assert isinstance(input_pcgts, OcrdPage) + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") + return + mode = 
self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink # FIXME: more parameters (what kind of region types, geometric rules etc) - self.create_logmap_smlink() - - results = [] - for input_file in self.input_files: - page_id = input_file.pageId or input_file.ID - LOG.info("INPUT FILE %s", page_id) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page = pcgts.get_Page() - if page.get_type() in ['front-cover', 'back-cover', 'title', 'blank']: - LOG.info("skipping page type %s", page.get_type()) - continue - if page.get_type() in ['table-of-contents', 'index']: - # FIXME use directly - LOG.info("skipping page type %s", page.get_type()) - results.extend(self.extract_text(page, input_file)) - self.write_to_mets(results) + page = input_pcgts.get_Page() + if page.get_type() in ['front-cover', 'back-cover', 'title', 'blank']: + self.logger.info("skipping page type %s", page.get_type()) + return + if page.get_type() in ['table-of-contents', 'index']: + # FIXME use directly + self.logger.info("skipping page type %s", page.get_type()) + return + self.results.extend(self.extract_text(page, input_file)) def extract_text(self, page, input_file): """ get text regions in reading order, put them into a hierarchy (via heuristic rules) """ - LOG = getLogger('OcrdDocStruct') target = self.parameter['type'] result = [] # FIXME: what about non-text regions (tables, images)? 
@@ -119,7 +147,7 @@ def extract_text(self, page, input_file): region_xywh = xywh_from_points(region.get_Coords().points) region_text = region.get_TextEquiv() if not region_text: - LOG.warning("skipping empty text region %s", region.id) + self.logger.warning("skipping empty text region %s", region.id) continue region_text = region_text[0].Unicode # FIXME: textual cues @@ -131,19 +159,10 @@ def extract_text(self, page, input_file): result.append([input_file, region.id, region_xywh, 'text', '']) return result - def write_to_mets(self, results): - LOG = getLogger('OcrdDocStruct') + def update_mets(self): mode = self.parameter['mode'] # enmap/mets:area or dfg/mets:structLink - log_ids = sorted(int(id_[4:]) for id_ in self.log_ids - if id_[4:].isnumeric()) - if log_ids: - last_id = log_ids[-1] - else: - last_id = 0 def add_div(parent, div_type, text): - nonlocal last_id - last_id += 1 - div_id = "LOG_" + str(last_id) + div_id = 'uuid-' + str(uuid.uuid4()) div = ET.SubElement(parent, TAG_METS_DIV) div.set('TYPE', div_type) div.set('ID', div_id) @@ -155,19 +174,17 @@ def add_div(parent, div_type, text): def add_link(page_id, div_id): # add mets:smLink entry to mets:structLink (for dfg representation) link = ET.SubElement(self.link, TAG_METS_SMLINK) - link.set('{' + NS['xlink'] + '}to', page_id) link.set('{' + NS['xlink'] + '}from', div_id) + link.set('{' + NS['xlink'] + '}to', page_id) self.link_map.setdefault(page_id, []).append(div_id) return link def add_area(parent, file_id, region_id): # add mets:fptr/mets:area entry to mets:div (for enmap representation) - fptr = parent.find(TAG_METS_FPTR) - if fptr is None: + if (fptr := parent.find(TAG_METS_FPTR)) is None: fptr = ET.SubElement(parent, TAG_METS_FPTR) - if fptr.find(TAG_METS_SEQ): - fptr = fptr.find(TAG_METS_SEQ) - elif fptr.find(TAG_METS_AREA): - area = fptr.find(TAG_METS_AREA) + if (seq := fptr.find(TAG_METS_SEQ)) is not None: + fptr = seq + elif (area := fptr.find(TAG_METS_AREA)) is not None: fptr.remove(area) 
fptr = ET.SubElement(fptr, TAG_METS_SEQ) fptr.append(area) @@ -179,13 +196,13 @@ def add_area(parent, file_id, region_id): div = None last_type = None last_page = None - for input_file, region_id, region_xywh, region_type, region_text in results: + for input_file, region_id, region_xywh, region_type, region_text in self.results: page_id = input_file.pageId if region_type == 'text': if div is None: - LOG.warning("%s: skipping region '%s' prior to first heading", page_id, region_id) + self.logger.warning("%s: skipping region '%s' prior to first heading", page_id, region_id) continue - LOG.info("continuing with text region %s on page %s", region_id, page_id) + self.logger.info("continuing with text region %s on page %s", region_id, page_id) if mode == 'enmap': # add to current div add_area(div, input_file.ID, region_id) @@ -199,18 +216,18 @@ def add_area(parent, file_id, region_id): loglist = self.link_map.get(page_id, []) if len(loglist): log = self.log_map[loglist[-1]] - LOG.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) + self.logger.info("starting at last existing div for page: %s[%s]", log.get('ID'), log.get('TYPE')) else: # get deepest embedded, still non-structural existing div - log = next([log for log in reversed(self.log.iterdescendants(TAG_METS_DIV)) + log = next((log for log in reversed(list(self.log.iterdescendants(TAG_METS_DIV))) if log.get('TYPE').lower() in [ # 'serial', 'multivolume_work', 'newspaper', 'issue', # 'month', 'year', 'part', 'folder', 'map', 'illustration', 'additional', 'volume', 'monograph', # 'chapter', 'letter', 'fascicle', 'fragment', 'manuscript', 'bundle', - ]], self.log) - LOG.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) + ]), self.log) + self.logger.info("starting at deepest existing div: %s[%s]", log.get('ID'), log.get('TYPE')) div = log div_type = div.get('TYPE').lower() if (div_type in [ @@ -230,7 +247,7 @@ def add_area(parent, file_id, region_id): 
else: # coordination div = add_div(div.getparent(), region_type, region_text) - LOG.info("continuing with %s region %s on page %s", region_type, region_id, page_id) + self.logger.info("continuing with %s region %s on page %s", region_type, region_id, page_id) if mode == 'enmap': # add to new div add_area(div, input_file.ID, region_id) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b9d2025 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] + +[project] +name = "docstruct" +authors = [ + {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, +] +description = "Document structure detection from PAGE to METS" +readme = "README.md" +license.text = "Apache-2.0" +requires-python = ">=3.8" + +dynamic = ["version", "dependencies"] + +# https://pypi.org/classifiers/ +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Intended Audience :: Other Audience", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing", +] + +[project.scripts] +ocrd-docstruct = "docstruct.proc:cli" + +[project.urls] +Homepage = "https://github.com/bertsky/docstruct" +Repository = "https://github.com/bertsky/docstruct.git" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} +optional-dependencies.test = {file = ["requirements-test.txt"]} + +[tool.setuptools] +packages = ["docstruct"] +package-data = {"*" = ["ocrd-tool.json"]} + +[tool.coverage.run] +branch = true +source = ["docstruct"] diff --git a/repo/assets b/repo/assets new file mode 160000 index 0000000..d004ab7 --- /dev/null +++ b/repo/assets @@ -0,0 +1 @@ +Subproject commit d004ab7211fb3d57801e783b184a7ded1e2f5e4b diff --git a/requirements-test.txt b/requirements-test.txt new file mode 
100644 index 0000000..934c793 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,3 @@ +pytest +pytest-subtests +coverage diff --git a/requirements.txt b/requirements.txt index 121fdae..1aa85af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -ocrd >= 2.38.0 +ocrd >= 3.3.0 lxml click diff --git a/setup.py b/setup.py deleted file mode 100644 index 71cc8b7..0000000 --- a/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Installs: - - - ocrd-docstruct -""" -import codecs - -import json -from setuptools import setup, find_packages - -with open('./ocrd-tool.json', 'r') as f: - version = json.load(f)['version'] - -setup( - name='docstruct', - version=version, - description='Document structure detection from PAGE to METS', - long_description=codecs.open('README.md', encoding='utf-8').read(), - long_description_content_type='text/markdown', - author='Robert Sachunsky', - author_email='sachunsky@informatik.uni-leipzig.de', - url='https://github.com/bertsky/docstruct', - license='Apache License 2.0', - packages=find_packages(exclude=('tests', 'docs')), - install_requires=open('requirements.txt').read().split('\n'), - package_data={ - '': ['*.json', '*.yml', '*.yaml'], - }, - entry_points={ - 'console_scripts': [ - 'ocrd-docstruct=docstruct.proc:cli', - ] - }, -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..01c7e12 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,99 @@ +# pylint: disable=unused-import + +from multiprocessing import Process +from time import sleep +from random import seed, sample +import os +import pytest + +from ocrd import Resolver, Workspace, OcrdMetsServer +from ocrd_utils import ( + pushd_popd, + disableLogging, + initLogging, + setOverrideLogLevel, + config, + MIMETYPE_PAGE +) + +from .assets import assets + +WORKSPACES = { + "manifesto": 
assets.path_to('communist_manifesto/data/mets.xml'), + "aufklaerung": assets.path_to('kant_aufklaerung_1784/data/mets.xml'), + "herrnhuterey04": assets.path_to('benner_herrnhuterey04_1748.ocrd/mets.xml'), +} + +#@pytest.fixture(params=WORKSPACES.keys()) +@pytest.fixture +def workspace(tmpdir, pytestconfig, asset): + initLogging() + if pytestconfig.getoption('verbose') > 0: + setOverrideLogLevel('DEBUG') + with pushd_popd(tmpdir): + directory = str(tmpdir) + resolver = Resolver() + url = WORKSPACES[asset] + workspace = resolver.workspace_from_url(url, dst_dir=directory) # download=True + workspace.name = asset # for debugging + # determine GT file group and download PAGE files + gtGrp = None + for file in workspace.find_files(mimetype=MIMETYPE_PAGE): + if file.url.startswith("file:/"): + # ignore broken and irrelevant groups + workspace.remove_file(file.ID, force=True) + elif 'GT' in file.fileGrp and (gtGrp or file.fileGrp) == file.fileGrp: + gtGrp = file.fileGrp + workspace.download_file(file) + yield workspace, gtGrp + disableLogging() + +def pytest_addoption(parser): + parser.addoption("--workspace", + action="append", + choices=list(WORKSPACES) + ["all"], + help="workspace(s) to run on (set 'all' to use all)" + ) + +@pytest.hookimpl +def pytest_generate_tests(metafunc): + if "asset" in metafunc.fixturenames: + ws = metafunc.config.getoption("workspace") + if ws == ['all']: + ws = list(WORKSPACES) + elif not ws: + ws = ["aufklaerung"] # default + metafunc.parametrize("asset", ws) + +CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache'] + +@pytest.fixture(params=CONFIGS) +def processor_kwargs(request, workspace): + config.OCRD_DOWNLOAD_INPUT = False # only pre-downloaded pages + workspace, gt_grp = workspace + config.OCRD_MISSING_OUTPUT = "ABORT" # --debug + if 'metscache' in request.param: + config.OCRD_METS_CACHING = True + #print("enabled METS caching") + if 'pageparallel' in request.param: + config.OCRD_MAX_PARALLEL_PAGES = 4 + #print("enabled 
page-parallel processing") + def _start_mets_server(*args, **kwargs): + #print("running with METS server") + server = OcrdMetsServer(*args, **kwargs) + server.startup() + process = Process(target=_start_mets_server, + kwargs={'workspace': workspace, 'url': 'mets.sock'}) + process.start() + sleep(1) + # instantiate client-side workspace + asset = workspace.name + workspace = Workspace(workspace.resolver, workspace.directory, + mets_server_url='mets.sock', + mets_basename=os.path.basename(workspace.mets_target)) + workspace.name = asset + yield {'workspace': workspace, 'input_file_grp': gt_grp, 'mets_server_url': 'mets.sock'} + process.terminate() + else: + yield {'workspace': workspace, 'input_file_grp': gt_grp} + config.reset_defaults() diff --git a/tests/test_all.py b/tests/test_all.py new file mode 100644 index 0000000..6266570 --- /dev/null +++ b/tests/test_all.py @@ -0,0 +1,84 @@ +# pylint: disable=import-error + +import os +from pathlib import Path +import pytest + +from ocrd import Workspace, run_processor +from ocrd_models.constants import NAMESPACES as NS +from ocrd_validators.xsd_mets_validator import XsdMetsValidator + +from docstruct.proc import OcrdDocStruct + +# from ocrd_pagetopdf +def get_structure(mets): + metsroot = mets._tree.getroot() + structlink = metsroot.find('mets:structLink', NS) + smlinks = {link.get('{http://www.w3.org/1999/xlink}from'): + link.get('{http://www.w3.org/1999/xlink}to') + for link in reversed(structlink.findall('./mets:smLink', NS) + if structlink is not None else [])} + phymap = metsroot.find('mets:structMap[@TYPE="PHYSICAL"]', NS) + topdiv = next(phymap.iterfind('./mets:div', NS)) + pages = {page.get('ID'): page.get('ORDER') or order + for order, page in enumerate(topdiv.findall('./mets:div', NS)) + if page.get('TYPE') == "page"} + logmap = metsroot.find('mets:structMap[@TYPE="LOGICAL"]', NS) + if logmap is None: + return None + if (topdiv := logmap.find('./mets:div', NS)) is None: + return None + # descend to deepest ADM 
+ while (topdiv.get('ADMID') is None and + (div := topdiv.find('./mets:div', NS)) is not None): + topdiv = div + # we want to dive into multivolume_work, periodical, newspaper, year, month... + # we are looking for issue, volume, monograph, lecture, dossier, act, judgement, study, paper, *_thesis, report, register, file, fragment, manuscript... + while ((div := topdiv.find('./mets:div', NS)) is not None and + div.get('ADMID') is not None): + topdiv = div + #for div in topdiv.iterdescendants('{%s}div' % NS['mets']): + # recursive: + def find_depth(div, depth=0): + div_id = div.get('ID', div.getparent().get('ID')) + return { + 'label': div.get('LABEL') or div.get('ORDERLABEL') or '', + 'type': div.get('TYPE') or '', + 'id': div_id, + 'page': pages.get(smlinks.get(div_id, ''), ''), + 'depth': depth, + 'subs': [find_depth(subdiv, depth+1) + for subdiv in div.findall('./mets:div', NS)] + } + struct = find_depth(topdiv) + return struct + +def test_docstruct(processor_kwargs, subtests): + ws = processor_kwargs['workspace'] + input_file_grp = processor_kwargs['input_file_grp'] + if not input_file_grp: + pytest.skip(f"workspace asset '{ws.name}' has no PAGE GT fileGrp") + # for tests w/ METS Server, retrieve a new OcrdMets directly from the file + offline_ws = Workspace(ws.resolver, ws.directory, mets_basename=os.path.basename(ws.mets_target)) + structure_old = get_structure(offline_ws.mets) + mets_old = offline_ws.mets.to_xml(xmllint=True).decode('utf-8') + for mode in ['enmap', 'dfg']: + with subtests.test(mode=mode): + run_processor(OcrdDocStruct, + output_file_grp="", # as long as core#1321 is open, we must pass something here + parameter=dict(mode=mode), + **processor_kwargs, + ) + ws.save_mets() + offline_ws.reload_mets() + structure_new = get_structure(offline_ws.mets) + assert structure_old != structure_new + if structure_old: + assert structure_old['id'] == structure_new['id'] + assert len(structure_new['subs']) > 0 + report = 
XsdMetsValidator.validate(Path(ws.mets_target)) + assert not report.errors + # reset METS to previous state + with open(ws.mets_target, 'w') as mets_file: + mets_file.write(mets_old) + ws.reload_mets()