diff --git a/.circleci/config.yml b/.circleci/config.yml index 1e64ed6..7ba22cf 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,7 +11,7 @@ jobs: - run: apt-get update && apt-get install -y --no-install-recommends make wget unzip - checkout - run: make deps-ubuntu - - run: make install VIRTUAL_ENV=/usr/local + - run: make install - run: ocrd resmgr download -n https://ub-backup.bib.uni-mannheim.de/~stweil/tesstrain/german_print/tessdata_best/german_print_0.877_1254744_7309067.traineddata ocrd-tesserocr-recognize german_print.traineddata diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..b05c3a5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +test/ +dist/ +build/ diff --git a/.gitignore b/.gitignore index eff4653..404b10a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ *~ test/data* +.pytest_cache +__pycache__ +*.py[cod] +*.egg-info +/build +/dist diff --git a/Dockerfile b/Dockerfile index 4cede7d..2e272d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,22 +3,29 @@ FROM $DOCKER_BASE_IMAGE ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://github.com/bertsky/workflow-configuration/issues" \ + maintainer="https://ocr-d.de/en/contact" \ org.label-schema.vcs-ref=$VCS_REF \ org.label-schema.vcs-url="https://github.com/bertsky/workflow-configuration" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="workflow-configuration" \ + org.opencontainers.image.description="" \ + org.opencontainers.image.source="https://github.com/bertsky/workflow-configuration" \ + org.opencontainers.image.documentation="https://github.com/bertsky/workflow-configuration/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.base.name=ocrd/core SHELL ["/bin/bash", 
"-c"] -WORKDIR /build/workflow-configuration +WORKDIR /build/module +COPY . . COPY ocrd-tool.json . -COPY ocrd-make ocrd-import ocrd-page-transform xsl-transform . -COPY Makefile *.mk . -COPY *.xsl . -COPY README.md . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# install everything and reduce image size RUN make deps-ubuntu -RUN make install VIRTUAL_ENV=/usr/local -RUN rm -fr /build/workflow-configuration +RUN make install && rm -fr /build/module WORKDIR /data VOLUME ["/data"] diff --git a/Makefile b/Makefile index 4004368..89eeec4 100644 --- a/Makefile +++ b/Makefile @@ -1,35 +1,21 @@ # OCR-D workflow configuration installation makefile # # Install workflow configurations persistently by running: -# `make install` -# (in the git repo), which will copy workflow.mk (as `Makefile`), all -# preconfigured makefiles, and some shell scripts into -# a fixed target directory under the VIRTUAL_ENV prefix). -# -# For installation via shell-script: -VIRTUAL_ENV ?= $(CURDIR)/local -# copy `ocrd-make` here: -BINDIR = $(abspath $(VIRTUAL_ENV))/bin -# copy the makefiles here: -SHAREDIR = $(abspath $(VIRTUAL_ENV))/share/workflow-configuration - -# we need associative arrays, process substitution etc. -# also, fail on failed intermediates as well: -SHELL = bash -o pipefail - -CONFIGURATION := $(abspath $(firstword $(MAKEFILE_LIST))) +# `make install` or `pip install .` (in the git repo), +# which will copy all distribution files (`Makefile` and all +# preconfigured makefiles `*.mk`, as well as some Python +# scripts and XSL transforms `*.xsl`) into the Python +# site directory. +# Using venv is recommended. 
-CONFIGDIR := $(dir $(CONFIGURATION)) +PYTHON = python3 +PIP = pip3 +export PYTHONIOENCODING=utf8 -EXISTING_MAKEFILES = $(patsubst $(CONFIGDIR)/%,%,$(wildcard $(CONFIGDIR)/*.mk)) -EXISTING_TRANSFORMS = $(patsubst $(CONFIGDIR)/%,%,$(wildcard $(CONFIGDIR)/*.xsl)) +SHELL = bash -o pipefail -ifeq ($(filter workflow.mk,$(EXISTING_MAKEFILES)),) -$(error "Found no .mk makefiles in source directory $(CONFIGDIR)") -endif -ifeq ($(EXISTING_TRANSFORMS),) -$(error "Found no .xsl transforms in source directory $(CONFIGDIR)") -endif +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 +DOCKER_TAG = bertsky/workflow-configuration help: @echo "Installing OCR-D workflow configurations:" @@ -39,51 +25,46 @@ help: @echo @echo " Targets:" @echo " * help (this message)" - @echo " * test (run test suite)" - @echo " * deps-ubuntu (install extra system packages needed here, beyond ocrd and processors)" - @echo " * install (copy $(SHPROGS) and configuration makefiles to" - @echo " * VIRTUAL_ENV=$(VIRTUAL_ENV)" - @echo " * from repository workdir)" - @echo " * uninstall (remove $(SHPROGS) and configuration makefiles from" - @echo " * VIRTUAL_ENV=$(VIRTUAL_ENV))" + @echo " * deps-ubuntu (install system packages needed here, beyond ocrd and processors)" + @echo " * deps (install Python packages needed here)" + @echo " * install (install this package via $(PIP))" + @echo " * build (build source and binary distribution)" + @echo " * uninstall (remove this package via $(PIP))" @echo " * %.mk (any filename with suffix .mk not existing yet: spawn new makefile from pattern)" @echo " * test (run test suite)" @echo @echo " Variables:" @echo - @echo " * VIRTUAL_ENV: directory prefix to use for installation" + @echo " * PYTHON (name of Python version binary [$(PYTHON)])" + @echo " * PIP (name of Python pip version binary [$(PIP)])" .PHONY: help deps-ubuntu: apt-get -y install parallel xmlstarlet bc sed libdbd-sqlite3-perl -XSLPROGS =$(EXISTING_TRANSFORMS:%.xsl=%) -SHPROGS = ocrd-make ocrd-import ocrd-page-transform 
-PROGS = $(SHPROGS) $(XSLPROGS) -install-bin: $(PROGS:%=$(BINDIR)/%) | $(BINDIR) - -$(SHPROGS:%=$(BINDIR)/%): $(BINDIR)/%: % - sed 's,^SHAREDIR=.*,SHAREDIR="$(SHAREDIR)",' < $< > $@ - chmod +x $@ +deps: requirements.txt + $(PIP) install -r $< -$(XSLPROGS:%=$(BINDIR)/%): %: xsl-transform - sed 's,^SHAREDIR=.*,SHAREDIR="$(SHAREDIR)",' < $< > $@ - chmod +x $@ +install: + $(PIP) install . -$(BINDIR) $(SHAREDIR): - @mkdir -p $@ +install-dev: + $(PIP) install -e . -install: install-bin | $(SHAREDIR) - cp -Lf $(EXISTING_MAKEFILES) $(EXISTING_TRANSFORMS) ocrd-tool.json $(SHAREDIR) - mv $(SHAREDIR)/workflow.mk $(SHAREDIR)/Makefile +build: + $(PIP) install build + $(PYTHON) -m build . uninstall: - $(RM) $(PROGS:%=$(BINDIR)/%) - $(RM) -r $(SHAREDIR) + $(PIP) uninstall workflow_configuration +TEST_WORKFLOW = -f all-tess-MODEL.mk MODEL=german_print \ + -f transform.mk TROPTIONS="-P xsl page-extract-text.xsl \ + -P xslt-params '-s level=line' -P mimetype text/plain" \ + -f cat-files.mk define testrecipe = -function testfun { pushd `mktemp -d` && cp -pr $(abspath $^) . && /usr/bin/time ocrd-make -f all-tess-MODEL.mk MODEL=german_print LOGLEVEL=ERROR $(^F) "$$@" && $(RM) -r $$DIRSTACK; }; testfun +function testfun { pushd `mktemp -d` && cp -pr $(abspath $^) . && /usr/bin/time ocrd-make $(TEST_WORKFLOW) LOGLEVEL=ERROR $(^F) "$$@" && cat $(^F:%=%.*.log) && $(RM) -r $$DIRSTACK; }; testfun endef test: test/data1 test/data2 $(testrecipe) @@ -106,16 +87,15 @@ test/data2: ocrd workspace -d $@ rename-group ORIGINAL OCR-D-IMG ocrd workspace -d $@ prune-files -DOCKER_BASE_IMAGE = docker.io/ocrd/core-cuda-torch:v2.69.0 -DOCKER_TAG ?= bertsky/workflow-configuration docker: docker build \ + -t $(DOCKER_TAG) \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ - --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ - --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - -t $(DOCKER_TAG) . 
+ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ + --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + . -.PHONY: deps-ubuntu install install-bin uninstall test docker +.PHONY: deps-ubuntu deps install install-dev build uninstall test docker # spawn a new configuration define skeleton = @@ -123,24 +103,21 @@ define skeleton = # # Install by copying (or symlinking) makefiles into a directory # where all OCR-D workspaces (unpacked BagIts) reside and running -# `make` there (or including files from there). +# `ocrd-make` there (or including files from there). # # Call via: -# `make -f WORKFLOW-CONFIG.mk` +# `ocrd-make -f WORKFLOW-CONFIG.mk` # # To rebuild partially, you must pass -W to `make`: -# `make -f WORKFLOW-CONFIG.mk -W FILEGRP` +# `ocrd-make -f WORKFLOW-CONFIG.mk -W FILEGRP` # # To build in parallel, use `-j [CPUS] [-l [LOADLEVEL]]` etc. # # To get general help: -# `make -f WORKFLOW-CONFIG.mk help` +# `ocrd-make --help` # # To get a description of the workflow: -# `make -f WORKFLOW-CONFIG.mk info` - -### -# From here on, custom configuration begins. +# `ocrd-make -f WORKFLOW-CONFIG.mk info` INPUT = OCR-D-IMG @@ -150,7 +127,7 @@ $$(INPUT): OUTPUT = OCR-D-OUT $$(OUTPUT): $$(INPUT) $$(OUTPUT): TOOL = ocrd-dummy -$$(OUTPUT): PARAMS = +$$(OUTPUT): OPTIONS = info: @echo "This is a dummy configuration that creates a copy $$(OUTPUT) of the input fileGrp $$(INPUT)" @@ -159,11 +136,6 @@ info: .DEFAULT_GOAL = $$(OUTPUT) -# Down here, custom configuration ends. 
-### - -SELFDIR := $$(dir $$(abspath $$(firstword $$(MAKEFILE_LIST)))) -include $$(SELFDIR)/Makefile endef export skeleton @@ -171,12 +143,9 @@ export skeleton %.mk: @echo >$@ "$$skeleton" + # do not search for implicit rules here: %/Makefile: ; Makefile: ; -local.mk: ; ocrd-tool.json: ; -$(CONFIGURATION): ; -$(EXISTING_MAKEFILES): ; -$(EXISTING_TRANSFORMS): ; -$(PROGS): ; +local.mk: ; diff --git a/README.md b/README.md index 9fe2c76..41e7f65 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,9 @@ ## OCR-D workflow configurations based on makefiles -This provides an attempt at running [OCR-D](https://ocr-d.de) workflows configured and controlled via makefiles using [GNU bash](http://www.gnu.org/software/bash), [GNU make](http://www.gnu.org/software/make/) and [GNU parallel](http://www.gnu.org/software/parallel). +This provides an attempt at running [OCR-D](https://ocr-d.de) workflows +configured and controlled via makefiles using [GNU bash](http://www.gnu.org/software/bash), +[GNU make](http://www.gnu.org/software/make/) and [GNU parallel](http://www.gnu.org/software/parallel). Makefilization offers the following _advantages_: @@ -63,9 +65,14 @@ Or equivalently, install the following packages: - `xmlstarlet` - `bc` and `sed` -Additionally, you must of course install [ocrd](https://github.com/OCR-D/core) itself along with its dependencies in the current shell environment. Moreover, depending on the specific configurations you want to use (i.e. the processors it contains), additional modules must be installed. See [OCR-D setup guide](https://ocr-d.de/en/setup) for instructions. +Additionally, you must of course install [ocrd](https://github.com/OCR-D/core) itself +along with its dependencies in the current Python virtual environment (venv). Moreover, +depending on the specific configurations you want to use (i.e. the processors it contains), +additional modules must be installed. See [OCR-D setup guide](https://ocr-d.de/en/setup) +for instructions. 
-(Yes, `workflow-configuration` is already part of [ocrd_all](https://github.com/OCR-D/ocrd_all), which is also available on [Dockerhub](https://hub.docker.com/r/ocrd/all).) +(Yes, `workflow-configuration` is already part of [ocrd_all](https://github.com/OCR-D/ocrd_all), +which is also available on [Dockerhub](https://hub.docker.com/r/ocrd/all).) ### Installation @@ -74,18 +81,13 @@ Run: make install -... if you are in a (Python) virtual environment. Otherwise specify the installation prefix directory via environment variable `VIRTUAL_ENV`. +... if you are in a (Python) virtual environment, which is recommended. -Assuming `$VIRTUAL_ENV/bin` is in your `PATH`, you can then call: - - cd WORKSPACE && make [OPTIONS] -f WORKFLOW-CONFIG.mk - make -C WORKSPACE [OPTIONS] -f WORKFLOW-CONFIG.mk - -... for processing single workspace directory, or ... +You can then call: ocrd-make [OPTIONS] -f WORKFLOW-CONFIG.mk WORKSPACE... -... for processing multiple workspaces at once (with the same interface as above). +... for processing any number of workspace directories. Where: @@ -95,10 +97,12 @@ Where: Calling workflows is possible from anywhere in your filesystem, but for the `WORKFLOW_CONFIG.mk` you may need to: -- either provide the `*.mk` configurations in the source directory at installation time (to ensure they are installed under the installation prefix and can always be found by file name only) +- either provide the `*.mk` configurations in the source directory at installation time + (to ensure they are installed under the site prefix and can always be found by file name) - or provide full paths at runtime (by absolute path name, or relative to the CWD). -(The previous version of `ocrd-make` tried to copy or symlink all makefiles to the runtime directory. You can still use those, but should remove the old `Makefile`.) +(The previous version of `ocrd-make` tried to copy or symlink all makefiles to the runtime directory. 
+ You can still use those, but should remove the old `Makefile`.) ### Docker Image @@ -107,7 +111,8 @@ Instead of the above native installation steps, you can use the prebuilt image f docker pull bertsky/workflow-configuration docker run -V /path/to/data:/data bertsky/workflow-configuration ocrd-make ... -For general guidance on using Docker with OCR-D, see [User Guide](https://ocr-d.de/en/user_guide#translating-native-commands-to-docker-calls). +For general guidance on using Docker with OCR-D, see +[User Guide](https://ocr-d.de/en/user_guide#translating-native-commands-to-docker-calls). ### Usage @@ -126,21 +131,41 @@ To get help for the import tool:
-Usage: ocrd-import [OPTIONS] [DIRECTORY]
-
-with options:
- -i|--ignore      keep going after unknown file types
- -s|--skip SUFFIX ignore file names ending in given SUFFIX (repeatable)
- -R|--regex EXPR  only include paths matching given EXPR (repeatable)
- -C|--no-convert  do not attempt to convert image file types
- -r|--render DPI  when converting PDFs, render at DPI pixel density
- -P|--nonnum-ids  do not use numeric pageIds but basename patterns
- -B|--basename    only use basename for IDs
-
-Create OCR-D workspace meta-data (mets.xml) in DIRECTORY (or /home/xbert/unsortiert/arbeit/heyer/tools/ocrd_tesserocr), importing...
-* all image files (with known file extension or convertible via ImageMagick) under fileGrp OCR-D-IMG
-* all .xml files (if they validate as PAGE-XML) under fileGrp OCR-D-SEG-PAGE
-...but failing otherwise.
+Usage: ocrd-import [OPTIONS] WORKSPACE_DIR
+
+  Create OCR-D workspace meta-data (mets.xml) in WORKSPACE_DIR (or $PWD), importing...
+  * all image files (with known file extension or convertible via ImageMagick) under fileGrp `image_group`
+  * all .xml files (if they validate as PAGE-XML) under fileGrp `pagexml_group`
+  * all .xml files (if they validate as ALTO-XML) under fileGrp `altoxml_group`
+  ...but failing otherwise (unless `ignore` is set)
+
+Options:
+  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
+                                  Log level
+  -i, --ignore                    keep going after unknown file types
+  -s, --skip SUFFIX               ignore file names ending in given SUFFIX
+                                  (repeatable)
+  -R, --regex EXPR                only include paths matching given EXPR
+                                  (repeatable)
+  -C, --no-convert                do not attempt to convert image file types
+  -r, --render DPI                when converting PDFs, render at DPI pixel
+                                  density  [default: 300]
+  -P, --nonnum-ids                do not use numeric pageIds but basename
+                                  patterns
+  -B, --basename                  only use basename for IDs
+  -n, --dry-run                   only show resulting METS to stdout via pager
+  -I, --image-group TEXT          fileGrp to place detected or converted
+                                  images into  [default: OCR-D-IMG]
+  -X, --pagexml-group TEXT        fileGrp to place detected PAGE-XML into
+                                  [default: OCR-D-PAGE]
+  -A, --altoxml-group TEXT        fileGrp to place detected ALTO-XML into
+                                  [default: OCR-D-ALTO]
+  -G, --directory-groups          instead of assigning files to `image_group`
+                                  or `pagexml_group`, and trying to convert
+                                  everything else to images, create a group
+                                  for every subdirectory and auto-detect its
+                                  MIME types
+  -h, --help                      Show this message and exit.
 
@@ -162,6 +187,8 @@ To perform various tasks via XSLT on PAGE-XML files (these all share the same op page-remove-metadataitem # remove all MetadataItem entries page-remove-dead-regionrefs # remove non-existing regionRefs page-remove-empty-readingorder # remove empty ReadingOrder or groups + page-remove-empty-text-regions # remove empty TextRegion entries + page-remove-empty-lines # remove empty TextLine entries page-remove-all-regions # remove all *Region (and TextLine and Word and Glyph) entries page-remove-text-regions # remove all TextRegion (and TextLine and Word and Glyph) entries page-remove-regions # remove all *Region (and TextLine and Word and Glyph) entries of $type @@ -189,21 +216,26 @@ To perform various tasks via XSLT on PAGE-XML files (these all share the same op
 Usage: NAME [OPTIONS] [FILE]
 
-with options:
- -s name=value    set param NAME to string literal VALUE (repeatable)
- -p name=value    set param NAME to XPath expression VALUE (repeatable)
- -i|--inplace     overwrite input file with result of transformation
- -P|--pretty      pretty-print output (line breaks with indentation)
- -d|--diff        show diff between input and output
- -D|--dump        just print the transformation stylesheet (XSL)
- -h|--help        just show this message
-
-Open PAGE-XML file FILE (or stdin) and apply the XSL transformation "NAME.xsl"
-Write the result to stdout, unless...
- -i / --inplace is given - in which case the result is written back to the
-                           file silently, or
- -d / --diff is given - in which case the result will be compared to the
-                        input and a patch shown on stdout.
+  Open PAGE file XMLFILE (or stdin) and apply the XSL transformation "page-add-nsprefix-pc.xsl"
+  Write the result to stdout, unless...
+  -i / --inplace is given - in which case the result is written back to the
+                            file silently, or
+  -d / --diff is given    - in which case the result will be compared to the
+                            input and a patch shown on stdout.
+
+Options:
+  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
+                                  Log level
+  -s, --string-param NAME=VALUE   set param NAME to string literal VALUE
+  -p, --xpath-param NAME=VALUE    set param NAME to XPath expression VALUE
+  -i, --inplace                   overwrite input file with result of
+                                  transformation
+  -P, --pretty                    pretty-print output (line breaks with
+                                  indentation
+  -d, --diff                      show diff between input and output via pager
+  -D, --dump                      just print the transformation stylesheet
+                                  (XSL)
+  -h, --help                      Show this message and exit.
 
@@ -232,50 +264,59 @@ use `ocrd-page-transform` and pass the filename of the transformation as paramet
-Usage: ocrd-page-transform [OPTIONS]
+Usage: ocrd-page-transform [worker|server] [OPTIONS]
 
   apply arbitrary XSL transformation file for PAGE-XML
 
-  > Processor base class and helper functions. A processor is a tool
-  > that implements the uniform OCR-D command-line interface for run-
-  > time data processing. That is, it executes a single workflow step,
-  > or a combination of workflow steps, on the workspace (represented by
-  > local METS). It reads input files for all or requested physical
-  > pages of the input fileGrp(s), and writes output files for them into
-  > the output fileGrp(s). It may take  a number of optional or
-  > mandatory parameters. Process the :py:attr:`workspace`  from the
-  > given :py:attr:`input_file_grp` to the given
-  > :py:attr:`output_file_grp` for the given :py:attr:`page_id` under
-  > the given :py:attr:`parameter`.
-
-  > (This contains the main functionality and needs to be overridden by
-  > subclasses.)
+  > Transform pages with the given XSLT.
 
-Options:
+  > Open the input PAGE element hierarchy and process it with the XSLT
+  > processor parsed from the `xsl` resource file, passing `xslt-params`
+  > as XSLT parameters (if any).
+
+  > Generate a new PAGE object from the resulting hierarchy, finally
+  > serialise and add it as new output file.
+
+Subcommands:
+    worker      Start a processing worker rather than do local processing
+    server      Start a processor server rather than do local processing
+
+Options for processing:
+  -m, --mets URL-PATH             URL or file path of METS to process [./mets.xml]
+  -w, --working-dir PATH          Working directory of local workspace [dirname(URL-PATH)]
   -I, --input-file-grp USE        File group(s) used as input
   -O, --output-file-grp USE       File group(s) used as output
-  -g, --page-id ID                Physical page ID(s) to process
+  -g, --page-id ID                Physical page ID(s) to process instead of full document []
   --overwrite                     Remove existing output pages/images
-                                  (with --page-id, remove only those)
+                                  (with "--page-id", remove only those).
+                                  Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE
+  --debug                         Abort on any errors with full stack trace.
+                                  Short-hand for OCRD_MISSING_OUTPUT=ABORT
   --profile                       Enable profiling
-  --profile-file                  Write cProfile stats to this file. Implies --profile
+  --profile-file PROF-PATH        Write cProfile stats to PROF-PATH. Implies "--profile"
   -p, --parameter JSON-PATH       Parameters, either verbatim JSON string
                                   or JSON file path
   -P, --param-override KEY VAL    Override a single JSON object key-value pair,
                                   taking precedence over --parameter
-  -m, --mets URL-PATH             URL or file path of METS to process
-  -w, --working-dir PATH          Working directory of local workspace
+  -U, --mets-server-url URL       URL of a METS Server for parallel incremental access to METS
+                                  If URL starts with http:// start an HTTP server there,
+                                  otherwise URL is a path to an on-demand-created unix socket
   -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
-                                  Log level
+                                  Override log level globally [INFO]
+  --log-filename LOG-PATH         File to redirect stderr logging to (overriding ocrd_logging.conf).
+
+Options for information:
   -C, --show-resource RESNAME     Dump the content of processor resource RESNAME
   -L, --list-resources            List names of processor resources
-  -J, --dump-json                 Dump tool description as JSON and exit
-  -h, --help                      This help message
+  -J, --dump-json                 Dump tool description as JSON
+  -D, --dump-module-dir           Show the 'module' resource location path for this processor
+  -h, --help                      Show this message
   -V, --version                   Show version
 
 Parameters:
    "xsl" [string - REQUIRED]
-    File path of the XSL transformation script
+    File path of the XSL transformation script (see `ocrd resmgr` for
+    prepackaged and user-installed files available by file name)
    "xslt-params" [string - ""]
     Assignment of XSL transformation parameter values, given as in
     `xmlstarlet` (which differentiates between `-s name=value` for
@@ -284,9 +325,10 @@ Parameters:
    "pretty-print" [number - 0]
     Reformat with line breaks and this many spaces of indentation after
     XSL transformation (unless zero).
-    "mimetype" [string - "application/vnd.prima.page+xml"]
+   "mimetype" [string - "application/vnd.prima.page+xml"]
     MIME type to register the output files under (should correspond to
     `xsl` result)
+
 
@@ -308,21 +350,26 @@ likewise wrapped as standalone CLIs `mets-...`:
 Usage: NAME [OPTIONS] [FILE]
 
-with options:
- -s name=value    set param NAME to string literal VALUE (repeatable)
- -p name=value    set param NAME to XPath expression VALUE (repeatable)
- -i|--inplace     overwrite input file with result of transformation
- -P|--pretty      pretty-print output (line breaks with indentation)
- -d|--diff        show diff between input and output
- -D|--dump        just print the transformation stylesheet (XSL)
- -h|--help        just show this message
-
-Open METS-XML file FILE (or stdin) and apply the XSL transformation "NAME.xsl"
-Write the result to stdout, unless...
- -i / --inplace is given - in which case the result is written back to the
-                           file silently, or
- -d / --diff is given - in which case the result will be compared to the
-                        input and a patch shown on stdout.
+  Open METS file XMLFILE (or stdin) and apply the XSL transformation "mets-copy-agents.xsl"
+  Write the result to stdout, unless...
+  -i / --inplace is given - in which case the result is written back to the
+                            file silently, or
+  -d / --diff is given    - in which case the result will be compared to the
+                            input and a patch shown on stdout.
+
+Options:
+  -l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
+                                  Log level
+  -s, --string-param NAME=VALUE   set param NAME to string literal VALUE
+  -p, --xpath-param NAME=VALUE    set param NAME to XPath expression VALUE
+  -i, --inplace                   overwrite input file with result of
+                                  transformation
+  -P, --pretty                    pretty-print output (line breaks with
+                                  indentation
+  -d, --diff                      show diff between input and output via pager
+  -D, --dump                      just print the transformation stylesheet
+                                  (XSL)
+  -h, --help                      Show this message and exit.
 
@@ -338,14 +385,11 @@ To run a configuration... (Yes, you can have to look inside and browse its rules!) 3. Execute: - cd WORKSPACE && make [OPTIONS] -f WORKFLOW-CONFIG.mk # or - make -C WORKSPACE [OPTIONS] -f WORKFLOW-CONFIG.mk - - ... for processing single workspace directory, or ... - ocrd-make [OPTIONS] -f WORKFLOW-CONFIG.mk all - (The special target `all` (which is also the default goal) will search for all workspaces in the current directory recursively.) You can also run on a subset of workspaces by passing these as goals on the command line... + (The special target `all` (which is also the default goal) will search for all workspaces + in the current directory recursively.) You can also run on a subset of workspaces + by passing these as goals on the command line... ocrd-make -f WORKFLOW-CONFIG.mk PATH/TO/WORKSPACE1 PATH/TO/WORKSPACE2 ... @@ -403,22 +447,22 @@ Options -j and -l are intercepted.) To get help: - [ocrd-]make help + ocrd-make help To get a short description of the chosen configuration: - [ocrd-]make -f CONFIGURATION.mk info + ocrd-make -f CONFIGURATION.mk info To see the command sequence that would be executed for the chosen configuration (in the format of `ocrd process`): - [ocrd-]make -f CONFIGURATION.mk show + ocrd-make -f CONFIGURATION.mk show To run a workflow server for the command sequence that would be executed for the chosen configuration (to be controlled via `ocrd workflow client` or HTTP): - [ocrd-]make -f CONFIGURATION.mk server + ocrd-make -f CONFIGURATION.mk server To spawn a new configuration file, in the directory of the source repository, do: @@ -443,11 +487,6 @@ You can also use that pattern to specify any fileGrp other than the `.DEFAULT_GO ocrd-make -f CONFIGURATION.mk .DEFAULT_GOAL=OCR-D-SEG-LINE all -If you run `make` in the workspace directly instead of having `ocrd-make` do it recursively, then no `all` target exists and you can directly set the target fileGrp to replace `.DEFAULT_GOAL`: - - make -C WORKSPACE -f 
CONFIGURATION.mk -W OCR-D-BIN - make -C WORKSPACE -f CONFIGURATION.mk OCR-D-SEG-LINE - There are 6 **special variables** and 1 **additional option**: ##### LOGLEVEL @@ -455,14 +494,12 @@ There are 6 **special variables** and 1 **additional option**: To override the default (or configured) log levels for all processors and libraries, use `LOGLEVEL`. For example, to get debugging everywhere, do: ocrd-make -f CONFIGURATION.mk all LOGLEVEL=DEBUG - make -C WORKSPACE -f CONFIGURATION.mk LOGLEVEL=DEBUG ##### PAGES To process only a subset of pages in all fileGrps, set `PAGES`. For example, to only consider pages `PHYS_0005` through `PHYS_0007`, do: ocrd-make -f CONFIGURATION.mk all PAGES=PHYS_0005..PHYS_0007 - make -C WORKSPACE -f CONFIGURATION.mk PAGES=PHYS_0005..PHYS_0007 The variable gets interpreted as the usual [--page-id parameter](https://ocr-d.de/en/spec/cli#-g---page-id-id) by processors, so it supports range expressions, comma-separated lists and regular expressions. @@ -470,7 +507,6 @@ range expressions, comma-separated lists and regular expressions. If the METS provides physical page labels (`@ORDER` or `@ORDERLABEL`), then these work as well: ocrd-make -f CONFIGURATION.mk all PAGES=5..7 - make -C WORKSPACE -f CONFIGURATION.mk PAGES=5..7 ##### TIMEOUT @@ -582,7 +618,6 @@ Next, edit the file to your needs: Write rules using file groups as prerequisite #### Recommendations -- Keep the comments and the `include Makefile` directive in the file. - Change/customize at least the `info` target, and the `INPUT` and `OUTPUT` name/rule. - Copy/paste rules from the existing configurations. - Define variables with the names of all target/prerequisite file groups, so rules and dependent targets can re-use them (and the names can be easily changed later). 
@@ -632,8 +667,6 @@ EVAL: TOOL = ocrd-cor-asv-ann-evaluate # we must override the default goal to be our desired overall target: .DEFAULT_GOAL = EVAL -# ALWAYS necessary: -include Makefile ``` ### Testing diff --git a/all-tess-MODEL.mk b/all-tess-MODEL.mk deleted file mode 100644 index 5963286..0000000 --- a/all-tess-MODEL.mk +++ /dev/null @@ -1,45 +0,0 @@ -# This file can run a workflow on a single workspace (non-recursively). -# -# Install by copying (or symlinking) makefiles into a directory -# where all OCR-D workspaces (unpacked BagIts) reside and running -# `make` there (or including files from there). -# -# Call via: -# `make -f WORKFLOW-CONFIG.mk` -# -# To rebuild partially, you must pass -W to `make`: -# `make -f WORKFLOW-CONFIG.mk -W FILEGRP` -# -# To build in parallel, use `-j [CPUS] [-l [LOADLEVEL]]` etc. -# -# To get general help: -# `make -f WORKFLOW-CONFIG.mk help` -# -# To get a description of the workflow: -# `make -f WORKFLOW-CONFIG.mk info` - -### -# From here on, custom configuration begins. - -INPUT = OCR-D-IMG - -$(INPUT): - ocrd workspace find -G $@ --download - -OUTPUT = OCR-D-OCR-TESS -$(OUTPUT): $(INPUT) -$(OUTPUT): TOOL = ocrd-tesserocr-recognize -$(OUTPUT): PARAMS = "segmentation_level": "region", "model": "$(or $(MODEL),Fraktur+Latin)", "shrink_polygons": true #, "auto_model": true - -info: - @echo "This is a simple workflow with Tesseract segmentation+recognition" - @echo "from $(INPUT) to $(OUTPUT) with recognition model MODEL=$(MODEL)" - -.PHONY: info - -.DEFAULT_GOAL = $(OUTPUT) - -# Down here, custom configuration ends. -### - -include Makefile diff --git a/all-tess-frak2021.mk b/all-tess-frak2021.mk deleted file mode 100644 index 7a6f343..0000000 --- a/all-tess-frak2021.mk +++ /dev/null @@ -1,45 +0,0 @@ -# This file can run a workflow on a single workspace (non-recursively). 
-# -# Install by copying (or symlinking) makefiles into a directory -# where all OCR-D workspaces (unpacked BagIts) reside and running -# `make` there (or including files from there). -# -# Call via: -# `make -f WORKFLOW-CONFIG.mk` -# -# To rebuild partially, you must pass -W to `make`: -# `make -f WORKFLOW-CONFIG.mk -W FILEGRP` -# -# To build in parallel, use `-j [CPUS] [-l [LOADLEVEL]]` etc. -# -# To get general help: -# `make -f WORKFLOW-CONFIG.mk help` -# -# To get a description of the workflow: -# `make -f WORKFLOW-CONFIG.mk info` - -### -# From here on, custom configuration begins. - -INPUT = OCR-D-IMG - -$(INPUT): - ocrd workspace find -G $@ --download - -OUTPUT = OCR-D-OCR-TESS-ALL-FRAK2021 -$(OUTPUT): $(INPUT) -$(OUTPUT): TOOL = ocrd-tesserocr-recognize -$(OUTPUT): PARAMS = "segmentation_level": "region", "model": "frak2021+GT4HistOCR+frk+deu-frak+deu+Fraktur+Latin", "shrink_polygons": true #, "auto_model": true - -info: - @echo "This is a simple workflow with Tesseract segmentation+recognition" - @echo "from $(INPUT) to $(OUTPUT) with various Fraktur models" - -.PHONY: info - -.DEFAULT_GOAL = $(OUTPUT) - -# Down here, custom configuration ends. -### - -include Makefile diff --git a/gt-binarize-page-olena-sauvola-clip-resegment-dewarp-ocr-ocropy-tesseract.mk b/gt-binarize-page-olena-sauvola-clip-resegment-dewarp-ocr-ocropy-tesseract.mk deleted file mode 100644 index 2a6a9f0..0000000 --- a/gt-binarize-page-olena-sauvola-clip-resegment-dewarp-ocr-ocropy-tesseract.mk +++ /dev/null @@ -1,89 +0,0 @@ -# Install by copying (or symlinking) makefiles into a directory -# where all OCR-D workspaces (unpacked BagIts) reside. Then -# chdir to that location. - -# Call via: -# `make -f WORKFLOW-CONFIG.mk WORKSPACE-DIRS` or -# `make -f WORKFLOW-CONFIG.mk all` or just -# `make -f WORKFLOW-CONFIG.mk` -# To rebuild partially, you must pass -W to recursive make: -# `make -f WORKFLOW-CONFIG.mk EXTRA_MAKEFLAGS="-W FILEGRP"` - -### -# From here on, custom configuration begins. 
- -info: - @echo "Read GT line segmentation," - @echo "then binarize pages," - @echo "then clip regions," - @echo "then resegment+dewarp lines," - @echo "then recognize lines with various Ocropus+Tesseract models," - @echo "and finally evaluate OCR quality by measuring" - @echo "character error rates on line texts w.r.t. GT." - -INPUT = OCR-D-GT-SEG-LINE - -$(INPUT): - ocrd workspace find -G $@ --download - ocrd workspace find -G OCR-D-IMG --download # just in case - -BIN = $(INPUT)-BINPAGE-sauvola - -$(BIN): $(INPUT) -$(BIN): TOOL = ocrd-olena-binarize -$(BIN): PARAMS = "impl": "sauvola-ms-split" - -CLIP = $(BIN)-CLIP - -$(CLIP): $(BIN) -$(CLIP): TOOL = ocrd-cis-ocropy-clip - -RESEG = $(CLIP)-RESEG - -$(RESEG): $(CLIP) -$(RESEG): TOOL = ocrd-cis-ocropy-resegment - -DEW = $(RESEG)-DEWARP - -$(DEW): $(RESEG) -$(DEW): TOOL = ocrd-cis-ocropy-dewarp - -OCR1 = $(DEW:$(INPUT)-%=OCR-D-OCR-OCRO-fraktur-%) -OCR2 = $(DEW:$(INPUT)-%=OCR-D-OCR-OCRO-frakturjze-%) -OCR3 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-Fraktur-%) -OCR4 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-Fraktur-Latin-%) -OCR5 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-frk-%) -OCR6 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-frk-deu-%) -OCR7 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-gt4histocr-%) -OCR8 = $(DEW:$(INPUT)-%=OCR-D-OCR-CALA-gt4histocr-%) - -$(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8): $(DEW) - -$(OCR1) $(OCR2): TOOL = ocrd-cis-ocropy-recognize -$(OCR1): PARAMS = "textequiv_level": "glyph", "model": "fraktur.pyrnn" -$(OCR2): PARAMS = "textequiv_level": "glyph", "model": "fraktur-jze.pyrnn" - -$(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7): TOOL = ocrd-tesserocr-recognize -$(OCR3): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur" -$(OCR4): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur+script/Latin" -$(OCR5): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk" -$(OCR6): PARAMS = "textequiv_level" : "glyph", "overwrite_words": 
true, "model" : "frk+deu" -$(OCR7): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "GT4HistOCR_2000000+GT4HistOCR_300000+GT4HistOCR_100000" - -$(OCR8): TOOL = ocrd-calamari-recognize -$(OCR8): GPU = 1 -$(OCR8): PARAMS = "checkpoint" : "$(VIRTUAL_ENV)/share/calamari/GT4HistOCR/*.ckpt.json" - -OUTPUT = $(DEW)-OCR - -$(OUTPUT): $(INPUT) $(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8) -$(OUTPUT): TOOL = ocrd-cor-asv-ann-evaluate -$(OUTPUT): PARAMS = "metric" : "historic_latin" - -.DEFAULT_GOAL = $(OUTPUT) - -# Down here, custom configuration ends. -### - -include Makefile - diff --git a/gt-binarize-page-olena-sauvola-denoise-ocropy-clip-resegment-dewarp-ocr-ocropy-tesseract.mk b/gt-binarize-page-olena-sauvola-denoise-ocropy-clip-resegment-dewarp-ocr-ocropy-tesseract.mk deleted file mode 100644 index 9196bc7..0000000 --- a/gt-binarize-page-olena-sauvola-denoise-ocropy-clip-resegment-dewarp-ocr-ocropy-tesseract.mk +++ /dev/null @@ -1,95 +0,0 @@ -# Install by copying (or symlinking) makefiles into a directory -# where all OCR-D workspaces (unpacked BagIts) reside. Then -# chdir to that location. - -# Call via: -# `make -f WORKFLOW-CONFIG.mk WORKSPACE-DIRS` or -# `make -f WORKFLOW-CONFIG.mk all` or just -# `make -f WORKFLOW-CONFIG.mk` -# To rebuild partially, you must pass -W to recursive make: -# `make -f WORKFLOW-CONFIG.mk EXTRA_MAKEFLAGS="-W FILEGRP"` - -### -# From here on, custom configuration begins. - -info: - @echo "Read GT line segmentation," - @echo "then binarize+denoise pages," - @echo "then clip regions," - @echo "then resegment+dewarp lines," - @echo "then recognize lines with various Ocropus+Tesseract models," - @echo "and finally evaluate OCR quality by measuring" - @echo "character error rates on line texts w.r.t. GT." 
- -INPUT = OCR-D-GT-SEG-LINE - -$(INPUT): - ocrd workspace find -G $@ --download - ocrd workspace find -G OCR-D-IMG --download # just in case - -BIN = $(INPUT)-BINPAGE-sauvola - -$(BIN): $(INPUT) -$(BIN): TOOL = ocrd-olena-binarize -$(BIN): PARAMS = "impl": "sauvola-ms-split" - -DEN = $(BIN)-DENOISE-ocropy - -$(DEN): $(BIN) -$(DEN): TOOL = ocrd-cis-ocropy-denoise -$(DEN): PARAMS = "level-of-operation": "page", "noise_maxsize": 3.0 - -CLIP = $(DEN)-CLIP - -$(CLIP): $(DEN) -$(CLIP): TOOL = ocrd-cis-ocropy-clip - -RESEG = $(CLIP)-RESEG - -$(RESEG): $(CLIP) -$(RESEG): TOOL = ocrd-cis-ocropy-resegment - -DEW = $(RESEG)-DEWARP - -$(DEW): $(RESEG) -$(DEW): TOOL = ocrd-cis-ocropy-dewarp - -OCR1 = $(DEW:$(INPUT)-%=OCR-D-OCR-OCRO-fraktur-%) -OCR2 = $(DEW:$(INPUT)-%=OCR-D-OCR-OCRO-frakturjze-%) -OCR3 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-Fraktur-%) -OCR4 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-Fraktur-Latin-%) -OCR5 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-frk-%) -OCR6 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-frk-deu-%) -OCR7 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-gt4histocr-%) -OCR8 = $(DEW:$(INPUT)-%=OCR-D-OCR-CALA-gt4histocr-%) - -$(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8): $(DEW) - -$(OCR1) $(OCR2): TOOL = ocrd-cis-ocropy-recognize -$(OCR1): PARAMS = "textequiv_level": "glyph", "model": "fraktur.pyrnn" -$(OCR2): PARAMS = "textequiv_level": "glyph", "model": "fraktur-jze.pyrnn" - -$(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7): TOOL = ocrd-tesserocr-recognize -$(OCR3): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur" -$(OCR4): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur+script/Latin" -$(OCR5): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk" -$(OCR6): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk+deu" -$(OCR7): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "GT4HistOCR_2000000+GT4HistOCR_300000+GT4HistOCR_100000" - 
-$(OCR8): TOOL = ocrd-calamari-recognize -$(OCR8): GPU = 1 -$(OCR8): PARAMS = "checkpoint" : "$(VIRTUAL_ENV)/share/calamari/GT4HistOCR/*.ckpt.json" - -OUTPUT = $(DEW)-OCR - -$(OUTPUT): $(INPUT) $(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8) -$(OUTPUT): TOOL = ocrd-cor-asv-ann-evaluate -$(OUTPUT): PARAMS = "metric" : "historic_latin" - -.DEFAULT_GOAL = $(OUTPUT) - -# Down here, custom configuration ends. -### - -include Makefile - diff --git a/gt-binarize-page-olena-sauvola-denoise-ocropy-deskew-page-ocropy-clip-deskew-region-tesseract-resegment-dewarp-ocr-ocropy-tesseract-extract-lines.mk b/gt-binarize-page-olena-sauvola-denoise-ocropy-deskew-page-ocropy-clip-deskew-region-tesseract-resegment-dewarp-ocr-ocropy-tesseract-extract-lines.mk deleted file mode 100644 index 8dd03cb..0000000 --- a/gt-binarize-page-olena-sauvola-denoise-ocropy-deskew-page-ocropy-clip-deskew-region-tesseract-resegment-dewarp-ocr-ocropy-tesseract-extract-lines.mk +++ /dev/null @@ -1,139 +0,0 @@ -# Install by copying (or symlinking) makefiles into a directory -# where all OCR-D workspaces (unpacked BagIts) reside. Then -# chdir to that location. - -# Call via: -# `make -f WORKFLOW-CONFIG.mk WORKSPACE-DIRS` or -# `make -f WORKFLOW-CONFIG.mk all` or just -# `make -f WORKFLOW-CONFIG.mk` -# To rebuild partially, you must pass -W to recursive make: -# `make -f WORKFLOW-CONFIG.mk EXTRA_MAKEFLAGS="-W FILEGRP"` - -### -# From here on, custom configuration begins. - -info: - @echo "Read GT line segmentation," - @echo "then binarize+denoise+deskew pages," - @echo "then clip+deskew regions," - @echo "then resegment+dewarp lines," - @echo "then recognize lines with various Ocropus+Tesseract models," - @echo "and finally extract line images and line texts" - @echo "(both the GT and OCR versions) into one directory," - @echo "with conventional filename suffixes for OCR/post-correction training." 
- -INPUT = OCR-D-GT-SEG-LINE - -$(INPUT): - ocrd workspace find -G $@ --download - ocrd workspace find -G OCR-D-IMG --download # just in case - -BIN = $(INPUT)-BINPAGE-sauvola - -$(BIN): $(INPUT) -$(BIN): TOOL = ocrd-olena-binarize -$(BIN): PARAMS = "impl": "sauvola-ms-split" - -DEN = $(BIN)-DENOISE-ocropy - -$(DEN): $(BIN) -$(DEN): TOOL = ocrd-cis-ocropy-denoise -$(DEN): PARAMS = "level-of-operation": "page", "noise_maxsize": 3.0 - -FLIP = $(DEN)-DESKEW-tesseract - -$(FLIP): $(DEN) -$(FLIP): TOOL = ocrd-tesserocr-deskew -$(FLIP): PARAMS = "operation_level": "page" - -DESK = $(FLIP)-DESKEW-ocropy - -$(DESK): $(FLIP) -$(DESK): TOOL = ocrd-cis-ocropy-deskew -$(DESK): PARAMS = "level-of-operation": "page", "maxskew": 5 - -CLIP = $(DESK)-CLIP - -$(CLIP): $(DESK) -$(CLIP): TOOL = ocrd-cis-ocropy-clip - -FLIP2 = $(CLIP)-DESKEW-tesseract - -$(FLIP2): $(CLIP) -$(FLIP2): TOOL = ocrd-tesserocr-deskew -$(FLIP2): PARAMS = "operation_level": "region" - -DESK2 = $(FLIP2)-DESKEW-ocropy - -$(DESK2): $(FLIP2) -$(DESK2): TOOL = ocrd-cis-ocropy-deskew -$(DESK2): PARAMS = "level-of-operation": "region" - -RESEG = $(DESK2)-RESEG - -$(RESEG): $(DESK2) -$(RESEG): TOOL = ocrd-cis-ocropy-resegment - -DEW = $(RESEG)-DEWARP - -$(DEW): $(RESEG) -$(DEW): TOOL = ocrd-cis-ocropy-dewarp - -OCR1 = $(DEW:$(INPUT)-%=OCR-D-OCR-OCRO-fraktur-%) -OCR2 = $(DEW:$(INPUT)-%=OCR-D-OCR-OCRO-frakturjze-%) -OCR3 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-Fraktur-%) -OCR4 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-Fraktur-Latin-%) -OCR5 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-frk-%) -OCR6 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-frk-deu-%) -OCR7 = $(DEW:$(INPUT)-%=OCR-D-OCR-TESS-gt4histocr-%) -OCR8 = $(DEW:$(INPUT)-%=OCR-D-OCR-CALA-gt4histocr-%) - -$(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8): $(DEW) - -$(OCR1) $(OCR2): TOOL = ocrd-cis-ocropy-recognize -$(OCR1): PARAMS = "textequiv_level": "glyph", "model": "fraktur.pyrnn" -$(OCR2): PARAMS = "textequiv_level": "glyph", "model": "fraktur-jze.pyrnn" - -$(OCR3) $(OCR4) $(OCR5) 
$(OCR6) $(OCR7): TOOL = ocrd-tesserocr-recognize -$(OCR3): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur" -$(OCR4): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur+script/Latin" -$(OCR5): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk" -$(OCR6): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk+deu" -$(OCR7): PARAMS = "textequiv_level" : "glyph", "overwrite_words": true, "model" : "GT4HistOCR_2000000+GT4HistOCR_300000+GT4HistOCR_100000" - -$(OCR8): TOOL = ocrd-calamari-recognize -$(OCR8): GPU = 1 -$(OCR8): PARAMS = "checkpoint" : "$(VIRTUAL_ENV)/share/calamari/GT4HistOCR/*.ckpt.json" - -LINES = $(patsubst %,OCR-D-IMG-LINES-%,$(DEW) $(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8)) - -$(LINES): OCR-D-IMG-LINES-%: % -$(LINES): TOOL = ocrd-segment-extract-lines -$(LINES): PARAMS = "transparency": true - -OUTPUT = OCR-D-IMG-LINES - -$(OUTPUT): $(LINES) - @mkdir -p $(OUTPUT) - set -e; \ - ln -frs $/dev/null -} -trap cleanup EXIT - -# FIXME: bash says under BUGS "There may be only one active coprocess at a time." which causes spurious warnings here. -# (We therefore temporarily silence stderr to silence the execute_coproc warnings.) -exec 4>&2 -{ -coproc critical { ocrd log -n ocrd-import critical - >& 4 2>& 4; } -coproc error { ocrd log -n ocrd-import error - >& 4 2>& 4; } -coproc warning { ocrd log -n ocrd-import warning - >& 4 2>& 4; } -coproc info { ocrd log -n ocrd-import info - >& 4 2>& 4; } -coproc debug { ocrd log -n ocrd-import debug - >& 4 2>& 4; } -} 2>/dev/null - -function critical { echo "$1" >& ${critical[1]}; } -function error { echo "$1" >& ${error[1]}; } -function warning { echo "$1" >& ${warning[1]}; } -function info { echo "$1" >& ${info[1]}; } -function debug { echo "$1" >& ${debug[1]}; } - -((BASH_VERSINFO<4 || BASH_VERSINFO==4 && BASH_VERSINFO[1]<4)) && critical "bash $BASH_VERSION is too old. 
Please install 4.4 or newer" && exit 2 - -ignore=0 -skip=() -regex=() -convert=1 -dpi=300 -numpageid=1 -onlybasename=0 -while (($#)); do - case "${1:--h}" in - -h|-[-]help) - cat <1)) && warning "non-first argument(s) will be ignored: '${@:2}'" - -set -eE -declare -A MIMETYPES -eval MIMETYPES=( $(ocrd bashlib constants EXT_TO_MIME) ) -MIMETYPE_PAGE=$(ocrd bashlib constants MIMETYPE_PAGE) -DIRECTORY="${1:-.}" -if ! test -d "$DIRECTORY"; then - critical "not a directory: '$DIRECTORY'" - false -fi - -# avoid damaging/replacing existing workspaces: -if test -f "$DIRECTORY"/mets.xml || test -d "$DIRECTORY"/data -a -f "$DIRECTORY"/data/mets.xml; then - critical "Directory '$1' already is a workspace" - false -fi - -# trap to back-off from mets.xml and subdir in case of failure: -function backout { - set +e - critical "Cancelled '$DIRECTORY'" - ocrd workspace -U mets.sock server stop - test -v PID_SERVER && kill $PID_SERVER &>/dev/null - rm -f mets.xml - rmdir --ignore-fail-on-non-empty OCR-D-IMG OCR-D-SEG-PAGE 2>/dev/null - popd > /dev/null - exit 1 -} -trap backout ERR INT - -info "analysing '$DIRECTORY'" -pushd "$DIRECTORY" > /dev/null -ocrd workspace init > /dev/null -ocrd workspace -U mets.sock server start > /dev/null & - -PID_SERVER=$! -PID_TOP=$$ - -sleep 1 # wait for server to become available - -num=0 zeros=0000 -IFS=$'\n' -for file in $(find -L . 
-type f -not -name mets.xml -not -name "*.log" | sort); do - IFS=$' \t\n' - let num++ || true - page=p${zeros:0:$((4-${#num}))}$num - group=OCR-D-IMG - file="${file#./}" - for suffix in "${skip[@]}"; do - if test "$file" != "${file%$suffix}"; then - info "skipping file '$file'" - continue - fi - done - if ((${#regex[*]})); then - match=0 - for expr in "${regex[@]}"; do - expr="${expr#$DIRECTORY}" - if [[ "$file" =~ $expr ]]; then - match=1 - break - fi - done - if ((match)); then - info "matching file '$file'" - else - continue - fi - fi - if test -z "$file"; then - warning "ignoring empty file $file" - continue - fi - # guess MIME type - name="$(basename "$file")" - suffix=."${name##*.}" - mimetype=${MIMETYPES[${suffix,,[A-Z]}]} - # create ID from path - base="${name%$suffix}" - name="$(dirname "${file#./}")" - if test "$name" != . && ! ((onlybasename)); then - base="${name//\//_}_$base" - fi - # XSD ID must start with letter and not contain colons or spaces - # also, avoid . in IDs, because downstream it will confuse filename suffix detection - base="${base//[ :.,]/_}" - if ! [[ ${base:0:1} =~ [a-zA-Z] ]]; then - base=f${base} - fi - if ! ((numpageid)); then - page=$base - fi - #debug "found file '$file' (base=$base page=$page mimetype=$mimetype)" - case "$mimetype" in - ${MIMETYPE_PAGE}) - # FIXME should really validate this is PAGE-XML (cf. 
core#353) - if fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/ "$file" \ - && fgrep -qw 'PcGts' "$file"; then - group=OCR-D-SEG-PAGE - elif fgrep -q http://www.loc.gov/standards/alto/ "$file" \ - && fgrep -qw alto "$file"; then - mimetype=application/alto+xml - group=OCR-D-SEG-ALTO - elif (($ignore)); then - warning "unknown type of file '$file'" - continue - else - critical "unknown type of file '$file'" - false - fi - ;; - image/tiff|image/jpeg|image/png) - # directly supported - ;; - *) - case "$suffix" in - .pdf|.PDF) - inopts=(-units PixelsPerInch -density $((2*$dpi))) - outopts=(-background white -alpha remove -alpha off -colorspace Gray -units PixelsPerInch -resample $dpi -density $dpi) - ;; - *) - inopts=() - outopts=() - esac - if (($convert)) && \ - mkdir -p OCR-D-IMG && \ - warning "converting '$file' to 'OCR-D-IMG/${base}_*.tif' prior to import" && \ - convert "${inopts[@]}" "$file" "${outopts[@]}" OCR-D-IMG/"${base}_%04d.tif"; then - mimetype=image/tiff - IFS=$'\n' - files=($(find OCR-D-IMG -name "${base}_[0-9]*.tif" | sort)) - IFS=$' \t\n' - info "converted '$file' to 'OCR-D-IMG/${base}_*.tif' (${#files[*]} files)" - if ((${#files[*]}>1)); then - for file in "${files[@]}"; do - file="${file#OCR-D-IMG/}" - base="${file%.tif}" - info "adding -g ${page}_${base:(-4)} -G $group -m $mimetype -i $base '$file'" - ocrd workspace -U mets.sock add -G $group -m $mimetype -g ${page}_${base:(-4)} -i "$base" "$file" - done - # there's no danger of clashes with other files here - continue - else - file="${files[0]}" - file="${file#./}" - fi - elif (($ignore)); then - warning "unknown type of file '$file'" - continue - else - critical "unknown type of file '$file'" - false - fi - ;; - esac - # file IDs must contain fileGrp, otherwise processors will have to prevent - # ID clashes by using numeric IDs - if [[ "$base" != $group* ]]; then - base=${group}_"$base" - fi - # finally, add the file to the METS - info "adding -g $page -G $group -m $mimetype -i 
$base '$file'" - ocrd workspace -U mets.sock add -G $group -m $mimetype -g $page -i "$base" "$file" -done - -# undo backout trap -trap "" ERR -# persist METS -ocrd workspace -U mets.sock server stop -wait $PID_SERVER -# ensure these exist in the file system, too -# (useful for ocrd-make) -mkdir -p OCR-D-IMG OCR-D-SEG-PAGE -popd > /dev/null - -info "Success on '$DIRECTORY'" - diff --git a/ocrd-page-transform b/ocrd-page-transform deleted file mode 100755 index 0df3858..0000000 --- a/ocrd-page-transform +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env bash -# shellcheck disable=SC2086 - -set -eu -set -o pipefail -# set -x - -### arbitrary XSL transformation for PAGE-XML in OCR-D -# -# Finds and downloads all files in the input fileGrp -# of the workspace. Then for each page, finds the -# corresponding PAGE-XML file, and processes it with -# the given XSLT. The result is added to the output -# fileGrp. - -which ocrd >/dev/null 2>/dev/null || { echo >&2 "ocrd not in \$PATH. Panicking"; exit 1; } -((BASH_VERSINFO<4 || BASH_VERSINFO==4 && BASH_VERSINFO[1]<4)) && echo >&2 "bash $BASH_VERSION is too old. Please install bash 4.4 or newer." 
&& exit 1 - -SHAREDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" -PRESERVE_NAMESPACE=1 # 1 preserves the input file's PAGE namespace prefix and URL (version) -ADD_METADATAITEM=1 # 1 adds a MetadataItem detailling the transform params used - -MIMETYPE_PAGE=$(ocrd bashlib constants MIMETYPE_PAGE) -declare -A NAMESPACES -eval "NAMESPACES=( $(ocrd bashlib constants NAMESPACES) )" - -function process_file { - local in_fpath="$1" in_id="$2" in_pageId="$3" xsl="$4" param="$5" out_fpath="$6" out_id="$7" out_file_grp="$8" pretty="$9" - - # to become independent of whether and what - # namespace prefix is used for PAGE-XML, - # we first have to know the namespace: - namespace=$(xmlstarlet sel -t -m '/*[1]' -v 'namespace-uri()' "$in_fpath") - # now (using --no-doc-namespace) we can - # safely query with -N pc=${namespace} - # and safely add with prefix ${ns_prefix}: - ns_prefix=$(xmlstarlet sel -t -m '/*[1]' -v 'substring-before(name(),"PcGts")' "$in_fpath"; true) - - function ingest { - if ((PRESERVE_NAMESPACE)); then - # preserve namespace and prefix - cat "$1" - else - # stylesheet transforms to standard namespace: - xmlstarlet tr <(cat < - - - - - - - - - - - -EOF - ) "$1" - fi - } - - function addmeta { - declare -a options - options+=( --no-doc-namespace ed - -N "pc=$namespace" - -u '/pc:PcGts/@pcGtsId' - -v "$out_id" ) - if ((ADD_METADATAITEM)); then - # insert agent - options+=( - -s '/pc:PcGts/pc:Metadata' - -t elem -n "${ns_prefix}MetadataItem" - # bind previous element to "new-item": - --var new-item '$prev' - -s '$new-item' -t attr -n type - -v "processingStep" - -s '$new-item' -t attr -n name - -v "*" - -s '$new-item' -t attr -n value - -v "$OCRD_TOOL_NAME" - # add "Labels" for params: - -s '$new-item' -t elem -n "${ns_prefix}Labels" - # bind previous element to "new-labels": - --var new-labels '$prev' - -s '$new-labels' -t attr -n externalModel - -v ocrd-tool - -s '$new-labels' -t attr -n externalId - -v parameters - ) - for key in 
${!params[@]}; do - # shellcheck disable=SC2016 - options+=( # add another "Label": - -s '$new-labels' -t elem -n "${ns_prefix}Label" - # bind previous element to "new-label": - --var new-label '$prev' - -s '$new-label' -t attr -n value - -v "${params[$key]}" - -s '$new-label' -t attr -n type - -v "$key" ) - done - tool_version=$(ocrd ocrd-tool "$OCRD_TOOL_JSON" version | sed 's,^Version ",,;s,".*$,,') - core_version=$(ocrd --version | sed 's/ocrd, version //') - options+=( - # add "Labels" for versions: - -s '$new-item' -t elem -n "${ns_prefix}Labels" - # bind previous element to "new-labels": - --var new-labels '$prev' - # add another "Label": - -s '$new-labels' -t attr -n externalModel - -v ocrd-tool - -s '$new-labels' -t attr -n externalId - -v version - -s '$new-labels' -t elem -n "${ns_prefix}Label" - # bind previous element to "new-label": - --var new-label '$prev' - -s '$new-label' -t attr -n value - -v "$tool_version" - -s '$new-label' -t attr -n type - -v "$OCRD_TOOL_NAME" - -s '$new-labels' -t elem -n "${ns_prefix}Label" - # bind previous element to "new-label": - --var new-label '$prev' - -s '$new-label' -t attr -n value - -v "$core_version" - -s '$new-label' -t attr -n type - -v "ocrd/core" - ) - fi - xmlstarlet "${options[@]}" - } - - function pprint { - if ((pretty)); then - xmlstarlet fo -s $pretty - else - cat - fi - } - - ingest "$in_fpath" | xmlstarlet tr "$xsl" $param | addmeta | pprint >"$out_fpath" - -} - -function main { - # Load ocrd bashlib functions - # shellcheck source=../core/ocrd/bashlib/lib.bash - source $(ocrd bashlib filename) - ocrd__wrap "$SHAREDIR/ocrd-tool.json" "ocrd-page-transform" "$@" - ocrd__minversion 2.58.1 - - local xsl="${params[xsl]}" - local xsltparam="${params[xslt-params]}" - local pretty="${params[pretty-print]}" - if test -e "$xsl"; then - xsl="$(realpath "$xsl")" - elif ocrd__list_resources | fgrep -q "/$xsl"; then - xsl="$(ocrd__list_resources | fgrep -m1 "/$xsl")" - else - ocrd__raise "cannot find xsl resource 
'$xsl'" - fi - cd "${ocrd__argv[working_dir]}" - local out_file_grp=${ocrd__argv[output_file_grp]} - - for ((n=0; n<${#ocrd__files[*]}; n++)); do - local in_fpath="$(ocrd__input_file $n local_filename)" - local in_id="$(ocrd__input_file $n ID)" - local in_pageId="$(ocrd__input_file $n pageId)" - local in_mimetype="$(ocrd__input_file $n mimetype)" - local out_id="$(ocrd__input_file $n outputFileId)" - local out_fpath="$out_file_grp/${out_id}.xml" - local out_mimetype="${params[mimetype]}" - - if ! test -f "${in_fpath#file://}"; then - ocrd log error "input file ID=${in_id} (pageId=${in_pageId} MIME=${in_mimetype}) is not on disk" - continue - fi - mkdir -p $out_file_grp - - ocrd log info "processing PAGE-XML input file $in_id ($in_pageId)" - process_file "$in_fpath" "$in_id" "$in_pageId" "$xsl" "$xsltparam" "$out_fpath" "$out_id" "$out_file_grp" $pretty - - # Add PAGE file to METS - declare -a add_options - if [ -n "$in_pageId" ]; then - add_options=( -g $in_pageId ) - else - add_options=() - fi - if [[ "${ocrd__argv[overwrite]}" == true ]];then - add_options+=( --force ) - fi - add_options+=( -G $out_file_grp - -m $out_mimetype - -i "$out_id" - "$out_fpath" ) - declare -a workspace_options - if [[ -n "${ocrd__argv[mets_server_url]}" ]];then - workspace_options+=( -U "${ocrd__argv[mets_server_url]}" ) - fi - ocrd workspace "${workspace_options[@]}" add "${add_options[@]}" - done -} - - -main "$@" diff --git a/ocrd-tool.json b/ocrd-tool.json deleted file mode 100644 index 35e2f86..0000000 --- a/ocrd-tool.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "version": "0.1.3", - "git_url": "https://github.com/bertsky/workflow-configuration", - "tools": { - "ocrd-page-transform": { - "executable": "ocrd-page-transform", - "description": "apply arbitrary XSL transformation file for PAGE-XML", - "parameters": { - "xsl": { - "description": "File path of the XSL transformation script", - "type": "string", - "format": "uri", - "content-type": "text/xsl", - "required": true - }, - 
"xslt-params": { - "description": "Assignment of XSL transformation parameter values, given as in `xmlstarlet` (which differentiates between `-s name=value` for literal `value` and `-p name=value` for XPath expression `value`), white-space separated.", - "type": "string", - "default": "" - }, - "pretty-print": { - "description": "Reformat with line breaks and this many spaces of indentation after XSL transformation (unless zero).", - "type": "number", - "format": "integer", - "default": 0 - }, - "mimetype": { - "description": "MIME type to register the output files under (should correspond to `xsl` result)", - "type": "string", - "default": "application/vnd.prima.page+xml" - } - } - } - } -} diff --git a/ocrd-tool.json b/ocrd-tool.json new file mode 120000 index 0000000..af147cc --- /dev/null +++ b/ocrd-tool.json @@ -0,0 +1 @@ +workflow_configuration/ocrd-tool.json \ No newline at end of file diff --git a/page-remove-regions.xsl b/page-remove-regions.xsl deleted file mode 100644 index a34e7d3..0000000 --- a/page-remove-regions.xsl +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..81e92c5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,80 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] + +[project] +name = "workflow_configuration" +authors = [ + {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, +] +description = "a makefilization for OCR-D workflows, with configuration examples" +readme = "README.md" +license = {text = "Apache License 2.0"} +requires-python = ">=3.8" +keywords = ["ocr", "ocr-d"] + +dynamic = ["version", "dependencies", "optional-dependencies"] + +# https://pypi.org/classifiers/ +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Intended Audience :: Other Audience", + "License :: OSI Approved :: Apache Software 
License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing", +] + +[project.scripts] +ocrd-import = "workflow_configuration.ocrd_import:cli" +ocrd-page-transform = "workflow_configuration.ocrd_page_transform:cli" +page-add-nsprefix-pc = "workflow_configuration.xsl_transform:cli" +page-ensure-readingorder = "workflow_configuration.xsl_transform:cli" +page-ensure-textequiv-conf = "workflow_configuration.xsl_transform:cli" +page-ensure-textequiv-index = "workflow_configuration.xsl_transform:cli" +page-ensure-textequiv-unicode = "workflow_configuration.xsl_transform:cli" +page-extract-glyphs = "workflow_configuration.xsl_transform:cli" +page-extract-lines = "workflow_configuration.xsl_transform:cli" +page-extract-text = "workflow_configuration.xsl_transform:cli" +page-extract-words = "workflow_configuration.xsl_transform:cli" +page-fix-coords = "workflow_configuration.xsl_transform:cli" +page-flatten-tableregions = "workflow_configuration.xsl_transform:cli" +page-move-alternativeimage-below-page = "workflow_configuration.xsl_transform:cli" +page-remove-all-regions = "workflow_configuration.xsl_transform:cli" +page-remove-alternativeimages = "workflow_configuration.xsl_transform:cli" +page-remove-dead-regionrefs = "workflow_configuration.xsl_transform:cli" +page-remove-empty-lines = "workflow_configuration.xsl_transform:cli" +page-remove-empty-readingorder = "workflow_configuration.xsl_transform:cli" +page-remove-empty-text-regions = "workflow_configuration.xsl_transform:cli" +page-remove-glyphs = "workflow_configuration.xsl_transform:cli" +page-remove-lines = "workflow_configuration.xsl_transform:cli" +page-remove-metadataitem = "workflow_configuration.xsl_transform:cli" +page-remove-regions = "workflow_configuration.xsl_transform:cli" +page-remove-text-regions = "workflow_configuration.xsl_transform:cli" +page-remove-textequiv = "workflow_configuration.xsl_transform:cli" +page-remove-words =
"workflow_configuration.xsl_transform:cli" +page-rename-id-clashes = "workflow_configuration.xsl_transform:cli" +page-rm-nsprefix-pc = "workflow_configuration.xsl_transform:cli" +page-set-nsversion-2019 = "workflow_configuration.xsl_transform:cli" +page-sort-textequiv-index = "workflow_configuration.xsl_transform:cli" +page-textequiv-lines-to-regions = "workflow_configuration.xsl_transform:cli" +page-textequiv-words-to-lines = "workflow_configuration.xsl_transform:cli" +page-unflatten-tableregions = "workflow_configuration.xsl_transform:cli" +mets-add-nsprefix-mets = "workflow_configuration.xsl_transform:cli" +mets-alias-filegrp = "workflow_configuration.xsl_transform:cli" +mets-copy-agents = "workflow_configuration.xsl_transform:cli" +fix-page-coords = "workflow_configuration.fix_page_coords:cli" +ocrd-make = "workflow_configuration.shellscript:cli" + +[project.urls] +Homepage = "https://github.com/bertsky/workflow-configuration" +Repository = "https://github.com/bertsky/workflow-configuration.git" + + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools] +packages = ["workflow_configuration"] +package-data = {"*" = ["*.json", "*.xsl", "*.mk", "*.sh", "Makefile"]} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..71f73b3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +ocrd >= 3.0 diff --git a/workflow.mk b/workflow_configuration/Makefile similarity index 91% rename from workflow.mk rename to workflow_configuration/Makefile index 174df0c..ea374b7 100644 --- a/workflow.mk +++ b/workflow_configuration/Makefile @@ -32,6 +32,7 @@ # instead of `make` as above.
# + # make all targets as intermediate and not to be removed # (because we must remove via METS): .SECONDARY: @@ -54,7 +55,7 @@ CONFIGURATION := $(abspath $(firstword $(MAKEFILE_LIST))) CONFIGDIR := $(dir $(CONFIGURATION)) CONFIGNAME := $(basename $(notdir $(CONFIGURATION))) -ifeq ($(filter-out Makefile workflow.mk,$(notdir $(MAKEFILE_LIST))),) +ifeq ($(filter-out Makefile,$(notdir $(MAKEFILE_LIST))),) ifneq ($(MAKECMDGOALS),help) $(error Did you forget to select a workflow configuration makefile?) endif @@ -70,7 +71,6 @@ help: @echo " * help (this message)" @echo " * info (short self-description of the selected configuration)" @echo " * show (print command sequence that would be executed for the selected configuration)" - @echo " * server (start workflow server for the selected configuration; control via 'ocrd workflow client')" @echo @echo " Targets (data processing):" @echo " * % (name of the target fileGrp, overriding the default goal)" @@ -91,15 +91,8 @@ help: show: $(.DEFAULT_GOAL) export PATH VIRTUAL_ENV -server: PORT ?= 5000 -server: HOST ?= 127.0.0.1 -server: TIMEOUT ?= 0 -server: WORKERS ?= 1 -server: - IFS=$$'\n' TASKS=($$($(MAKE) -s --no-print-directory -R -f $(CONFIGURATION) show | sed -n "s/'$$//;s/^'ocrd-//p")); \ - ocrd workflow server -j $(WORKERS) -t $(TIMEOUT) -h $(HOST) -p $(PORT) $(and $(LOGLEVEL),-l $(LOGLEVEL)) "$${TASKS[@]}" 2>&1 | tee -a _server.$(CONFIGNAME).log -.PHONY: show server +.PHONY: show ifneq ($(wildcard $(CURDIR)/mets.xml),) # we are inside workspace @@ -219,7 +212,7 @@ else ifeq ($(PAGEWISE),1) # page-wise: determine list of pages and split up into pseudo-targets PAGE_RANGE = $(shell ocrd workspace list-page $(and $(PAGES),-r $(PAGES))) %: - @$(if $(and $(TOOL),$<),$(info building "$@" from "$<" $(and $(PAGEWISE),page-wise) with pattern rule for "$(TOOL)"),$(error No recipe to build "$@" from "$<" with "$(TOOL)")) + @$(if $(and $(TOOL),$<),$(info building "$@" from "$<" page-wise with pattern rule for "$(TOOL)"),$(error No 
recipe to build "$@" from "$<" with "$(TOOL)")) $(file > $@.json, { $(PARAMS) }) $(MAKE) $(foreach PAGE,$(PAGE_RANGE),PAGE/$(PAGE)) -f $(CONFIGURATION) -I $(CONFIGDIR) PAGEWISE=2 TARGET=$@ PREREQ=$< $(and $(JOBS),-j $(filter-out 0,$(JOBS))) $(and $(LOAD),-l $(filter-out 0,$(LOAD))) else ifeq ($(PAGEWISE),2) @@ -246,8 +239,13 @@ ifndef METS_SOCKET .NOTPARALLEL: endif +# allow chaining makefiles +unexport INPUT +unexport OUTPUT +.DEFAULT_GOAL = $(OUTPUT) + else # (if not inside workspace) -ifeq ($(filter help info show server,$(MAKECMDGOALS)),) +ifeq ($(filter help info show,$(MAKECMDGOALS)),) $(error No workspaces in "$(CURDIR)", and no generic goals among "$(MAKECMDGOALS)") endif # (if pseudo-target) endif # (if inside workspace) diff --git a/workflow_configuration/__init__.py b/workflow_configuration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/workflow_configuration/all-tess-MODEL.mk b/workflow_configuration/all-tess-MODEL.mk new file mode 100644 index 0000000..95705e5 --- /dev/null +++ b/workflow_configuration/all-tess-MODEL.mk @@ -0,0 +1,8 @@ +INPUT := $(or $(OUTPUT),$(INPUT),OCR-D-IMG) + +OCR-D-OCR-TESS: $(INPUT) +OCR-D-OCR-TESS: TOOL = ocrd-tesserocr-recognize +OCR-D-OCR-TESS: MODEL ?= Fraktur+Latin +OCR-D-OCR-TESS: OPTIONS = -P segmentation_level region -P model $(MODEL) -P shrink_polygons true # -P auto_model true + +OUTPUT := OCR-D-OCR-TESS diff --git a/workflow_configuration/all-tess-frak2021.mk b/workflow_configuration/all-tess-frak2021.mk new file mode 100644 index 0000000..b5f7e2c --- /dev/null +++ b/workflow_configuration/all-tess-frak2021.mk @@ -0,0 +1,9 @@ +INPUT := $(or $(OUTPUT),$(INPUT),OCR-D-IMG) + +OCR-D-OCR-TESS-FRAK2021: $(INPUT) +OCR-D-OCR-TESS-FRAK2021: TOOL = ocrd-tesserocr-recognize +OCR-D-OCR-TESS-FRAK2021: OPTIONS = -P segmentation_level region \ + -P model frak2021+GT4HistOCR+frk+deu-frak+deu+Fraktur+Latin \ + -P shrink_polygons true # -P auto_model true + +OUTPUT := OCR-D-OCR-TESS-FRAK2021 diff --git 
a/workflow_configuration/cat-files.mk b/workflow_configuration/cat-files.mk new file mode 100644 index 0000000..1b0891d --- /dev/null +++ b/workflow_configuration/cat-files.mk @@ -0,0 +1,7 @@ +INPUT := $(or $(OUTPUT),$(INPUT),OCR-D-IMG) + +OUTPUT: $(INPUT) +OUTPUT: + @shopt -s nullglob; cat $&2 "ERROR: cannot resolve workflow path name '$RES'" + exit 1 + fi + echo "$RES" +} + makeopts=() targets=() # consume all arguments, sift our own vs. make's @@ -49,11 +69,10 @@ while (($#)); do ;; -f) shift - WORKFLOW="$1" + WORKFLOWS+=($(resolve_wf "$1")) ;; --file=*|--makefile=*) - WORKFLOW="$1" - WORKFLOW="${WORKFLOW#*=}" + WORKFLOWS+=($(resolve_wf "${1#*=}")) ;; -j|--jobs) PARALLEL=1 @@ -128,7 +147,6 @@ Running OCR-D workflow configurations on multiple workspaces: * help (this message) * info (short self-description of the selected configuration) * show (print command sequence that would be executed for the selected configuration) - * server (start workflow server for the selected configuration; control via 'ocrd workflow client') Targets (data processing): * all (recursively find all directories with a mets.xml, default goal) @@ -165,14 +183,13 @@ EOF ;; *=*) makeopts+=( "$1" ) - eval ${1%=*}=${1#*=} + eval "${1%%=*}"='"${1#*=}"' ;; all) ALL=1 ;; - info|show|server) - make "${makeopts[@]}" -I $SHAREDIR -f $WORKFLOW $1 - exit + info|show) + targets=($1) ;; *) if ! 
[[ -d "$1" ]]; then @@ -185,33 +202,27 @@ EOF shift done -if [[ -z "$WORKFLOW" ]]; then +if (( ${#WORKFLOWS[*]} == 0 )); then echo >&2 "ERROR: must set concrete workflow file (-f option)" exit 1 fi +# combine workflows +CFGNAME=$(for path in "${WORKFLOWS[@]}"; do echo -n $(basename "${path%.mk}")+; done) +CFGNAME=${CFGNAME%+} +WORKFLOW=$(mktemp -t -u ocrd-make-XXXXXXX.mk) +# if workflows are not multi-staged, avoid re-including Makefile +cat "${WORKFLOWS[@]}" | sed "/^include Makefile/d" > $WORKFLOW +echo "include Makefile" >> $WORKFLOW +cleanup() { + set +e + rm -f $WORKFLOW 2>/dev/null +} +trap cleanup EXIT -if [[ "$WORKFLOW" = "${WORKFLOW#/}" ]]; then - # relative path - if [[ -e "$WORKFLOW" ]]; then - WORKFLOW="$PWD/$WORKFLOW" - elif [[ -e "$SCRIPTDIR/$WORKFLOW" ]]; then - WORKFLOW="$SCRIPTDIR/$WORKFLOW" - elif [[ -e "$SHAREDIR/$WORKFLOW" ]]; then - WORKFLOW="$SHAREDIR/$WORKFLOW" - fi -fi -if ! [[ -e "$WORKFLOW" ]]; then - echo >&2 "ERROR: cannot resolve path name '$WORKFLOW'" - exit 1 -fi - -CFGDIR="$(realpath $(dirname "$WORKFLOW"))" -CFGNAME="$(basename "${WORKFLOW%.mk}")" if [[ -n "$XFERHOST" ]]; then # will be copied via to host via --bf relative to --wd - WORKFLOW="$CFGDIR/./${CFGNAME}.mk" - # sharedir will be added on host - makeopts+=( -f "$CFGNAME.mk" ) + # sharedir will be added on host side + makeopts+=( -f "WORKFLOW" ) else # include directory of workflow config itself, # in case it includes a local.mk or Makefile @@ -219,6 +230,12 @@ else makeopts+=( -R -I "$SHAREDIR" -f "$WORKFLOW" ) fi +for target in "${targets[@]}"; do + if [ "$target" = info -o "$target" = show ]; then + make "${makeopts[@]}" $target + exit + fi +done ((${#targets[*]})) || ALL=1 if ((ALL)); then # find all */mets.xml @@ -257,6 +274,7 @@ set +e if ((PARALLEL)); then parallelopts=(--progress --joblog $CFGNAME.$$.log --files --tag) + echo >&2 "INFO: joblog=$CFGNAME.$$.log" if [[ -n "$XFERHOST" ]]; then parallelopts+=(--jobs 1) # default is 100% i.e. 
num cores @@ -285,6 +303,7 @@ if ((PARALLEL)); then # sqlite3 "file:$CFGNAME.sqlite?immutable=1&mode=ro" '.headers on' '.mode csv' 'SELECT * FROM jobs;' # schema: Seq,Host,Starttime,JobRuntime,Send,Receive,Exitval,_Signal,Command,V1,Stdout,Stderr parallelopts+=(--sqlandworker sqlite3:///$JOBDB/jobs) + echo >&2 "INFO: sqlite3 JOBDB=$JOBDB/jobs" fi # # --halt soon,fail=3 exit when 3 jobs fail, but wait for running jobs to complete. @@ -297,25 +316,24 @@ if ((PARALLEL)); then # will most likely use a different SHAREDIR, so wrap via ocrd-make there; # also, we usually need to activate our venv for OCR-D on the remote, # hence optional extra commands XFERINIT: - parallel "${parallelopts[@]}" "$XFERINIT" "${XFERINIT:+;}" ocrd-make "${makeopts[@]}" {} "2>&1" ::: "${targets[@]}" + parallel "${parallelopts[@]}" "$XFERINIT" "${XFERINIT:+;}" ocrd-make "${makeopts[@]@Q}" {} "2>&1" ::: "${targets[@]}" elif ((METSSERV)); then parallel "${parallelopts[@]}" \ ocrd workspace -d {} -U {}/mets.sock server start "2>&1" "&" \ 'sleep 2;' \ - make "${makeopts[@]}" METS_SOCKET=mets.sock -C {} "2>&1" \ + make "${makeopts[@]@Q}" METS_SOCKET=mets.sock -C {} "2>&1" \ ';result=$?;' \ ocrd workspace -d {} -U {}/mets.sock server stop "2>&1" \ ';exit $result' \ ::: "${targets[@]}" else - parallel "${parallelopts[@]}" make "${makeopts[@]}" -C {} "2>&1" ::: "${targets[@]}" + parallel "${parallelopts[@]}" make "${makeopts[@]@Q}" -C {} "2>&1" ::: "${targets[@]}" fi | while read dir log; do - echo $dir cat $log >> ${dir%%/}.$CFGNAME.log rm $log done echo $CFGNAME.$$.log - exitcodes=( $(cat $_ | cut -d" " -f7 | sed 1d) ) + exitcodes=( $(cat $CFGNAME.$$.log | cut -d" " -f7 | sed 1d) ) for ((i=0; i<${#targets[*]}; i++)); do ((${exitcodes[$i]:-(-1)}==0)) && echo -n "success:" || echo -n "failure:" echo " ${targets[$i]}" diff --git a/workflow_configuration/ocrd-tool.json b/workflow_configuration/ocrd-tool.json new file mode 100644 index 0000000..0574431 --- /dev/null +++ 
b/workflow_configuration/ocrd-tool.json @@ -0,0 +1,45 @@ +{ + "version": "0.2.0", + "git_url": "https://github.com/bertsky/workflow-configuration", + "dockerhub": "ocrd/workflow-configuration", + "tools": { + "ocrd-page-transform": { + "executable": "ocrd-page-transform", + "description": "apply arbitrary XSL transformation file for PAGE-XML", + "categories": ["Quality assurance"], + "steps": [ + "preprocessing/characterization", + "layout/segmentation", + "recognition/post-correction", + "post-processing/format-conversion" + ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "parameters": { + "xsl": { + "description": "File path of the XSL transformation script (see `ocrd resmgr` for prepackaged and user-installed files available by file name)", + "type": "string", + "format": "uri", + "content-type": "text/xsl", + "required": true + }, + "xslt-params": { + "description": "Assignment of XSL transformation parameter values, given as in `xmlstarlet` (which differentiates between `-s name=value` for literal `value` and `-p name=value` for XPath expression `value`), white-space separated.", + "type": "string", + "default": "" + }, + "pretty-print": { + "description": "Reformat with line breaks and this many spaces of indentation after XSL transformation (unless zero).", + "type": "number", + "format": "integer", + "default": 0 + }, + "mimetype": { + "description": "MIME type to register the output files under (should correspond to `xsl` result)", + "type": "string", + "default": "application/vnd.prima.page+xml" + } + } + } + } +} diff --git a/workflow_configuration/ocrd_import.py b/workflow_configuration/ocrd_import.py new file mode 100644 index 0000000..e2bfcbc --- /dev/null +++ b/workflow_configuration/ocrd_import.py @@ -0,0 +1,285 @@ +from __future__ import absolute_import + +import click +import re +import os +import sys +import subprocess +import multiprocessing as mp +from time import sleep +from tempfile import TemporaryDirectory +from 
shutil import move +from logging import getLogger, ERROR + +from ocrd.decorators import ocrd_loglevel +from ocrd import ( + Resolver, + Workspace, + OcrdMetsServer, +) +from ocrd.mets_server import ClientSideOcrdMets +from ocrd_models import OcrdMets +from ocrd_utils import ( + pushd_popd, + initLogging, + setOverrideLogLevel, + make_xml_id, + guess_media_type, + EXT_TO_MIME, + MIMETYPE_PAGE, +) + +def _start_mets_server(directory, url, log_level): + initLogging() + setOverrideLogLevel(log_level) + # silentium! + getLogger('ocrd.models.ocrd_mets.server').setLevel(log_level or ERROR) + getLogger('uvicorn.error').setLevel(log_level or ERROR) + workspace = Workspace(Resolver(), directory, OcrdMets.empty_mets()) + server = OcrdMetsServer(workspace, url) + server.startup() + +@click.command(context_settings={'help_option_names': ['-h', '--help']}) +@ocrd_loglevel +@click.option('-i', '--ignore', is_flag=True, help='keep going after unknown file types') +@click.option('-s', '--skip', metavar='SUFFIX', multiple=True, help='ignore file names ending in given SUFFIX (repeatable)') +@click.option('-R', '--regex', metavar='EXPR', multiple=True, help='only include paths matching given EXPR (repeatable)') +@click.option('-C', '--no-convert', is_flag=True, help='do not attempt to convert image file types') +@click.option('-r', '--render', metavar='DPI', default=300, show_default=True, type=float, help='when converting PDFs, render at DPI pixel density') +@click.option('-P', '--nonnum-ids', is_flag=True, help='do not use numeric pageIds but basename patterns') +@click.option('-B', '--basename', is_flag=True, help='only use basename for IDs') +@click.option('-n', '--dry-run', is_flag=True, help='only show resulting METS to stdout via pager') +@click.option('-I', '--image-group', default='OCR-D-IMG', show_default=True, help='fileGrp to place detected or converted images into') +@click.option('-X', '--pagexml-group', default='OCR-D-PAGE', show_default=True, help='fileGrp to place 
detected PAGE-XML into') +@click.option('-A', '--altoxml-group', default='OCR-D-ALTO', show_default=True, help='fileGrp to place detected ALTO-XML into') +@click.option('-G', '--directory-groups', is_flag=True, help='instead of assigning files to `image_group` or `pagexml_group`, and trying to convert everything else to images, create a group for every subdirectory and auto-detect its MIME types') +@click.argument('workspace_dir', type=click.Path(file_okay=False)) +def cli(workspace_dir, dry_run, log_level, **kwargs): + """ + \b + Create OCR-D workspace meta-data (mets.xml) in WORKSPACE_DIR (or $PWD), importing... + * all image files (with known file extension or convertible via ImageMagick) under fileGrp `image_group` + * all .xml files (if they validate as PAGE-XML) under fileGrp `pagexml_group` + * all .xml files (if they validate as ALTO-XML) under fileGrp `altoxml_group` + ...but failing otherwise (unless `ignore` is set) + """ + initLogging() + ctxt = mp.get_context('spawn') # avoid forking, because the child then kills the tmpdir + with TemporaryDirectory() as tmpdir: + mets_server_url = os.path.join(tmpdir, 'mets.sock') + mets_server = ctxt.Process(target=_start_mets_server, + args=(workspace_dir if not dry_run else tmpdir, + mets_server_url, + log_level), + # auto-kill in case of failure + daemon=True) + mets_server.start() + sleep(2) + assert mets_server.is_alive() # not much worth (also true when not running *yet*) + sys.exit(0 if ocrd_import(tmpdir, workspace_dir, mets_server_url, + log_level=log_level, dry_run=dry_run, + **kwargs) + else 1) + +def ocrd_import(tmpdir, workspace_dir, mets_server_url, + log_level=None, + ignore=False, + skip=None, + regex=None, + no_convert=False, + render=300, + nonnum_ids=False, + basename=False, + dry_run=False, + image_group='OCR-D-IMG', + pagexml_group='OCR-D-SEG-PAGE', + altoxml_group='OCR-D-SEG-ALTO', + directory_groups=False, +): + if os.path.exists(os.path.join(workspace_dir, 'mets.xml')) or \ + 
os.path.exists(os.path.join(workspace_dir, 'data', 'mets.xml')):
+        raise ValueError("Directory '%s' already is a workspace" % workspace_dir)
+    LOG = getLogger("ocrd.import")
+    assert os.path.exists(mets_server_url)
+    mets = ClientSideOcrdMets(mets_server_url)
+    LOG.info("analysing '%s'", workspace_dir)
+    if ignore is None:
+        ignore = []
+    if skip is None:
+        skip = []
+    if regex is None:
+        regex = []
+    else:
+        regex = [re.compile(expr) for expr in regex]
+    pages = dict()
+    with pushd_popd(workspace_dir):
+        for dirname, dirs, files in os.walk(".", followlinks=True):
+            dirname = dirname[2:] # remove ./ prefix
+            for fname in files:
+                fpath = os.path.join(dirname, fname)
+                LOG.debug("inspecting file '%s'", fpath)
+                if os.path.getsize(fpath) == 0:
+                    LOG.warning("ignoring empty file '%s'", fpath)
+                    continue
+                if fname.endswith(".log") or \
+                   any(fname.endswith(suffix) for suffix in skip):
+                    LOG.info("skipping file '%s'", fpath)  # log files (plain suffix test, not a glob) and user-given SUFFIXes
+                    continue
+                if regex:
+                    if any(expr.fullmatch(fname) for expr in regex):
+                        LOG.info("matching file '%s'", fpath)
+                    else:
+                        continue
+                base, suffix = os.path.splitext(fname)
+                # create ID from path
+                if not basename:
+                    base = dirname.replace('/', '_') + '_' + base
+                # XML ID must start with letter and not contain colons or spaces
+                # also, avoid .
in IDs, because downstream it will confuse filename suffix detection + if not base[0].isalpha(): + base = "f" + base # to avoid "id_" prefix for backwards compatibility + base = make_xml_id(base) + # guess MIME type + #mime = EXT_TO_MIME.get(suffix.lower(), "") + mime = guess_media_type(fpath, application_xml=MIMETYPE_PAGE) + if mime == MIMETYPE_PAGE: + with open(fpath, 'r') as fd: + content = fd.read() + if "http://schema.primaresearch.org/PAGE/gts/pagecontent/" in content and \ + (":PcGts " in content or " None: + if self.parameter['mimetype'] == MIMETYPE_PAGE: + return super().process_page_file(*input_files) + # from core's ocrd.processor.base + input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + assert isinstance(input_files[0], get_args(OcrdFileType)) + page_id = input_files[0].pageId + self._base_logger.info("processing page %s", page_id) + for i, input_file in enumerate(input_files): + assert isinstance(input_file, get_args(OcrdFileType)) + self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}") + try: + page_ = page_from_file(input_file) + assert isinstance(page_, OcrdPage) + input_pcgts[i] = page_ + except ValueError as err: + # not PAGE and not an image to generate PAGE for + self._base_logger.error(f"non-PAGE input for page {page_id}: {err}") + output_file_id = make_file_id(input_files[0], self.output_file_grp) + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': + # short-cut avoiding useless computation: + raise FileExistsError( + f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set" + ) + result = self.xslt(input_pcgts[0].etree, **self.xsltparams) + output_file_ext = MIME_TO_EXT.get(self.parameter['mimetype'], '') + self.workspace.add_file( + file_id=output_file_id, + file_grp=self.output_file_grp, + page_id=page_id, + local_filename=os.path.join(self.output_file_grp, 
output_file_id + output_file_ext), + mimetype=self.parameter['mimetype'], + content=str(result), + ) + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + pcgts = input_pcgts[0] + result = self.xslt(pcgts.etree, **self.xsltparams) + for error in self.xslt.error_log: + self.logger.error(error) + root = result.getroot() + assert root is not None, "transform yields non-XML result; try setting `mimetype` parameter correctly" + root = PcGtsType.factory() + root.build(result.getroot()) + return OcrdPageResult(OcrdPage(root, result, {}, {})) + +@click.command() +@ocrd_cli_options +def cli(*args, **kwargs): + return ocrd_cli_wrap_processor(PageTransform, *args, **kwargs) diff --git a/page-add-nsprefix-pc.xsl b/workflow_configuration/page-add-nsprefix-pc.xsl similarity index 100% rename from page-add-nsprefix-pc.xsl rename to workflow_configuration/page-add-nsprefix-pc.xsl diff --git a/page-ensure-readingorder.xsl b/workflow_configuration/page-ensure-readingorder.xsl similarity index 100% rename from page-ensure-readingorder.xsl rename to workflow_configuration/page-ensure-readingorder.xsl diff --git a/page-ensure-textequiv-conf.xsl b/workflow_configuration/page-ensure-textequiv-conf.xsl similarity index 100% rename from page-ensure-textequiv-conf.xsl rename to workflow_configuration/page-ensure-textequiv-conf.xsl diff --git a/page-ensure-textequiv-index.xsl b/workflow_configuration/page-ensure-textequiv-index.xsl similarity index 100% rename from page-ensure-textequiv-index.xsl rename to workflow_configuration/page-ensure-textequiv-index.xsl diff --git a/page-ensure-textequiv-unicode.xsl b/workflow_configuration/page-ensure-textequiv-unicode.xsl similarity index 100% rename from page-ensure-textequiv-unicode.xsl rename to workflow_configuration/page-ensure-textequiv-unicode.xsl diff --git a/page-extract-glyphs.xsl b/workflow_configuration/page-extract-glyphs.xsl similarity index 100% rename from 
page-extract-glyphs.xsl rename to workflow_configuration/page-extract-glyphs.xsl diff --git a/page-extract-lines.xsl b/workflow_configuration/page-extract-lines.xsl similarity index 100% rename from page-extract-lines.xsl rename to workflow_configuration/page-extract-lines.xsl diff --git a/page-extract-text.xsl b/workflow_configuration/page-extract-text.xsl similarity index 100% rename from page-extract-text.xsl rename to workflow_configuration/page-extract-text.xsl diff --git a/page-extract-words.xsl b/workflow_configuration/page-extract-words.xsl similarity index 100% rename from page-extract-words.xsl rename to workflow_configuration/page-extract-words.xsl diff --git a/page-fix-coords.xsl b/workflow_configuration/page-fix-coords.xsl similarity index 100% rename from page-fix-coords.xsl rename to workflow_configuration/page-fix-coords.xsl diff --git a/page-flatten-tableregions.xsl b/workflow_configuration/page-flatten-tableregions.xsl similarity index 100% rename from page-flatten-tableregions.xsl rename to workflow_configuration/page-flatten-tableregions.xsl diff --git a/page-move-alternativeimage-below-page.xsl b/workflow_configuration/page-move-alternativeimage-below-page.xsl similarity index 100% rename from page-move-alternativeimage-below-page.xsl rename to workflow_configuration/page-move-alternativeimage-below-page.xsl diff --git a/page-remove-all-regions.xsl b/workflow_configuration/page-remove-all-regions.xsl similarity index 96% rename from page-remove-all-regions.xsl rename to workflow_configuration/page-remove-all-regions.xsl index 6e1c4ca..eb58f92 100644 --- a/page-remove-all-regions.xsl +++ b/workflow_configuration/page-remove-all-regions.xsl @@ -22,6 +22,7 @@ + diff --git a/page-remove-alternativeimages.xsl b/workflow_configuration/page-remove-alternativeimages.xsl similarity index 100% rename from page-remove-alternativeimages.xsl rename to workflow_configuration/page-remove-alternativeimages.xsl diff --git a/page-remove-dead-regionrefs.xsl 
b/workflow_configuration/page-remove-dead-regionrefs.xsl similarity index 100% rename from page-remove-dead-regionrefs.xsl rename to workflow_configuration/page-remove-dead-regionrefs.xsl diff --git a/page-remove-text-regions.xsl b/workflow_configuration/page-remove-empty-lines.xsl similarity index 88% rename from page-remove-text-regions.xsl rename to workflow_configuration/page-remove-empty-lines.xsl index 646098a..93c0a17 100644 --- a/page-remove-text-regions.xsl +++ b/workflow_configuration/page-remove-empty-lines.xsl @@ -7,7 +7,7 @@ standalone="yes" encoding="UTF-8" omit-xml-declaration="no"/> - + diff --git a/page-remove-empty-readingorder.xsl b/workflow_configuration/page-remove-empty-readingorder.xsl similarity index 100% rename from page-remove-empty-readingorder.xsl rename to workflow_configuration/page-remove-empty-readingorder.xsl diff --git a/workflow_configuration/page-remove-empty-text-regions.xsl b/workflow_configuration/page-remove-empty-text-regions.xsl new file mode 100644 index 0000000..35c4c52 --- /dev/null +++ b/workflow_configuration/page-remove-empty-text-regions.xsl @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/page-remove-glyphs.xsl b/workflow_configuration/page-remove-glyphs.xsl similarity index 100% rename from page-remove-glyphs.xsl rename to workflow_configuration/page-remove-glyphs.xsl diff --git a/page-remove-lines.xsl b/workflow_configuration/page-remove-lines.xsl similarity index 100% rename from page-remove-lines.xsl rename to workflow_configuration/page-remove-lines.xsl diff --git a/page-remove-metadataitem.xsl b/workflow_configuration/page-remove-metadataitem.xsl similarity index 100% rename from page-remove-metadataitem.xsl rename to workflow_configuration/page-remove-metadataitem.xsl diff --git a/workflow_configuration/page-remove-regions.xsl b/workflow_configuration/page-remove-regions.xsl new file mode 100644 index 0000000..88e7566 --- /dev/null +++ 
b/workflow_configuration/page-remove-regions.xsl @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/workflow_configuration/page-remove-text-regions.xsl b/workflow_configuration/page-remove-text-regions.xsl new file mode 100644 index 0000000..1c05948 --- /dev/null +++ b/workflow_configuration/page-remove-text-regions.xsl @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/page-remove-textequiv.xsl b/workflow_configuration/page-remove-textequiv.xsl similarity index 100% rename from page-remove-textequiv.xsl rename to workflow_configuration/page-remove-textequiv.xsl diff --git a/page-remove-words.xsl b/workflow_configuration/page-remove-words.xsl similarity index 100% rename from page-remove-words.xsl rename to workflow_configuration/page-remove-words.xsl diff --git a/page-rename-id-clashes.xsl b/workflow_configuration/page-rename-id-clashes.xsl similarity index 100% rename from page-rename-id-clashes.xsl rename to workflow_configuration/page-rename-id-clashes.xsl diff --git a/page-rm-nsprefix-pc.xsl b/workflow_configuration/page-rm-nsprefix-pc.xsl similarity index 100% rename from page-rm-nsprefix-pc.xsl rename to workflow_configuration/page-rm-nsprefix-pc.xsl diff --git a/page-set-nsversion-2019.xsl b/workflow_configuration/page-set-nsversion-2019.xsl similarity index 100% rename from page-set-nsversion-2019.xsl rename to workflow_configuration/page-set-nsversion-2019.xsl diff --git a/page-sort-textequiv-index.xsl b/workflow_configuration/page-sort-textequiv-index.xsl similarity index 100% rename from page-sort-textequiv-index.xsl rename to workflow_configuration/page-sort-textequiv-index.xsl diff --git a/page-textequiv-lines-to-regions.xsl b/workflow_configuration/page-textequiv-lines-to-regions.xsl similarity index 100% rename from page-textequiv-lines-to-regions.xsl rename to workflow_configuration/page-textequiv-lines-to-regions.xsl diff --git a/page-textequiv-words-to-lines.xsl 
b/workflow_configuration/page-textequiv-words-to-lines.xsl similarity index 100% rename from page-textequiv-words-to-lines.xsl rename to workflow_configuration/page-textequiv-words-to-lines.xsl diff --git a/page-unflatten-tableregions.xsl b/workflow_configuration/page-unflatten-tableregions.xsl similarity index 100% rename from page-unflatten-tableregions.xsl rename to workflow_configuration/page-unflatten-tableregions.xsl diff --git a/workflow_configuration/shellscript.py b/workflow_configuration/shellscript.py new file mode 100644 index 0000000..56a43fe --- /dev/null +++ b/workflow_configuration/shellscript.py @@ -0,0 +1,13 @@ +import os +import sys + +from ocrd_utils import resource_filename + +def cli(): + name = os.path.basename(sys.argv[0]) + script = resource_filename(__package__, name + '.sh') + #os.environ["PATH"] = f"{resource}:{os.getenv('PATH')}" + os.execv(script, sys.argv) + +if __name__ == "__main__": + cli() diff --git a/workflow_configuration/transform.mk b/workflow_configuration/transform.mk new file mode 100644 index 0000000..09c8505 --- /dev/null +++ b/workflow_configuration/transform.mk @@ -0,0 +1,8 @@ +INPUT := $(or $(OUTPUT),$(INPUT),OCR-D-IMG) + +$(INPUT)-XSL: $(INPUT) +$(INPUT)-XSL: TOOL = ocrd-page-transform +$(INPUT)-XSL: TROPTIONS ?= -P xsl page-extract-text.xsl -P xslt-params "-s level=line" -P mimetype text/plain +$(INPUT)-XSL: OPTIONS = $(TROPTIONS) + +OUTPUT := $(INPUT)-XSL diff --git a/workflow_configuration/xsl_transform.py b/workflow_configuration/xsl_transform.py new file mode 100644 index 0000000..20154ad --- /dev/null +++ b/workflow_configuration/xsl_transform.py @@ -0,0 +1,90 @@ +import sys +import os +from difflib import unified_diff + +import click +from lxml import etree as ET + +from ocrd.decorators import ocrd_loglevel +from ocrd_utils import resource_filename, initLogging, getLogger +from ocrd_models.constants import NAMESPACES +from ocrd_models.utils import xmllint_format + +NAME = os.path.basename(sys.argv[0]) +XSL = 
resource_filename(__package__, NAME + '.xsl')
+assert XSL.exists(), XSL
+
+if NAME.startswith('page-'):
+    TYPE = "PAGE"
+elif NAME.startswith('mets-'):
+    TYPE = "METS"
+else:
+    TYPE = "input"
+HELP = f"""
+\b
+Open {TYPE} file XMLFILE (or stdin) and apply the XSL transformation "{XSL.name}"
+Write the result to stdout, unless...
+-i / --inplace is given - in which case the result is written back to the
+               file silently, or
+-d / --diff is given - in which case the result will be compared to the
+               input and a patch shown on stdout.
+"""
+
+@click.command(context_settings=dict(help_option_names=['-h', '--help']))
+@ocrd_loglevel
+@click.option('-s', '--string-param', multiple=True, metavar='NAME=VALUE', help='set param NAME to string literal VALUE')
+@click.option('-p', '--xpath-param', multiple=True, metavar='NAME=VALUE', help='set param NAME to XPath expression VALUE')
+@click.option('-i', '--inplace', is_flag=True, help='overwrite input file with result of transformation')
+@click.option('-P', '--pretty', is_flag=True, help='pretty-print output (line breaks with indentation')
+@click.option('-d', '--diff', is_flag=True, help='show diff between input and output via pager')
+@click.option('-D', '--dump', is_flag=True, help='just print the transformation stylesheet (XSL)')
+@click.argument('xmlfile', type=click.Path(dir_okay=False, allow_dash=True), required=False)
+def cli(log_level, string_param, xpath_param, inplace, pretty, diff, dump, xmlfile):
+    if dump:
+        click.echo(open(XSL).read())
+        sys.exit(0)
+    initLogging()
+    LOG = getLogger("ocrd.xsl_transform")
+    LOG.info("parsing xsl='%s'", str(XSL))
+    xsl = ET.parse(XSL)
+    xslt = ET.XSLT(xsl)
+    xsltparams = dict()
+    for setting in string_param:
+        key, val = setting.split('=', 1)  # split on the first '=' only: VALUE may itself contain '='
+        xsltparams[key] = ET.XSLT.strparam(val)  # lets lxml do the quoting, so apostrophes in VALUE work
+    for setting in xpath_param:
+        key, val = setting.split('=', 1)  # XPath predicates routinely contain '='
+        xsltparams[key] = ET.XPath(val, namespaces={  # pass the expression itself, not a quoted string literal
+            'page': NAMESPACES['page'],
+            'pc': NAMESPACES['page'],
+            'mets': NAMESPACES['mets']})
+    if not xmlfile or xmlfile == '-':
+        xmlinput = sys.stdin.read()
+    else:
+        xmlinput = open(xmlfile).read()
+        # ET.parse(xmlfile)
+    result = xslt(ET.fromstring(xmlinput.encode("utf-8")), **xsltparams)
+    for error in xslt.error_log:
+        LOG.error(error)
+    if result.getroot() is None:
+        # plain xsl:output
+        ret = str(result)
+    else:
+        root = result.getroot()
+        ret = ET.tostring(ET.ElementTree(root), pretty_print=True, encoding='UTF-8')
+        if pretty:
+            ret = xmllint_format(ret)
+        ret = ret.decode('utf-8')
+    if diff:
+        if pretty:
+            xmlinput = xmllint_format(xmlinput.encode('utf-8')).decode('utf-8')
+        click.echo_via_pager(unified_diff(xmlinput.split('\n'), ret.split('\n')))
+    elif inplace:
+        assert xmlfile and xmlfile != '-'
+        with open(xmlfile, 'w') as output:
+            output.write(ret)
+    else:
+        click.echo(ret)
+
+
+cli.help = HELP
diff --git a/xsl-transform b/xsl-transform
deleted file mode 100644
index aa3c5e8..0000000
--- a/xsl-transform
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env bash
-
-SHAREDIR=$(cd $(dirname "$0") && pwd)
-
-function log {
-    echo >&2 "$(date +%T.%3N) $LEVEL ocrd-import - $1"
-}
-function critical { LEVEL=CRITICAL log "$1"; }
-function error { LEVEL=ERROR log "$1"; }
-function warning { LEVEL=WARNING log "$1"; }
-function info { LEVEL=INFO log "$1"; }
-function debug { LEVEL=DEBUG log "$1"; }
-
-((BASH_VERSINFO<4 || BASH_VERSINFO==4 && BASH_VERSINFO[1]<4)) && critical "bash $BASH_VERSION is too old.
Please install 4.4 or newer" && exit 2 - -name=$(basename $0) -if [[ "$name" =~ ^page- ]]; then - type="PAGE-XML" -elif [[ "$name" =~ ^mets- ]]; then - type="METS-XML" -else - type="input" -fi -parameters=() -pretty=0 -inplace=0 -diff=0 -while (($#)); do - case "${1:--h}" in - -h|-[-]help) - cat <1)) && warning "non-first argument(s) will be ignored: '${@:2}'" -file="${1:--}" - -set -e - -test -e "$SHAREDIR"/$name.xsl - -if test "x$file" = x-; then - file=$(mktemp) - cat > $file -fi -output="$(xmlstarlet tr "$SHAREDIR"/$name.xsl "${parameters[@]}" "$file")" -if ((pretty)); then - output="$(echo "$output" | xmlstarlet fo -s 2 -)" -fi -if ((diff)); then - diff -u <(cat "$file" | if ((pretty)); then xmlstarlet fo -s 2 -; fi) <(echo "$output") -elif ((inplace)); then - echo "$output" > "$file" -else - echo "$output" -fi