OCR-D · kba · Aug 15, 2020 · Aug 15, 2020 · Aug 16, 2020 · Aug 16, 2020
diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py
@@ -21,6 +21,7 @@ def get_help(self, ctx):
 from ocrd.decorators import ocrd_loglevel
 from .zip import zip_cli
 from .log import log_cli
+from .wf import wf_cli
 
 @click.group()
 @click.version_option()
@@ -37,3 +38,4 @@ def cli(**kwargs): # pylint: disable=unused-argument
 cli.add_command(zip_cli)
 cli.add_command(validate_cli)
 cli.add_command(log_cli)
+cli.add_command(wf_cli)
diff --git a/ocrd/ocrd/cli/validate.py b/ocrd/ocrd/cli/validate.py
@@ -5,7 +5,7 @@
 import codecs
 
 from ocrd import Resolver, Workspace
-from ocrd.task_sequence import ProcessorTask, validate_tasks
+from ocrd_models import OcrdWf, OcrdWfStep
 
 from ocrd_utils import (
     parse_json_string_or_file
@@ -16,6 +16,7 @@
     PageValidator,
     ParameterValidator,
     WorkspaceValidator,
+    OcrdWfValidator,
 )
 
 def _inform_of_result(report):
@@ -100,8 +101,10 @@ def validate_process(tasks, workspace):
     '''
     Validate a sequence of tasks passable to 'ocrd process'
     '''
+    wf_val = OcrdWfValidator()
     if workspace:
-        _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace)))
+        wf = OcrdWf(steps=[OcrdWfStep.parse(t) for t in tasks])
+        _inform_of_result(wf_val.validate(wf, workspace=Workspace(Resolver(), directory=workspace)))
     else:
-        for t in [ProcessorTask.parse(t) for t in tasks]:
-            _inform_of_result(t.validate())
+        for t in [OcrdWfStep.parse(t) for t in tasks]:
+            _inform_of_result(wf_val.step_is_resolveable(t))
diff --git a/ocrd/ocrd/cli/wf.py b/ocrd/ocrd/cli/wf.py
@@ -0,0 +1,40 @@
+# import os
+# from os.path import relpath, exists, join, isabs
+# from pathlib import Path
+# import sys
+# from glob import glob   # XXX pathlib.Path.glob does not support absolute globs
+import io
+import re
+
+import click
+
+from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
+from ocrd_validators import OcrdWfValidator
+from ocrd_models import OcrdWf, OcrdWfStep
+from ocrd_utils import getLogger, pushd_popd, EXT_TO_MIME
+
+log = getLogger('ocrd.cli.wf')
+
+# ----------------------------------------------------------------------
+# ocrd wf  (command group; subcommands are registered on wf_cli below)
+# ----------------------------------------------------------------------
+
+@click.group("wf")
+def wf_cli():
+    """
+    Working with OCRD-WF workflows
+    """
+
+# ----------------------------------------------------------------------
+# ocrd wf is-well-formed WF_FILE
+# ----------------------------------------------------------------------
+
+@wf_cli.command('is-well-formed')
+@click.argument('wf_file', required=True, type=click.File('r'))
+def validate_workspace(wf_file):
+    """Try to parse an OCRD-WF workflow."""
+    try:
+        OcrdWf.parse(wf_file.read())
+    except ValueError as err:  # ill-formed input: CLI error, not a traceback
+        raise click.ClickException(str(err)) from err
+    print("ok - well-formed")
diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py
@@ -118,7 +118,7 @@ def run_cli(
         args += ['--overwrite']
     log.debug("Running subprocess '%s'", ' '.join(args))
     result = run(args, check=False, stdout=PIPE, stderr=PIPE)
-    return result.returncode, result.stdout, result.stderr
+    return result.returncode, result.stdout.decode('utf-8'), result.stderr.decode('utf-8')
 
 def generate_processor_help(ocrd_tool):
     parameter_help = ''

diff --git a/ocrd/ocrd/task_sequence.py b/ocrd/ocrd/task_sequence.py
@@ -1,136 +1,22 @@
 import json
-from shlex import split as shlex_split
-from distutils.spawn import find_executable as which # pylint: disable=import-error,no-name-in-module
-from subprocess import run, PIPE
-from collections import Counter
 
-from ocrd_utils import getLogger, parse_json_string_or_file, set_json_key_value_overrides
-# from collections import Counter
 from ocrd.processor.base import run_cli
 from ocrd.resolver import Resolver
-from ocrd_validators import ParameterValidator, WorkspaceValidator
-from ocrd_models import ValidationReport
-
-class ProcessorTask():
-
-    @classmethod
-    def parse(cls, argstr):
-        tokens = shlex_split(argstr)
-        executable = 'ocrd-%s' % tokens.pop(0)
-        input_file_grps = []
-        output_file_grps = []
-        parameters = {}
-        while tokens:
-            if tokens[0] == '-I':
-                for grp in tokens[1].split(','):
-                    input_file_grps.append(grp)
-                tokens = tokens[2:]
-            elif tokens[0] == '-O':
-                for grp in tokens[1].split(','):
-                    output_file_grps.append(grp)
-                tokens = tokens[2:]
-            elif tokens[0] == '-p':
-                parameters = {**parameters, **parse_json_string_or_file(tokens[1])}
-                tokens = tokens[2:]
-            elif tokens[0] == '-P':
-                set_json_key_value_overrides(parameters, tokens[1:3])
-                tokens = tokens[3:]
-            else:
-                raise Exception("Failed parsing task description '%s' with tokens remaining: '%s'" % (argstr, tokens))
-        return ProcessorTask(executable, input_file_grps, output_file_grps, parameters)
-
-    def __init__(self, executable, input_file_grps, output_file_grps, parameters):
-        self.executable = executable
-        self.input_file_grps = input_file_grps
-        self.output_file_grps = output_file_grps
-        self.parameters = parameters
-        self._ocrd_tool_json = None
-
-    @property
-    def ocrd_tool_json(self):
-        if self._ocrd_tool_json:
-            return self._ocrd_tool_json
-        result = run([self.executable, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True)
-        self._ocrd_tool_json = json.loads(result.stdout)
-        return self._ocrd_tool_json
-
-    def validate(self):
-        if not which(self.executable):
-            raise Exception("Executable not found in PATH: %s" % self.executable)
-        if not self.input_file_grps:
-            raise Exception("Task must have input file group")
-        # TODO uncomment and adapt once OCR-D/spec#121 lands
-        # # make implicit input/output groups explicit by defaulting to what is
-        # # provided in ocrd-tool.json
-        # actual_output_grps = [*self.ocrd_tool_json['output_file_grp']]
-        # for i, grp in enumerate(self.output_file_grps):
-            # actual_output_grps[i] = grp
-        # self.output_file_grps = actual_output_grps
-        # actual_input_grps = [*self.ocrd_tool_json['input_file_grp']]
-        # for i, grp in enumerate(self.input_file_grps):
-            # actual_input_grps[i] = grp
-        # self.input_file_grps = actual_input_grps
-        param_validator = ParameterValidator(self.ocrd_tool_json)
-        report = param_validator.validate(self.parameters)
-        if not report.is_valid:
-            raise Exception(report.errors)
-        # TODO remove once OCR-D/spec#121 lands
-        if 'output_file_grp' in self.ocrd_tool_json and not self.output_file_grps:
-            raise Exception("Processor requires output_file_grp but none was provided.")
-        return report
-
-    def __str__(self):
-        ret = '%s -I %s -O %s' % (
-            self.executable.replace('ocrd-', '', 1),
-            ','.join(self.input_file_grps),
-            ','.join(self.output_file_grps))
-        if self.parameters:
-            ret += " -p '%s'" % json.dumps(self.parameters)
-        return ret
-from ocrd_validators import WorkspaceValidator
 from ocrd_utils import getLogger
-from ocrd_models import ValidationReport
-
-def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
-    report = ValidationReport()
-    prev_output_file_grps = workspace.mets.file_groups
-
-    first_task = tasks[0]
-    first_task.validate()
-
-    # first task: check input/output file groups from METS
-    WorkspaceValidator.check_file_grp(workspace, first_task.input_file_grps, '' if overwrite else first_task.output_file_grps, page_id, report)
-
-    prev_output_file_grps += first_task.output_file_grps
-    for task in tasks[1:]:
-        task.validate()
-        # check either existing fileGrp or output-file group of previous task matches current input_file_group
-        for input_file_grp in task.input_file_grps:
-            if not input_file_grp in prev_output_file_grps:
-                report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp)
-        if not overwrite:
-            WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report)
-        # TODO disable output_file_grps checks once CLI parameter 'overwrite' is implemented
-        # XXX Thu Jan 16 20:14:17 CET 2020 still not sufficiently clever.
-        #  if len(prev_output_file_grps) != len(set(prev_output_file_grps)):
-        #      report.add_error("Output file group specified multiple times: %s" % 
-        #          [grp for grp, count in Counter(prev_output_file_grps).items() if count >= 2])
-        prev_output_file_grps += task.output_file_grps
-    if not report.is_valid:
-        raise Exception("Invalid task sequence input/output file groups: %s" % report.errors)
-    return report
-
+from ocrd_validators import OcrdWfValidator
+from ocrd_models import OcrdWf, OcrdWfStep
 
 def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
     resolver = Resolver()
     workspace = resolver.workspace_from_url(mets)
     log = getLogger('ocrd.task_sequence.run_tasks')
-    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]
+    steps = [OcrdWfStep.parse(task_str) for task_str in task_strs]
+    wf = OcrdWf(steps=steps)
 
-    validate_tasks(tasks, workspace, page_id, overwrite)
+    OcrdWfValidator().validate(wf, workspace, page_id=page_id, overwrite=overwrite)
 
     # Run the tasks
-    for task in tasks:
+    for task in steps:
 
         log.info("Start processing task '%s'", task)
 
@@ -150,9 +36,7 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
 
         # check return code
         if returncode != 0:
             raise Exception("%s exited with non-zero return value %s. STDOUT:\n%s\nSTDERR:\n%s" % (task.executable, returncode, out, err))
-
-        log.info("Finished processing task '%s'", task)
 
         # reload mets
         workspace.reload_mets()
@@ -161,3 +45,5 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
         for output_file_grp in task.output_file_grps:
             if not output_file_grp in workspace.mets.file_groups:
                 raise Exception("Invalid state: expected output file group not in mets: %s\nSTDOUT:\n%s\nSTDERR:\n%s" % (output_file_grp, out, err))
+
+        log.info("Finished processing task '%s'", task)
diff --git a/ocrd_models/ocrd_models/__init__.py b/ocrd_models/ocrd_models/__init__.py
@@ -6,4 +6,6 @@
 from .ocrd_file import OcrdFile
 from .ocrd_mets import OcrdMets
 from .ocrd_xml_base import OcrdXmlDocument
+from .ocrd_wf import OcrdWf
+from .ocrd_wf_step import OcrdWfStep
 from .report import ValidationReport
diff --git a/ocrd_models/ocrd_models/constants.py b/ocrd_models/ocrd_models/constants.py
@@ -27,6 +27,7 @@
     'TAG_PAGE_TEXTEQUIV',
     'TAG_PAGE_TEXTREGION',
     'REGEX_FILE_ID',
+    'OCRD_WF_SHEBANG',
 ]
 
 REGEX_FILE_ID = re.compile('^[A-Za-z][^:]*$')
@@ -70,3 +71,5 @@
     'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
     'Separator', 'Table', 'Text', 'Unknown'
 ]
+
+OCRD_WF_SHEBANG = '#!/usr/bin/env ocrd-wf'
diff --git a/ocrd_models/ocrd_models/ocrd_wf.py b/ocrd_models/ocrd_models/ocrd_wf.py
@@ -0,0 +1,60 @@
+import io
+import re
+
+from .constants import OCRD_WF_SHEBANG
+from .ocrd_wf_step import OcrdWfStep
+
+class OcrdWf():
+    """An OCRD-WF workflow: variable assignments plus a list of steps."""
+
+    def __init__(self, steps=None, assignments=None):
+        self.steps = steps if steps else []
+        self.assignments = assignments if assignments else {}
+
+    @staticmethod
+    def parse_file(fname):
+        """Read the UTF-8 file `fname` and parse its contents with `parse`."""
+        with io.open(fname, mode='r', encoding='utf-8') as f:
+            return OcrdWf.parse(f.read())
+
+    @staticmethod
+    def parse(src):
+        """
+        Parse the OCRD-WF source string `src` into an `OcrdWf`.
+
+        Blank and `#` lines are dropped, a trailing backslash joins a line with
+        the next, `NAME=value` lines become assignments, the rest are steps.
+        Raises `ValueError` on a missing shebang or a dangling continuation.
+        """
+        if not src.startswith(OCRD_WF_SHEBANG):
+            raise ValueError("OCRD-WF does not begin with '%s'!" % OCRD_WF_SHEBANG)
+        # skip the shebang line, then drop blank lines and comment lines
+        lines = [line for line in src.split("\n")[1:] if not re.match(r'^\s*(#|$)', line)]
+        joined = []
+        n = 0
+        while n < len(lines):
+            line = lines[n]
+            n += 1
+            while line.endswith('\\'):
+                if n >= len(lines):
+                    raise ValueError("OCRD-WF ends with a dangling line continuation!")
+                line = re.sub(r"\s*\\$", "", line) + re.sub(r"^\s*", " ", lines[n])
+                n += 1
+            joined.append(line)
+        assignments = {}
+        steps = []
+        for line in joined:
+            if re.match(r'^[A-Za-z][A-Za-z0-9]*=', line):
+                # maxsplit=1: everything after the first '=' is the value
+                k, v = line.split('=', 1)
+                assignments[k] = v
+            else:
+                steps.append(OcrdWfStep.parse(line))
+        return OcrdWf(assignments=assignments, steps=steps)
+
+    def __str__(self):
+        """Serialize to OCRD-WF source: shebang, assignments, then steps."""
+        lines = [OCRD_WF_SHEBANG]
+        lines += ['%s=%s' % (k, v) for k, v in self.assignments.items()]
+        lines += [str(step) for step in self.steps]
+        return '\n'.join(lines) + '\n'