diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index 1d1aeda263..8931a0df97 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -21,6 +21,7 @@ def get_help(self, ctx): from ocrd.decorators import ocrd_loglevel from .zip import zip_cli from .log import log_cli +from .wf import wf_cli @click.group() @click.version_option() @@ -37,3 +38,4 @@ def cli(**kwargs): # pylint: disable=unused-argument cli.add_command(zip_cli) cli.add_command(validate_cli) cli.add_command(log_cli) +cli.add_command(wf_cli) diff --git a/ocrd/ocrd/cli/validate.py b/ocrd/ocrd/cli/validate.py index ed2aea39e2..b3347dd1b2 100644 --- a/ocrd/ocrd/cli/validate.py +++ b/ocrd/ocrd/cli/validate.py @@ -5,7 +5,7 @@ import codecs from ocrd import Resolver, Workspace -from ocrd.task_sequence import ProcessorTask, validate_tasks +from ocrd_models import OcrdWf, OcrdWfStep from ocrd_utils import ( parse_json_string_or_file @@ -16,6 +16,7 @@ PageValidator, ParameterValidator, WorkspaceValidator, + OcrdWfValidator, ) def _inform_of_result(report): @@ -100,8 +101,10 @@ def validate_process(tasks, workspace): ''' Validate a sequence of tasks passable to 'ocrd process' ''' + wf_val = OcrdWfValidator() if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace))) + wf = OcrdWf(steps=[OcrdWfStep.parse(t) for t in tasks]) + _inform_of_result(wf_val.validate(wf, workspace=Workspace(Resolver(), directory=workspace))) else: - for t in [ProcessorTask.parse(t) for t in tasks]: - _inform_of_result(t.validate()) + for t in [OcrdWfStep.parse(t) for t in tasks]: + _inform_of_result(wf_val.step_is_resolveable(t)) diff --git a/ocrd/ocrd/cli/wf.py b/ocrd/ocrd/cli/wf.py new file mode 100644 index 0000000000..9b4f68526e --- /dev/null +++ b/ocrd/ocrd/cli/wf.py @@ -0,0 +1,40 @@ +# import os +# from os.path import relpath, exists, join, isabs +# from pathlib import Path +# import sys +# from glob import glob # XXX 
# ----------------------------------------------------------------------
# ocrd wf
# ----------------------------------------------------------------------

@click.group("wf")
def wf_cli():
    """
    Working with OCRD-WF workflows
    """

# ----------------------------------------------------------------------
# ocrd wf is-well-formed WF_FILE
# ----------------------------------------------------------------------

# NOTE: the docstrings in this block double as click's --help text, so they
# are runtime strings and are deliberately left as they were.
@wf_cli.command('is-well-formed')
@click.argument('wf_file', required=True, type=click.File('r'))
def validate_workspace(wf_file):
    """
    Try to parse an OCRD-WF workflow.
    """
    # Parsing is the whole check: the workflow is well-formed if and only if
    # OcrdWf.parse() accepts it.
    try:
        OcrdWf.parse(wf_file.read())
    except Exception as err:  # pylint: disable=broad-except
        # The parser signals malformed input with several exception types.
        # This is the CLI boundary, so report the reason as a regular click
        # error (message on stderr, non-zero exit status) instead of letting
        # a traceback escape.
        raise click.ClickException("not well-formed: %s" % err) from err
    print("ok - well-formed")
-from ocrd_utils import getLogger, parse_json_string_or_file, set_json_key_value_overrides -# from collections import Counter from ocrd.processor.base import run_cli from ocrd.resolver import Resolver -from ocrd_validators import ParameterValidator, WorkspaceValidator -from ocrd_models import ValidationReport - -class ProcessorTask(): - - @classmethod - def parse(cls, argstr): - tokens = shlex_split(argstr) - executable = 'ocrd-%s' % tokens.pop(0) - input_file_grps = [] - output_file_grps = [] - parameters = {} - while tokens: - if tokens[0] == '-I': - for grp in tokens[1].split(','): - input_file_grps.append(grp) - tokens = tokens[2:] - elif tokens[0] == '-O': - for grp in tokens[1].split(','): - output_file_grps.append(grp) - tokens = tokens[2:] - elif tokens[0] == '-p': - parameters = {**parameters, **parse_json_string_or_file(tokens[1])} - tokens = tokens[2:] - elif tokens[0] == '-P': - set_json_key_value_overrides(parameters, tokens[1:3]) - tokens = tokens[3:] - else: - raise Exception("Failed parsing task description '%s' with tokens remaining: '%s'" % (argstr, tokens)) - return ProcessorTask(executable, input_file_grps, output_file_grps, parameters) - - def __init__(self, executable, input_file_grps, output_file_grps, parameters): - self.executable = executable - self.input_file_grps = input_file_grps - self.output_file_grps = output_file_grps - self.parameters = parameters - self._ocrd_tool_json = None - - @property - def ocrd_tool_json(self): - if self._ocrd_tool_json: - return self._ocrd_tool_json - result = run([self.executable, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True) - self._ocrd_tool_json = json.loads(result.stdout) - return self._ocrd_tool_json - - def validate(self): - if not which(self.executable): - raise Exception("Executable not found in PATH: %s" % self.executable) - if not self.input_file_grps: - raise Exception("Task must have input file group") - # TODO uncomment and adapt once OCR-D/spec#121 lands - # # make 
implicit input/output groups explicit by defaulting to what is - # # provided in ocrd-tool.json - # actual_output_grps = [*self.ocrd_tool_json['output_file_grp']] - # for i, grp in enumerate(self.output_file_grps): - # actual_output_grps[i] = grp - # self.output_file_grps = actual_output_grps - # actual_input_grps = [*self.ocrd_tool_json['input_file_grp']] - # for i, grp in enumerate(self.input_file_grps): - # actual_input_grps[i] = grp - # self.input_file_grps = actual_input_grps - param_validator = ParameterValidator(self.ocrd_tool_json) - report = param_validator.validate(self.parameters) - if not report.is_valid: - raise Exception(report.errors) - # TODO remove once OCR-D/spec#121 lands - if 'output_file_grp' in self.ocrd_tool_json and not self.output_file_grps: - raise Exception("Processor requires output_file_grp but none was provided.") - return report - - def __str__(self): - ret = '%s -I %s -O %s' % ( - self.executable.replace('ocrd-', '', 1), - ','.join(self.input_file_grps), - ','.join(self.output_file_grps)) - if self.parameters: - ret += " -p '%s'" % json.dumps(self.parameters) - return ret -from ocrd_validators import WorkspaceValidator from ocrd_utils import getLogger -from ocrd_models import ValidationReport - -def validate_tasks(tasks, workspace, page_id=None, overwrite=False): - report = ValidationReport() - prev_output_file_grps = workspace.mets.file_groups - - first_task = tasks[0] - first_task.validate() - - # first task: check input/output file groups from METS - WorkspaceValidator.check_file_grp(workspace, first_task.input_file_grps, '' if overwrite else first_task.output_file_grps, page_id, report) - - prev_output_file_grps += first_task.output_file_grps - for task in tasks[1:]: - task.validate() - # check either existing fileGrp or output-file group of previous task matches current input_file_group - for input_file_grp in task.input_file_grps: - if not input_file_grp in prev_output_file_grps: - report.add_error("Input file group not 
contained in METS or produced by previous steps: %s" % input_file_grp) - if not overwrite: - WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report) - # TODO disable output_file_grps checks once CLI parameter 'overwrite' is implemented - # XXX Thu Jan 16 20:14:17 CET 2020 still not sufficiently clever. - # if len(prev_output_file_grps) != len(set(prev_output_file_grps)): - # report.add_error("Output file group specified multiple times: %s" % - # [grp for grp, count in Counter(prev_output_file_grps).items() if count >= 2]) - prev_output_file_grps += task.output_file_grps - if not report.is_valid: - raise Exception("Invalid task sequence input/output file groups: %s" % report.errors) - return report - +from ocrd_validators import OcrdWfValidator +from ocrd_models import OcrdWf, OcrdWfStep def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): resolver = Resolver() workspace = resolver.workspace_from_url(mets) log = getLogger('ocrd.task_sequence.run_tasks') - tasks = [ProcessorTask.parse(task_str) for task_str in task_strs] + steps = [OcrdWfStep.parse(task_str) for task_str in task_strs] + wf = OcrdWf(steps=steps) - validate_tasks(tasks, workspace, page_id, overwrite) + OcrdWfValidator().validate(wf, workspace, page_id=page_id, overwrite=overwrite) # Run the tasks - for task in tasks: + for task in steps: log.info("Start processing task '%s'", task) @@ -150,9 +36,7 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False): # check return code if returncode != 0: - raise Exception("%s exited with non-zero return value %s. STDOUT:\n%s\nSTDERR:\n%s" % (task.executable, returncode, out, err)) - - log.info("Finished processing task '%s'", task) + raise Exception("%s exited with non-zero return value %s. 
class OcrdWf():
    """
    In-memory representation of an OCRD-WF workflow script.

    A workflow consists of ``assignments`` (a dict of ``NAME=value`` lines)
    and ``steps`` (a list of ``OcrdWfStep``, one per processor call).
    """

    def __init__(self, steps=None, assignments=None):
        """
        Create a workflow from already parsed parts.

        Args:
            steps (list): the workflow steps, defaults to a new empty list
            assignments (dict): the variable assignments, defaults to a new
                empty dict
        """
        # Compare against None (not truthiness) so that an empty container
        # supplied by the caller is kept instead of silently replaced.
        self.steps = [] if steps is None else steps
        self.assignments = {} if assignments is None else assignments

    @staticmethod
    def parse_file(fname):
        """
        Read the UTF-8 encoded file ``fname`` and parse it with ``parse``.
        """
        with open(fname, mode='r', encoding='utf-8') as f:
            return OcrdWf.parse(f.read())

    @staticmethod
    def parse(src):
        """
        Parse the workflow source string ``src``.

        The first line must start with the OCRD-WF shebang and is otherwise
        ignored. Blank lines and lines whose first non-blank character is
        ``#`` are dropped, lines ending in a backslash are joined with the
        following line, and every resulting line is either an assignment
        (``NAME=value``) or a step description.

        Returns:
            OcrdWf: the parsed workflow

        Raises:
            ValueError: if ``src`` does not start with the shebang or ends
                with a dangling line continuation
        """
        if not src.startswith(OCRD_WF_SHEBANG):
            raise ValueError("OCRD-WF does not begin with '%s'!" % OCRD_WF_SHEBANG)
        # Accept both LF and CRLF line endings; skip the shebang line.
        body = re.split(r'\r?\n', src)[1:]
        # Blank lines and comments are removed *before* continuations are
        # joined, so a comment may sit between two continued lines.
        significant = [
            line for line in body
            if line.strip() and not line.lstrip().startswith('#')
        ]
        assignments = {}
        steps = []
        for line in OcrdWf._join_continuations(significant):
            assignment = OcrdWf._split_assignment(line)
            if assignment is None:
                steps.append(OcrdWfStep.parse(line))
            else:
                key, value = assignment
                assignments[key] = value
        return OcrdWf(assignments=assignments, steps=steps)

    @staticmethod
    def _join_continuations(lines):
        """
        Merge every line ending in a backslash with the line following it.

        The backslash and the whitespace before it are removed, and the
        leading whitespace of the continuation is replaced by one space.

        Raises:
            ValueError: if the last line ends with a backslash
        """
        joined = []
        pending = None  # text collected so far for an unfinished logical line
        for line in lines:
            current = line if pending is None else pending + ' ' + line.lstrip()
            if current.endswith('\\'):
                pending = current[:-1].rstrip()
            else:
                joined.append(current)
                pending = None
        if pending is not None:
            raise ValueError("OCRD-WF ends with a dangling line continuation: '%s'" % pending)
        return joined

    @staticmethod
    def _split_assignment(line):
        """
        Return ``(name, value)`` if ``line`` is an assignment, else ``None``.

        Only the first ``=`` separates name and value, so the value itself
        may contain further ``=`` characters.
        """
        if not re.match(r'^[A-Za-z][A-Za-z0-9]*=', line):
            return None
        name, _, value = line.partition('=')
        return name, value

    def __str__(self):
        """
        Serialize as shebang line, then assignments, then steps, one per line.
        """
        lines = [OCRD_WF_SHEBANG]
        lines += ['%s=%s' % (k, v) for k, v in self.assignments.items()]
        lines += [str(step) for step in self.steps]
        return ''.join('%s\n' % line for line in lines)
class OcrdWfStep():
    """
    A single workflow step: one processor call with its file groups and
    parameters.
    """

    @classmethod
    def parse(cls, argstr):
        """
        Parse a step description such as ``sample-processor -I IN -O OUT``.

        The first token names the executable; ``ocrd-`` is prepended unless
        already present. Recognized options:

        * ``-I GRP[,GRP...]``  input file groups
        * ``-O GRP[,GRP...]``  output file groups
        * ``-p JSON_OR_FILE``  parameters, merged into those parsed so far
        * ``-P KEY VALUE``     a single parameter override

        Returns:
            an instance of ``cls``

        Raises:
            ValueError: if the description is empty, contains an unknown
                token, or an option lacks its argument(s)
        """
        def failure(remaining):
            return ValueError("Failed parsing task description '%s' with tokens remaining: '%s'" % (argstr, remaining))

        tokens = shlex_split(argstr)
        if not tokens:
            raise failure(tokens)
        executable = tokens.pop(0)
        if not executable.startswith('ocrd-'):
            executable = 'ocrd-%s' % executable
        input_file_grps = []
        output_file_grps = []
        parameters = {}
        # number of arguments each option consumes
        arity = {'-I': 1, '-O': 1, '-p': 1, '-P': 2}
        while tokens:
            option = tokens[0]
            nargs = arity.get(option)
            # Unknown token, or not enough tokens left for the arguments.
            if nargs is None or len(tokens) <= nargs:
                raise failure(tokens)
            args, tokens = tokens[1:nargs + 1], tokens[nargs + 1:]
            if option == '-I':
                input_file_grps.extend(args[0].split(','))
            elif option == '-O':
                output_file_grps.extend(args[0].split(','))
            elif option == '-p':
                parameters = {**parameters, **parse_json_string_or_file(args[0])}
            else:  # '-P'
                set_json_key_value_overrides(parameters, args)
        # cls(...) rather than the hard-coded class, so that subclasses get
        # instances of their own type
        return cls(executable, input_file_grps, output_file_grps, parameters)

    def __init__(self, executable, input_file_grps, output_file_grps, parameters):
        """
        Args:
            executable (str): name of the processor executable
            input_file_grps (list): names of the input file groups
            output_file_grps (list): names of the output file groups
            parameters (dict): the processor parameters
        """
        self.executable = executable
        self.input_file_grps = input_file_grps
        self.output_file_grps = output_file_grps
        self.parameters = parameters
        # cache for the ocrd_tool_json property
        self._ocrd_tool_json = None

    @property
    def ocrd_tool_json(self):
        """
        The JSON printed by ``<executable> --dump-json``, parsed.

        The executable is run on first access only; the result is cached.
        ``subprocess.run`` is called with ``check=True``, so a non-zero exit
        status raises ``subprocess.CalledProcessError``.
        """
        if self._ocrd_tool_json is None:
            result = run([self.executable, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True)
            self._ocrd_tool_json = json.loads(result.stdout)
        return self._ocrd_tool_json

    def __str__(self):
        """
        Render the step as a shell-quoted command line.

        File groups are only emitted when non-empty; every parameter is
        written as ``-P KEY JSON_VALUE``.
        """
        words = [self.executable]
        if self.input_file_grps:
            words += ['-I', ','.join(self.input_file_grps)]
        if self.output_file_grps:
            words += ['-O', ','.join(self.output_file_grps)]
        for key, value in self.parameters.items():
            words += ['-P', key, json.dumps(value)]
        return ' '.join(quote(word) for word in words)
class OcrdWfValidator():
    """
    Validate OCRD-WF workflows and their steps.

    "Resolveable" means the executable of a step is found in PATH and the
    step's parameters validate against the tool description; "consistent"
    means the input/output file groups of the steps fit together and fit
    the workspace.
    """

    def __init__(self):
        pass

    def validate(self, wf, workspace, overwrite=False, page_id=None):
        """
        Check that ``wf`` is resolveable and consistent with ``workspace``.

        Returns:
            ValidationReport: the merged report of both checks

        Raises:
            Exception: if either check finds errors
        """
        report = self.is_resolveable(wf)
        # Stop before the consistency checks: they read step.ocrd_tool_json,
        # which cannot be obtained for a step that is not resolveable.
        if not report.is_valid:
            raise Exception(report.errors)
        report.merge_report(self.is_consistent(wf, workspace, overwrite=overwrite, page_id=page_id))
        if not report.is_valid:
            raise Exception(report.errors)
        return report

    def is_resolveable(self, wf):
        """
        Collect the ``step_is_resolveable`` reports of all steps of ``wf``.
        """
        report = ValidationReport()
        for step in wf.steps:
            report.merge_report(self.step_is_resolveable(step))
        return report

    def is_consistent(self, wf, workspace, overwrite=False, page_id=None):
        """
        Check the file groups of all steps against each other and the METS.

        The input groups of the first step are checked against the workspace
        via ``WorkspaceValidator.check_file_grp``; the input groups of every
        later step must exist in the METS or be produced by an earlier step.
        Unless ``overwrite`` is set, the output groups of every step are
        passed to ``WorkspaceValidator.check_file_grp`` as well.

        Returns:
            ValidationReport: the (valid) report; a workflow without steps
            is trivially consistent

        Raises:
            Exception: if any inconsistency was found
        """
        report = ValidationReport()
        if not wf.steps:
            return report
        # Copy, so that extending the list can never modify the workspace.
        available_file_grps = list(workspace.mets.file_groups)

        first_step = wf.steps[0]
        report.merge_report(self.step_is_consistent(first_step))
        # first step: check input/output file groups from METS
        WorkspaceValidator.check_file_grp(workspace, first_step.input_file_grps, '' if overwrite else first_step.output_file_grps, page_id, report)
        available_file_grps += first_step.output_file_grps

        for step in wf.steps[1:]:
            report.merge_report(self.step_is_consistent(step))
            # each input file group must already exist in the METS or be the
            # output file group of a previous step
            for input_file_grp in step.input_file_grps:
                if input_file_grp not in available_file_grps:
                    report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp)
            if not overwrite:
                WorkspaceValidator.check_file_grp(workspace, [], step.output_file_grps, page_id, report)
            # TODO detect output file groups specified multiple times
            available_file_grps += step.output_file_grps
        if not report.is_valid:
            raise Exception("Invalid task sequence input/output file groups: %s" % report.errors)
        return report

    def step_is_resolveable(self, step):
        """
        Check that the executable of ``step`` is in PATH and that its
        parameters validate against ``step.ocrd_tool_json``.

        Returns:
            ValidationReport: errors are reported, not raised
        """
        if not which(step.executable):
            report = ValidationReport()
            report.add_error("Unresolveable! Executable not found in PATH: %s" % step.executable)
            return report
        # TODO once OCR-D/spec#121 lands: make implicit input/output groups
        # explicit by defaulting to what is provided in ocrd-tool.json
        return ParameterValidator(step.ocrd_tool_json).validate(step.parameters)

    def step_is_consistent(self, step):
        """
        Check that ``step`` has the file groups it needs.

        Reads ``step.ocrd_tool_json``, so the step must be resolveable.

        Returns:
            ValidationReport: errors are reported, not raised
        """
        report = ValidationReport()
        if not step.input_file_grps:
            report.add_error("Inconsistent: Task must have input file group")
        # TODO remove once OCR-D/spec#121 lands
        if 'output_file_grp' in step.ocrd_tool_json and not step.output_file_grps:
            report.add_error("Inconsistent: Processor requires output_file_grp but none was provided.")
        return report
class TestOcrdWf(TestCase):
    """Parsing of OCRD-WF sources with OcrdWf.parse and OcrdWf.parse_file."""

    # Workflow used by both line-continuation tests: the second step is
    # spread over several lines with a comment line in between.
    _CONTINUED_WF = "\n".join([
        OCRD_WF_SHEBANG,
        "ocrd-sample-processor",
        "sample-processor \\",
        " -P foo bar \\",
        " # a comment interspersed",
        " -P bar foo",
        "",
    ])

    # String form of the steps expected from _CONTINUED_WF
    _CONTINUED_STEPS = [
        'ocrd-sample-processor',
        "ocrd-sample-processor -P foo '\"bar\"' -P bar '\"foo\"'"
    ]

    def test_parse_minimal(self):
        # the shebang alone is a valid, empty workflow
        parsed = OcrdWf.parse(OCRD_WF_SHEBANG)
        self.assertEqual(parsed.steps, [])
        self.assertEqual(parsed.assignments, {})

    def test_parse_assignment(self):
        parsed = OcrdWf.parse("%s\nfoo=bar" % OCRD_WF_SHEBANG)
        self.assertEqual(parsed.steps, [])
        self.assertEqual(parsed.assignments, {'foo': 'bar'})

    def test_parse_comments(self):
        # comment lines, indented or not, contribute nothing
        parsed = OcrdWf.parse("%s\n# foo\n # bar" % OCRD_WF_SHEBANG)
        self.assertEqual(parsed.steps, [])
        self.assertEqual(parsed.assignments, {})

    def test_parse_steps_and_assignments(self):
        source = "\n".join([
            OCRD_WF_SHEBANG,
            "ocrd-sample-processor",
            "foo=bar",
            "sample-processor",
            "",
        ])
        parsed = OcrdWf.parse(source)
        self.assertEqual(parsed.assignments, {'foo': 'bar'})
        self.assertEqual([str(step) for step in parsed.steps], [
            'ocrd-sample-processor',
            'ocrd-sample-processor'
        ])

    def test_parse_line_continuation(self):
        parsed = OcrdWf.parse(self._CONTINUED_WF)
        self.assertEqual([str(step) for step in parsed.steps], self._CONTINUED_STEPS)

    def test_parse_line_continuation_from_file(self):
        with TemporaryDirectory() as tempdir:
            wf_path = join(tempdir, 'test.ocrd.wf')
            with open(wf_path, 'w') as wf_file:
                wf_file.write(self._CONTINUED_WF)
            parsed = OcrdWf.parse_file(wf_path)
            self.assertEqual([str(step) for step in parsed.steps], self._CONTINUED_STEPS)
TestCase, main, assets, copy_of_directory from ocrd_utils import pushd_popd, MIMETYPE_PAGE from ocrd.resolver import Resolver -from ocrd.task_sequence import run_tasks, validate_tasks, ProcessorTask - -class TestOcrdWfStep(TestCase): +from ocrd_validators import OcrdWfValidator +from ocrd.task_sequence import run_tasks +from ocrd_models import OcrdWf +import ocrd_models + +class OcrdWfStep(ocrd_models.OcrdWfStep): + + def validate(self): + wf_val = OcrdWfValidator() + report = wf_val.step_is_resolveable(self) + if not report.is_valid: + raise Exception(report.errors) + +SAMPLE_NAME = 'ocrd-sample-processor' +SAMPLE_OCRD_TOOL_JSON = '''{ + "executable": "ocrd-sample-processor", + "description": "Do stuff and things", + "categories": ["Image foobaring"], + "steps": ["preprocessing/optimization/foobarization"], + "input_file_grp": ["OCR-D-IMG"], + "output_file_grp": ["OCR-D-IMG-BIN", "SECOND_OUT"], + "parameters": { + "param1": { + "type": "boolean", + "default": false, + "description": "param1 description" + } + } +}''' + +SAMPLE_NAME_REQUIRED_PARAM = 'sample-processor-required-param' +SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM = json.loads(SAMPLE_OCRD_TOOL_JSON) +del SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM['parameters']['param1']['default'] +SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM['executable'] = 'ocrd-' + SAMPLE_NAME_REQUIRED_PARAM +SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM['parameters']['param1']['required'] = True +SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM['input_file_grp'] += ['SECOND_IN'] +SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM = json.dumps(SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM) + +PARAM_JSON = '{"foo": 42}' + +class TestTaskSequence(TestCase): + + def tearDown(self): + rmtree(self.tempdir) + + def setUp(self): + self.tempdir = mkdtemp(prefix='ocrd-task-sequence-') + self.param_fname = join(self.tempdir, 'params.json') + with open(self.param_fname, 'w') as f: + f.write(PARAM_JSON) + + p = Path(self.tempdir, SAMPLE_NAME) + p.write_text("""\ +#!/usr/bin/env python +print('''%s''') + """ 
% SAMPLE_OCRD_TOOL_JSON) + p.chmod(0o777) + + p = Path(self.tempdir, 'ocrd-' + SAMPLE_NAME_REQUIRED_PARAM) + p.write_text("""\ +#!/usr/bin/env python +print('''%s''') + """ % SAMPLE_OCRD_TOOL_JSON_REQUIRED_PARAM) + p.chmod(0o777) + + os.environ['PATH'] = os.pathsep.join([self.tempdir, os.environ['PATH']]) + # from distutils.spawn import find_executable as which # pylint: disable=import-error,no-name-in-module + # self.assertTrue(which('ocrd-sample-processor')) def test_parse_no_in(self): - task = ProcessorTask.parse('sample-processor') - with self.assertRaisesRegex(Exception, 'must have input file group'): - task.validate() + task = OcrdWfStep.parse('sample-processor') + self.assertIn('must have input file group', OcrdWfValidator().step_is_consistent(task).errors[0]) # XXX no longer an error since we're relying on ocrd-tool.json info for # output file groups # def test_parse_no_out(self): - # task = ProcessorTask.parse('sample-processor -I IN') + # task = OcrdWfStep.parse('sample-processor -I IN') # with self.assertRaisesRegex(Exception, 'Processor requires output_file_grp but none was provided.'): # task.validate() # # this should validate - # task2 = ProcessorTask.parse('sample-processor-without-file-grp -I IN') + # task2 = OcrdWfStep.parse('sample-processor-without-file-grp -I IN') # self.assertTrue(task2.validate()) def test_parse_implicit_after_validate(self): - task = ProcessorTask.parse('%s -I IN -O OUT -p \'{"param1": true}\'' % SAMPLE_NAME_REQUIRED_PARAM) - task.validate() + task = OcrdWfStep.parse('%s -I IN -O OUT -p \'{"param1": true}\'' % SAMPLE_NAME_REQUIRED_PARAM) + self.assertTrue(OcrdWfValidator().step_is_resolveable(task).is_valid) # TODO uncomment and adapt once OCR-D/spec#121 lands # self.assertEqual(task.input_file_grps, ['IN', 'SECOND_IN']) # self.assertEqual(task.output_file_grps, ['OUT', 'SECOND_OUT']) @@ -42,42 +104,42 @@ def test_parse_implicit_after_validate(self): def test_parse_unknown(self): with self.assertRaisesRegex(Exception, 
'Failed parsing task description'): - ProcessorTask.parse('sample-processor -x wrong wrong wrong') + OcrdWfStep.parse('sample-processor -x wrong wrong wrong') def test_parse_ok(self): - task_str = 'sample-processor -I IN -O OUT -p %s' % self.param_fname - task = ProcessorTask.parse(task_str) + task_str = 'ocrd-sample-processor -I IN -O OUT -p %s' % self.param_fname + task = OcrdWfStep.parse(task_str) self.assertEqual(task.executable, 'ocrd-sample-processor') self.assertEqual(task.input_file_grps, ['IN']) self.assertEqual(task.output_file_grps, ['OUT']) self.assertEqual(json.dumps(task.parameters), PARAM_JSON) - self.assertEqual(str(task), task_str.replace(self.param_fname, "'%s'" % PARAM_JSON)) + self.assertEqual(str(task), task_str.replace('-p %s' % self.param_fname, "-P foo 42")) def test_parse_repeated_params(self): task_str = 'sample-processor -I IN -O OUT -p %s -P foo 23' % self.param_fname - task = ProcessorTask.parse(task_str) + task = OcrdWfStep.parse(task_str) self.assertEqual(task.parameters, {'foo': 23}) def test_parse_parameter_none(self): - task_str = 'sample-processor -I IN -O OUT1,OUT2' - task = ProcessorTask.parse(task_str) + task_str = 'ocrd-sample-processor -I IN -O OUT1,OUT2' + task = OcrdWfStep.parse(task_str) self.assertEqual(task.parameters, {}) self.assertEqual(str(task), task_str) def test_fail_validate_param(self): - task = ProcessorTask.parse('sample-processor -I IN -O OUT -p %s' % self.param_fname) - with self.assertRaisesRegex(Exception, r"Additional properties are not allowed \('foo' was unexpected\)"): - task.validate() + task = OcrdWfStep.parse('sample-processor -I IN -O OUT -p %s' % self.param_fname) + report = OcrdWfValidator().step_is_resolveable(task) + self.assertIn("Additional properties are not allowed ('foo' was unexpected)", str(report.errors)) def test_fail_validate_executable(self): - task = ProcessorTask.parse('no-such-processor -I IN') - with self.assertRaisesRegex(Exception, 'Executable not found in '): - task.validate() 
+ task = OcrdWfStep.parse('no-such-processor -I IN') + report = OcrdWfValidator().step_is_resolveable(task) + self.assertIn('Executable not found in ', str(report.errors)) def test_required_param(self): - task = ProcessorTask.parse('%s -I IN -O OUT' % SAMPLE_NAME_REQUIRED_PARAM) - with self.assertRaisesRegex(Exception, "'param1' is a required property"): - task.validate() + task = OcrdWfStep.parse('%s -I IN -O OUT' % SAMPLE_NAME_REQUIRED_PARAM) + report = OcrdWfValidator().step_is_resolveable(task) + self.assertIn("'param1' is a required property", str(report.errors)) def test_validate_sequence(self): resolver = Resolver() @@ -87,15 +149,17 @@ def test_validate_sequence(self): params_path.write_text('{"param1": true}') with self.assertRaisesRegex(Exception, "Input file group not contained in METS or produced by previous steps: FOO'"): - validate_tasks([ProcessorTask.parse(x) for x in [ - '%s -I OCR-D-IMG -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path), - '%s -I FOO -O OUT2 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path) - ]], workspace) + wf = OcrdWf(steps=[OcrdWfStep.parse(x) for x in [ + '%s -I OCR-D-IMG -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path), + '%s -I FOO -O OUT2 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path) + ]]) + OcrdWfValidator().validate(wf, workspace) with self.assertRaisesRegex(Exception, "Input fileGrp.@USE='IN'. 
not in METS!"): - validate_tasks([ProcessorTask.parse(x) for x in [ + wf = OcrdWf(steps=[OcrdWfStep.parse(x) for x in [ '%s -I IN -O OUT1 -p %s' % (SAMPLE_NAME_REQUIRED_PARAM, params_path), - ]], workspace) + ]]) + OcrdWfValidator().validate(wf, workspace) def test_422(self): """ @@ -104,12 +168,13 @@ def test_422(self): resolver = Resolver() with TemporaryDirectory() as tempdir: workspace = resolver.workspace_from_url(assets.path_to('kant_aufklaerung_1784/data/mets.xml'), dst_dir=tempdir) - validate_tasks([ProcessorTask.parse(x) for x in [ + wf = OcrdWf([OcrdWfStep.parse(x) for x in [ "sample-processor -I OCR-D-IMG -O OCR-D-SEG-BLOCK", "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE", "sample-processor -I OCR-D-SEG-LINE -O OCR-D-SEG-WORD", "sample-processor -I OCR-D-SEG-WORD -O OCR-D-OCR-TESS", - ]], workspace) + ]]) + OcrdWfValidator().validate(wf, workspace) def test_overwrite(self): resolver = Resolver() @@ -118,20 +183,19 @@ def test_overwrite(self): # should fail at step 3 workspace.mets.add_file('OCR-D-SEG-WORD', url='foo/bar', ID='foo', pageId='page1', mimetype='image/tif') with self.assertRaisesRegex(Exception, r"Invalid task sequence input/output file groups: \[\"Output fileGrp\[@USE='OCR-D-SEG-WORD'\] already in METS!\"\]"): - validate_tasks([ProcessorTask.parse(x) for x in [ + OcrdWfValidator().validate(OcrdWf(steps=[OcrdWfStep.parse(x) for x in [ "sample-processor -I OCR-D-IMG -O OCR-D-SEG-BLOCK", "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE", "sample-processor -I OCR-D-SEG-LINE -O OCR-D-SEG-WORD", "sample-processor -I OCR-D-SEG-WORD -O OCR-D-OCR-TESS", - ]], workspace) + ]]), workspace) # should succeed b/c overwrite - validate_tasks([ProcessorTask.parse(x) for x in [ + OcrdWfValidator().validate(OcrdWf(steps=[OcrdWfStep.parse(x) for x in [ "sample-processor -I OCR-D-IMG -O OCR-D-SEG-BLOCK", "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE", "sample-processor -I OCR-D-SEG-LINE -O OCR-D-SEG-WORD", "sample-processor -I 
OCR-D-SEG-WORD -O OCR-D-OCR-TESS", - ]], workspace, overwrite=True) - + ]]), workspace, overwrite=True) def test_task_run(self): resolver = Resolver()