Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ocrd/ocrd/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def get_help(self, ctx):
from ocrd.decorators import ocrd_loglevel
from .zip import zip_cli
from .log import log_cli
from .wf import wf_cli

@click.group()
@click.version_option()
Expand All @@ -37,3 +38,4 @@ def cli(**kwargs): # pylint: disable=unused-argument
cli.add_command(zip_cli)
cli.add_command(validate_cli)
cli.add_command(log_cli)
cli.add_command(wf_cli)
11 changes: 7 additions & 4 deletions ocrd/ocrd/cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import codecs

from ocrd import Resolver, Workspace
from ocrd.task_sequence import ProcessorTask, validate_tasks
from ocrd_models import OcrdWf, OcrdWfStep

from ocrd_utils import (
parse_json_string_or_file
Expand All @@ -16,6 +16,7 @@
PageValidator,
ParameterValidator,
WorkspaceValidator,
OcrdWfValidator,
)

def _inform_of_result(report):
Expand Down Expand Up @@ -100,8 +101,10 @@ def validate_process(tasks, workspace):
'''
Validate a sequence of tasks passable to 'ocrd process'
'''
wf_val = OcrdWfValidator()
if workspace:
_inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace)))
wf = OcrdWf(steps=[OcrdWfStep.parse(t) for t in tasks])
_inform_of_result(wf_val.validate(wf, workspace=Workspace(Resolver(), directory=workspace)))
else:
for t in [ProcessorTask.parse(t) for t in tasks]:
_inform_of_result(t.validate())
for t in [OcrdWfStep.parse(t) for t in tasks]:
_inform_of_result(wf_val.step_is_resolveable(t))
40 changes: 40 additions & 0 deletions ocrd/ocrd/cli/wf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# import os
# from os.path import relpath, exists, join, isabs
# from pathlib import Path
# import sys
# from glob import glob # XXX pathlib.Path.glob does not support absolute globs
import io
import re

import click

from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
from ocrd_validators import OcrdWfValidator
from ocrd_models import OcrdWf, OcrdWfStep
from ocrd_utils import getLogger, pushd_popd, EXT_TO_MIME

log = getLogger('ocrd.cli.wf')

# ----------------------------------------------------------------------
# ocrd wf
# ----------------------------------------------------------------------

# Click command group "wf"; sub-commands attach via @wf_cli.command(...).
# NOTE: the docstring below doubles as the group's --help text (click
# convention), so treat it as user-facing output when editing.
@click.group("wf")
def wf_cli():
    """
    Working with OCRD-WF workflows
    """

# ----------------------------------------------------------------------
# ocrd wf is-well-formed WF_FILE
# ----------------------------------------------------------------------

@wf_cli.command('is-well-formed')
@click.argument('wf_file', required=True, type=click.File('r'))
def validate_workspace(wf_file):
    """
    Try to parse an OCRD-WF workflow.
    """
    # click has already opened WF_FILE for reading; slurp it in one go.
    workflow_source = wf_file.read()
    # Parsing is the whole check: OcrdWf.parse raises on input it cannot
    # handle (e.g. a missing shebang), so reaching the print means success.
    OcrdWf.parse(workflow_source)
    print("ok - well-formed")

2 changes: 1 addition & 1 deletion ocrd/ocrd/processor/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def run_cli(
args += ['--overwrite']
log.debug("Running subprocess '%s'", ' '.join(args))
result = run(args, check=False, stdout=PIPE, stderr=PIPE)
return result.returncode, result.stdout, result.stderr
return result.returncode, result.stdout.decode('utf-8'), result.stderr.decode('utf-8')

def generate_processor_help(ocrd_tool):
parameter_help = ''
Expand Down
132 changes: 9 additions & 123 deletions ocrd/ocrd/task_sequence.py
Original file line number Diff line number Diff line change
@@ -1,136 +1,22 @@
import json
from shlex import split as shlex_split
from distutils.spawn import find_executable as which # pylint: disable=import-error,no-name-in-module
from subprocess import run, PIPE
from collections import Counter

from ocrd_utils import getLogger, parse_json_string_or_file, set_json_key_value_overrides
# from collections import Counter
from ocrd.processor.base import run_cli
from ocrd.resolver import Resolver
from ocrd_validators import ParameterValidator, WorkspaceValidator
from ocrd_models import ValidationReport

class ProcessorTask():
    """
    One processor invocation in the syntax accepted by 'ocrd process'.

    Instances carry the ``ocrd-`` prefixed ``executable`` name, the lists
    ``input_file_grps`` and ``output_file_grps`` and the ``parameters`` dict.
    """

    # Number of value tokens each supported option consumes.
    _OPTION_ARITY = {'-I': 1, '-O': 1, '-p': 1, '-P': 2}

    @classmethod
    def parse(cls, argstr):
        """
        Build a task from a description like ``"foo -I IN1,IN2 -O OUT -p '{}'"``.

        The first token is the processor name without its ``ocrd-`` prefix.
        ``-I``/``-O`` take comma-separated file groups, ``-p`` takes a JSON
        string or file merged into the parameters, ``-P`` takes a key and a
        value applied as a parameter override.

        Raises:
            Exception: if the description is empty, contains an unknown
                token, or an option is missing its value(s).
        """
        tokens = shlex_split(argstr)
        if not tokens:
            raise Exception("Failed parsing task description '%s': no processor name given" % argstr)
        executable = 'ocrd-%s' % tokens.pop(0)
        input_file_grps = []
        output_file_grps = []
        parameters = {}
        while tokens:
            option = tokens[0]
            arity = cls._OPTION_ARITY.get(option)
            # Unknown option, or not enough tokens left to satisfy it: report
            # the parse failure instead of letting an IndexError escape.
            if arity is None or len(tokens) <= arity:
                raise Exception("Failed parsing task description '%s' with tokens remaining: '%s'" % (argstr, tokens))
            values = tokens[1:1 + arity]
            tokens = tokens[1 + arity:]
            if option == '-I':
                input_file_grps.extend(values[0].split(','))
            elif option == '-O':
                output_file_grps.extend(values[0].split(','))
            elif option == '-p':
                parameters = {**parameters, **parse_json_string_or_file(values[0])}
            else:
                # '-P KEY VALUE'
                set_json_key_value_overrides(parameters, values)
        return cls(executable, input_file_grps, output_file_grps, parameters)

    def __init__(self, executable, input_file_grps, output_file_grps, parameters):
        self.executable = executable
        self.input_file_grps = input_file_grps
        self.output_file_grps = output_file_grps
        self.parameters = parameters
        # Filled lazily by the ocrd_tool_json property
        self._ocrd_tool_json = None

    @property
    def ocrd_tool_json(self):
        """
        The parsed output of ``<executable> --dump-json``.

        The executable is run on first access only; the result is cached on
        the instance. A failing subprocess raises (``check=True``).
        """
        # Compare against None so that an empty-but-valid result is cached too.
        if self._ocrd_tool_json is None:
            result = run([self.executable, '--dump-json'], stdout=PIPE, check=True, universal_newlines=True)
            self._ocrd_tool_json = json.loads(result.stdout)
        return self._ocrd_tool_json

    def validate(self):
        """
        Check that this task can be executed.

        Returns:
            The report produced by ``ParameterValidator.validate``.

        Raises:
            Exception: if the executable is not in PATH, no input file group
                is set, the parameters are invalid, or the tool description
                has an ``output_file_grp`` entry but no output file group
                was provided.
        """
        if not which(self.executable):
            raise Exception("Executable not found in PATH: %s" % self.executable)
        if not self.input_file_grps:
            raise Exception("Task must have input file group")
        # TODO uncomment and adapt once OCR-D/spec#121 lands
        # # make implicit input/output groups explicit by defaulting to what is
        # # provided in ocrd-tool.json
        # actual_output_grps = [*self.ocrd_tool_json['output_file_grp']]
        # for i, grp in enumerate(self.output_file_grps):
        #     actual_output_grps[i] = grp
        # self.output_file_grps = actual_output_grps
        # actual_input_grps = [*self.ocrd_tool_json['input_file_grp']]
        # for i, grp in enumerate(self.input_file_grps):
        #     actual_input_grps[i] = grp
        # self.input_file_grps = actual_input_grps
        param_validator = ParameterValidator(self.ocrd_tool_json)
        report = param_validator.validate(self.parameters)
        if not report.is_valid:
            raise Exception(report.errors)
        # TODO remove once OCR-D/spec#121 lands
        if 'output_file_grp' in self.ocrd_tool_json and not self.output_file_grps:
            raise Exception("Processor requires output_file_grp but none was provided.")
        return report

    def __str__(self):
        """Render the task back into the string form understood by parse()."""
        ret = '%s -I %s -O %s' % (
            self.executable.replace('ocrd-', '', 1),
            ','.join(self.input_file_grps),
            ','.join(self.output_file_grps))
        if self.parameters:
            ret += " -p '%s'" % json.dumps(self.parameters)
        return ret
from ocrd_validators import WorkspaceValidator
from ocrd_utils import getLogger
from ocrd_models import ValidationReport

def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
    """
    Validate a sequence of tasks against a workspace.

    Every task is validated individually via ``task.validate()``. The first
    task's input (and, unless ``overwrite``, output) file groups are checked
    against the workspace with ``WorkspaceValidator.check_file_grp``. Each
    later task must take its input file groups from the METS or from the
    output of an earlier task.

    Args:
        tasks: non-empty sequence of task objects.
        workspace: workspace whose ``mets.file_groups`` seed the known groups.
        page_id: passed through to ``WorkspaceValidator.check_file_grp``.
        overwrite: if true, output file groups are not checked.

    Returns:
        The (valid) validation report.

    Raises:
        Exception: if ``tasks`` is empty, a task fails its own validation,
            or the report collected errors.
    """
    if not tasks:
        raise Exception("Invalid task sequence: no tasks to validate")
    report = ValidationReport()
    # Copy the list: it is extended below and those additions must not leak
    # back into whatever object workspace.mets.file_groups returned.
    prev_output_file_grps = list(workspace.mets.file_groups)

    first_task = tasks[0]
    first_task.validate()

    # first task: check input/output file groups from METS
    WorkspaceValidator.check_file_grp(workspace, first_task.input_file_grps, '' if overwrite else first_task.output_file_grps, page_id, report)

    prev_output_file_grps += first_task.output_file_grps
    for task in tasks[1:]:
        task.validate()
        # check either existing fileGrp or output-file group of previous task matches current input_file_group
        for input_file_grp in task.input_file_grps:
            if input_file_grp not in prev_output_file_grps:
                report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp)
        if not overwrite:
            WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report)
        # TODO disable output_file_grps checks once CLI parameter 'overwrite' is implemented
        # XXX Thu Jan 16 20:14:17 CET 2020 still not sufficiently clever.
        # if len(prev_output_file_grps) != len(set(prev_output_file_grps)):
        #     report.add_error("Output file group specified multiple times: %s" %
        #         [grp for grp, count in Counter(prev_output_file_grps).items() if count >= 2])
        prev_output_file_grps += task.output_file_grps
    if not report.is_valid:
        raise Exception("Invalid task sequence input/output file groups: %s" % report.errors)
    return report

from ocrd_validators import OcrdWfValidator
from ocrd_models import OcrdWf, OcrdWfStep

def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
resolver = Resolver()
workspace = resolver.workspace_from_url(mets)
log = getLogger('ocrd.task_sequence.run_tasks')
tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]
steps = [OcrdWfStep.parse(task_str) for task_str in task_strs]
wf = OcrdWf(steps=steps)

validate_tasks(tasks, workspace, page_id, overwrite)
OcrdWfValidator().validate(wf, workspace, page_id=page_id, overwrite=overwrite)

# Run the tasks
for task in tasks:
for task in steps:

log.info("Start processing task '%s'", task)

Expand All @@ -150,9 +36,7 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):

# check return code
if returncode != 0:
raise Exception("%s exited with non-zero return value %s. STDOUT:\n%s\nSTDERR:\n%s" % (task.executable, returncode, out, err))

log.info("Finished processing task '%s'", task)
raise Exception("%s exited with non-zero return value %s. STDOUT:\n%s\nSTDERR:\n%s" % (task.executable, returncode, out, err))

# reload mets
workspace.reload_mets()
Expand All @@ -161,3 +45,5 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
for output_file_grp in task.output_file_grps:
if not output_file_grp in workspace.mets.file_groups:
raise Exception("Invalid state: expected output file group not in mets: %s\nSTDOUT:\n%s\nSTDERR:\n%s" % (output_file_grp, out, err))

log.info("Finished processing task '%s'", task)
2 changes: 2 additions & 0 deletions ocrd_models/ocrd_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
from .ocrd_file import OcrdFile
from .ocrd_mets import OcrdMets
from .ocrd_xml_base import OcrdXmlDocument
from .ocrd_wf import OcrdWf
from .ocrd_wf_step import OcrdWfStep
from .report import ValidationReport
3 changes: 3 additions & 0 deletions ocrd_models/ocrd_models/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
'TAG_PAGE_TEXTEQUIV',
'TAG_PAGE_TEXTREGION',
'REGEX_FILE_ID',
'OCRD_WF_SHEBANG',
]

REGEX_FILE_ID = re.compile('^[A-Za-z][^:]*$')
Expand Down Expand Up @@ -70,3 +71,5 @@
'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
'Separator', 'Table', 'Text', 'Unknown'
]

OCRD_WF_SHEBANG = '#!/usr/bin/env ocrd-wf'
60 changes: 60 additions & 0 deletions ocrd_models/ocrd_models/ocrd_wf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import io
import re

from .constants import OCRD_WF_SHEBANG
from .ocrd_wf_step import OcrdWfStep

class OcrdWf():
    """
    An OCRD-WF workflow: variable assignments plus an ordered list of steps.

    The textual form starts with the ``OCRD_WF_SHEBANG`` line, followed by
    ``NAME=value`` assignments and one step per (logical) line.
    """

    def __init__(self, steps=None, assignments=None):
        """
        Args:
            steps: list of workflow steps (defaults to an empty list).
            assignments: dict of variable assignments (defaults to empty).
        """
        self.steps = steps if steps else []
        self.assignments = assignments if assignments else {}

    @staticmethod
    def parse_file(fname):
        """Read ``fname`` as UTF-8 and parse its contents with :meth:`parse`."""
        with io.open(fname, mode='r', encoding='utf-8') as f:
            return OcrdWf.parse(f.read())

    @staticmethod
    def _logical_lines(raw_lines):
        """
        Drop blank and comment lines, then join backslash-continued lines.

        Raises:
            ValueError: if the last remaining line ends with a backslash, so
                there is nothing left to continue with.
        """
        significant = [
            line for line in raw_lines
            if not re.fullmatch(r'^\s*$', line) and not re.match(r"^\s*#", line)
        ]
        joined = []
        idx = 0
        while idx < len(significant):
            current = significant[idx]
            idx += 1
            while current.endswith('\\'):
                if idx >= len(significant):
                    raise ValueError("OCRD-WF ends with a line continuation but no line follows: '%s'" % current)
                # Replace the trailing backslash (and whitespace before it) and
                # the next line's leading whitespace with a single space.
                current = re.sub(r"\s*\\$", "", current) + re.sub(r"^\s*", " ", significant[idx])
                idx += 1
            joined.append(current)
        return joined

    @staticmethod
    def _parse_assignment(line):
        """Return ``(name, value)`` if ``line`` is an assignment, else ``None``."""
        if not re.match(r'^[A-Za-z][A-Za-z0-9]*=', line):
            return None
        # Split on the first '=' only, so the value may itself contain '='.
        name, value = line.split('=', 1)
        return name, value

    @staticmethod
    def parse(src):
        """
        Parse the text of an OCRD-WF workflow.

        Args:
            src: workflow source; must begin with ``OCRD_WF_SHEBANG``.

        Returns:
            An ``OcrdWf`` holding the parsed assignments and steps.

        Raises:
            ValueError: if the shebang is missing or the source ends in a
                dangling line continuation.
        """
        if not src.startswith(OCRD_WF_SHEBANG):
            raise ValueError("OCRD-WF does not begin with '%s'!" % OCRD_WF_SHEBANG)
        assignments = {}
        steps = []
        # The first line is the shebang itself and carries no content.
        for line in OcrdWf._logical_lines(src.split("\n")[1:]):
            assignment = OcrdWf._parse_assignment(line)
            if assignment is None:
                steps.append(OcrdWfStep.parse(line))
            else:
                assignments[assignment[0]] = assignment[1]
        return OcrdWf(assignments=assignments, steps=steps)

    def __str__(self):
        """Serialize as shebang, then assignments, then steps, one per line."""
        lines = [OCRD_WF_SHEBANG]
        lines.extend('%s=%s' % (k, v) for k, v in self.assignments.items())
        lines.extend(str(step) for step in self.steps)
        return '\n'.join(lines) + '\n'
Loading