diff --git a/README.md b/README.md index 56ef4ce34f..e9fa84fe17 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ * [Command line tools](#command-line-tools) * [`ocrd` CLI](#ocrd-cli) * [`ocrd-dummy` CLI](#ocrd-dummy-cli) +* [Configuration](#configuration) * [Packages](#packages) * [ocrd_utils](#ocrd_utils) * [ocrd_models](#ocrd_models) @@ -81,6 +82,19 @@ supported flags, options and arguments. A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp` +## Configuration + +Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLI support. + +Some parts of the software are configured via environement variables: + +* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification. +* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens: + * `CPU`: Enable CPU profiling of processor runs + * `RSS`: Enable RSS memory profiling + * `PSS`: Enable proportionate memory profiling +* `OCRD_PROFILE_FILE`: If set, then the CPU profile is written to this file for later peruse with a analysis tools like [snakeviz](https://jiffyclub.github.io/snakeviz/) + ## Packages ### ocrd_utils diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index d7dee6e6ec..ff908c3e7f 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ from os.path import isfile +from os import environ import sys import click @@ -89,6 +90,12 @@ def ocrd_cli_wrap_processor( report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) + # Set up profiling behavior from environment variables/flags + if not profile and 'OCRD_PROFILE' in environ: + if 'CPU' in environ['OCRD_PROFILE']: + profile = True + if not profile_file and 'OCRD_PROFILE_FILE' in environ: + profile_file = environ['OCRD_PROFILE_FILE'] if profile or profile_file: import cProfile import pstats diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 90e1706728..e07333eaa2 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -1,10 +1,13 @@ """ Helper methods for running and documenting processors """ +from os import environ from time import perf_counter, process_time import json import inspect from subprocess import run, PIPE +from memory_profiler import memory_usage +from sparklines import sparklines from click import wrap_text from ocrd_utils import getLogger @@ -85,7 +88,23 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() - processor.process() + if any(x in environ.get('OCRD_PROFILE', '') for x in ['RSS', 'PSS']): + backend = 'psutil_pss' if 'PSS' in environ['OCRD_PROFILE'] else 'psutil' + mem_usage = memory_usage(proc=processor.process, + # only run process once + max_iterations=1, + interval=.1, timeout=None, timestamps=True, + # include sub-processes + multiprocess=True, include_children=True, + # get proportional set size instead of RSS + backend=backend) + mem_usage_values = [mem for mem, _ in mem_usage] + mem_output = 'memory consumption: ' + mem_output += ''.join(sparklines(mem_usage_values)) + mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values)) + logProfile.info(mem_output) + else: + processor.process() t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( diff --git a/ocrd/requirements.txt b/ocrd/requirements.txt index ca62ed9370..ad30bc1f83 100644 --- a/ocrd/requirements.txt +++ b/ocrd/requirements.txt @@ -7,4 +7,6 @@ opencv-python-headless Flask jsonschema pyyaml -Deprecated == 1.2.0 \ No newline at end of file +Deprecated == 1.2.0 +memory-profiler >= 0.58.0 +sparklines >= 0.4.2 diff --git a/tests/test_logging.py b/tests/test_logging.py index 0a03f86dff..a051c81a45 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -168,7 +168,9 @@ def testProcessorProfiling(self): # with open('/tmp/debug.log', 'w') as f: # f.write(log_contents) # Check whether profile information has been logged. Dummy should finish in under 0.1s - self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', log_contents)) + # print(log_contents) + assert "Executing processor 'ocrd-test' took 0" in log_contents + # self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', log_contents)) def test_tmpConfigfile(self): self.assertNotEqual(logging.getLogger('').getEffectiveLevel(), logging.NOTSET)