From b320af6cea13eb6e8ea286228f931e90d04a2d5f Mon Sep 17 00:00:00 2001
From: John Wiggins <jwiggins@enthought.com>
Date: Wed, 3 Mar 2021 10:14:55 +0100
Subject: [PATCH] Clean up the benchmark suite

---
 enable/gcbench/bench.py   |  61 ++++++++------
 enable/gcbench/data.py    |  97 +++++++++++++++++++++
 enable/gcbench/publish.py | 173 +++++++++++++++++++++-----------------
 3 files changed, 228 insertions(+), 103 deletions(-)
 create mode 100644 enable/gcbench/data.py

diff --git a/enable/gcbench/bench.py b/enable/gcbench/bench.py
index 1083f7eca..152ba3cd4 100644
--- a/enable/gcbench/bench.py
+++ b/enable/gcbench/bench.py
@@ -14,10 +14,12 @@
 
 import numpy as np
 
+from enable.gcbench.data import BenchResult, BenchTiming
+
 _MAX_DURATION = 1.0
 _SIZE = (512, 512)
 _BACKENDS = {
-    "ui": {
+    "gui": {
         "kiva.agg": "enable.null.image",
         "cairo": "enable.null.cairo",
         "celiagg": "enable.null.celiagg",
@@ -37,8 +39,8 @@ def benchmark(outdir=None):
     """ Benchmark all backends
     """
     suite = gen_suite()
+    results = {btype: {} for btype in _BACKENDS}
 
-    results = {t: {} for t in _BACKENDS}
     for btype, backends in _BACKENDS.items():
         for name, mod_name in backends.items():
             print(f"Benchmarking backend: {name}", end="")
@@ -48,12 +50,13 @@ def benchmark(outdir=None):
                 print(" ... Not available")
                 continue
 
-            # UI backends are checked for performance, File backends are not.
-            if btype == "ui":
+            if btype == "gui":
+                # GUI backends are checked for performance (and features).
                 results[btype][name] = benchmark_backend(
                     suite, name, module, outdir=outdir
                 )
             else:
+                # File backends are checked for feature coverage.
                 # XXX: Use the fact that `name` is the same as the file ext.
                 results[btype][name] = exercise_backend(
                     suite, name, module, extension=name, outdir=outdir
@@ -70,32 +73,35 @@ def benchmark_backend(suite, mod_name, module, outdir=None):
 
     results = {}
     for name, symbol in suite.items():
+        # Result `summary` defaults to "fail"
+        results[name] = result = BenchResult()
+
         print(f"\n\tBenchmark {name}", end="")
         try:
             instance = symbol(gc, module)
         except Exception:
+            print(f" ... Failed", end="")
             continue
 
         if name.endswith("2x"):
             # Double sized
             with gc:
                 gc.scale_ctm(2, 2)
-                stats = gen_timings(gc, instance)
+                timing = gen_timing(gc, instance)
         else:
             # Normal scale
-            stats = gen_timings(gc, instance)
+            timing = gen_timing(gc, instance)
 
-        if stats is None:
+        if timing is None:
             print(f" ... Failed", end="")
-            results[name] = None
             continue
 
-        results[name] = {"times": stats}
+        result.timing = timing
+        result.summary = "success"
         if outdir is not None:
             fname = os.path.join(outdir, f"{mod_name}.{name}.png")
             gc.save(fname)
-            results[name]["format"] = "png"
-            results[name]["filename"] = os.path.basename(fname)
+            result.output = os.path.basename(fname)
 
     print()  # End the line that was left
     return results
@@ -106,11 +112,14 @@ def exercise_backend(suite, mod_name, module, extension, outdir=None):
     """
     GraphicsContext = getattr(module, "GraphicsContext")
 
-    results = {name: None for name in suite}
+    results = {}
     for name, symbol in suite.items():
+        # Result `summary` defaults to "fail"
+        results[name] = result = BenchResult()
+
         # Skip 2x versions
         if name.endswith("2x"):
-            results[name] = {"skip": True}
+            result.summary = "skip"
             continue
 
         # Use a fresh context each time
@@ -120,20 +129,21 @@ def exercise_backend(suite, mod_name, module, extension, outdir=None):
         try:
             instance = symbol(gc, module)
         except Exception:
+            print(f" ... Failed", end="")
             continue
 
         try:
             instance()
+            result.summary = "success"
         except Exception:
             print(f" ... Failed", end="")
             continue
 
-        results[name] = {"times": {}}
         if outdir is not None:
             fname = os.path.join(outdir, f"{mod_name}.{name}.{extension}")
             gc.save(fname)
-            results[name]["format"] = extension
-            results[name]["filename"] = os.path.basename(fname)
+            # Record the output
+            result.output = os.path.basename(fname)
 
     print()  # End the line that was left
     return results
@@ -142,6 +152,7 @@ def exercise_backend(suite, mod_name, module, extension, outdir=None):
 def gen_suite():
     """ Create a suite of benchmarks to run against each backend
     """
+    # Import here so we can use `suite` as a name elsewhere.
     from enable.gcbench import suite
 
     benchmarks = {}
@@ -149,12 +160,12 @@ def gen_suite():
         symbol = getattr(suite, name)
         if inspect.isclass(symbol):
             benchmarks[name] = symbol
-            benchmarks[f"{name} 2x"] = symbol
+            benchmarks[f"{name}_2x"] = symbol
 
     return benchmarks
 
 
-def gen_timings(gc, func):
+def gen_timing(gc, func):
     """ Run a function multiple times and generate some stats
     """
     duration = 0.0
@@ -174,10 +185,10 @@ def gen_timings(gc, func):
         return None
 
     times = np.array(times)
-    return {
-        "mean": times.mean() * 1000,
-        "min": times.min() * 1000,
-        "max": times.max() * 1000,
-        "std": times.std() * 1000,
-        "count": len(times),
-    }
+    return BenchTiming(
+        count=len(times),
+        mean=times.mean() * 1000,
+        minimum=times.min() * 1000,
+        maximum=times.max() * 1000,
+        stddev=times.std() * 1000,
+    )
diff --git a/enable/gcbench/data.py b/enable/gcbench/data.py
new file mode 100644
index 000000000..44cf4a161
--- /dev/null
+++ b/enable/gcbench/data.py
@@ -0,0 +1,97 @@
+# (C) Copyright 2005-2021 Enthought, Inc., Austin, TX
+# All rights reserved.
+#
+# This software is provided without warranty under the terms of the BSD
+# license included in LICENSE.txt and may be redistributed only under
+# the conditions described in the aforementioned license. The license
+# is also available online at http://www.enthought.com/licenses/BSD.txt
+#
+# Thanks for using Enthought open source!
+import os
+
+from traits.api import (
+    Enum, File, Float, HasStrictTraits, Instance, Int, Property, Str
+)
+
+
+class BenchResult(HasStrictTraits):
+    """ Outcome of running one benchmark against one backend.
+    """
+    #: Status of the run. The first value, "fail", is the default!
+    summary = Enum("fail", "skip", "success")
+
+    #: Path to the output file written by the run, if any
+    output = File()
+
+    #: Extension of `output` with its leading dot ("" when there is none)
+    output_format = Property(Str(), observe="output")
+
+    #: Timing results, for runs that were timed
+    timing = Instance("BenchTiming")
+
+    def _get_output_format(self):
+        return os.path.splitext(self.output)[1] if self.output else ""
+
+    def compare_to(self, other):
+        """ Build the BenchComparison of this result against `other`. """
+        return BenchComparison.from_pair(self, baseline=other)
+
+
+class BenchComparison(HasStrictTraits):
+    """ A comparison table entry: one benchmark result versus a baseline.
+    """
+    #: CSS class to use for `td`
+    css_class = Enum("valid", "invalid", "skipped")
+
+    #: The content for the `td`
+    value = Str()
+
+    @classmethod
+    def from_pair(cls, result, baseline=None):
+        """ Create an instance from two BenchResult instances.
+
+        Timed successes show `baseline` mean / `result` mean.  A success
+        with no usable timing on either side shows a check mark instead.
+        Raises RuntimeError if `result.summary` is not a known value.
+        """
+        if result.summary == "fail":
+            return cls(value="\N{HEAVY BALLOT X}", css_class="invalid")
+        if result.summary == "skip":
+            return cls(value="\N{HEAVY MINUS SIGN}", css_class="skipped")
+        if result.summary != "success":
+            raise RuntimeError("Unhandled result `summary`")
+
+        # `baseline` (or its timing) is missing when the reference backend
+        # failed or was not timed; a ratio cannot be computed in that case.
+        timing, base = result.timing, getattr(baseline, "timing", None)
+        if timing is not None and base is not None and timing.mean > 0:
+            relvalue = base.mean / timing.mean
+            return cls(value=f"{relvalue:0.2f}", css_class="valid")
+
+        return cls(value="\N{HEAVY CHECK MARK}", css_class="valid")
+
+
+class BenchTiming(HasStrictTraits):
+    """ The timing results of a single benchmark.
+    """
+    #: How many times the benchmark ran
+    count = Int(0)
+
+    #: avg/min/max/std
+    mean = Float(0.0)
+    minimum = Float(0.0)
+    maximum = Float(0.0)
+    stddev = Float(0.0)
+
+    def to_html(self):
+        """ Format this instance as an HTML <table>, one row per statistic
+        """
+        names = ("mean", "minimum", "maximum", "stddev")
+        cells = [(name, f"{getattr(self, name):0.4f}") for name in names]
+        # A run count is a whole number; don't render it as a float
+        cells.append(("count", f"{self.count:d}"))
+        rows = "\n".join(
+            f"<tr><td>{name.capitalize()}</td><td>{text}</td></tr>"
+            for name, text in cells
+        )
+        return f'<table>{rows}</table>'
diff --git a/enable/gcbench/publish.py b/enable/gcbench/publish.py
index 4163489de..9e8e55f3a 100644
--- a/enable/gcbench/publish.py
+++ b/enable/gcbench/publish.py
@@ -21,11 +21,16 @@
 <style>
   table, th, td {{
     padding: 4px;
+    background: #eee;
     border: 1px solid gray;
     border-collapse: collapse;
   }}
   th {{
-    text-align: left;
+    text-align: center;
+  }}
+  td.valid,td.invalid,td.skipped {{
+    text-align: center;
+    vertical-align: middle;
+  }}
   td.valid {{
     background: lightgreen;
@@ -33,13 +38,19 @@
   td.invalid {{
     background: lightpink;
   }}
-    td.skipped {{
+  td.skipped {{
+    background: inherit;
   }}
 </style>
 <h3>Kiva Backend Benchmark Results</h3>
 <p>
 All results are shown relative to the kiva.agg backend. Numbers less than 1.0
 indicate a slower result and numbers greater than 1.0 indicate a faster result.
+<br><br>
+For backends that aren't timed:<br>
+"\N{HEAVY CHECK MARK}" indicates a successful run<br>
+"\N{HEAVY BALLOT X}" indicates a failed run<br>
+"\N{HEAVY MINUS SIGN}" indicates a skipped run<br>
 </p>
 {comparison_table}
 </body>
@@ -54,11 +65,15 @@
 </head>
 <body>
 <style>
-  table, th, td {{
+  table, td {{
     padding: 4px;
     border: 1px solid gray;
     border-collapse: collapse;
     text-align: left;
+    vertical-align: top;
+  }}
+  th {{
+    text-align: center;
   }}
 </style>
 <p>
@@ -70,9 +85,7 @@
 """
 _TABLE_TEMPLATE = """
 <table>
-<tr>
-{headers}
-</tr>
+<tr>{headers}</tr>
 {rows}
 </table>
 """
@@ -82,21 +95,30 @@ def publish(results, outdir):
     """ Write the test results out as a simple webpage.
     """
     backends = []
-    functions = {}
+    benchmarks = {}
 
-    # Transpose the results so that they're accesible by function.
+    # Transpose the results so that they're accessible by benchmark.
     for btype, backend_results in results.items():
         backends.extend(list(backend_results))
         for bend in backend_results:
-            for name, res in backend_results[bend].items():
-                functions.setdefault(name, {})[bend] = res
+            for benchmark_name, res in backend_results[bend].items():
+                benchmarks.setdefault(benchmark_name, {})[bend] = res
 
+    # Convert each benchmark into an output comparison page and a row for the
+    # comparison table.
     comparisons = {}
-    for name, results in functions.items():
-        _build_function_page(name, results, outdir)
-        # Scale timing values relative to the "kiva.agg" backend implementation
-        comparisons[name] = _format_benchmark(results, "kiva.agg")
-
+    for benchmark_name, benchmark_results in benchmarks.items():
+        _build_output_comparison_page(
+            benchmark_name, benchmark_results, outdir
+        )
+        # Compare each result to the "kiva.agg" result
+        baseline = benchmark_results["kiva.agg"]
+        comparisons[benchmark_name] = {
+            name: result.compare_to(baseline)
+            for name, result in benchmark_results.items()
+        }
+
+    # Fill out the comparison table and write the summary index
     comparison_table = _build_comparison_table(backends, comparisons)
     path = os.path.join(outdir, "index.html")
     with open(path, "w") as fp:
@@ -106,48 +128,49 @@ def publish(results, outdir):
 def _build_comparison_table(backends, comparisons):
     """ Build some table data for comparison of backend performance timings.
     """
-    # All the row data
+    # Headers
+    headers = ["Draw Function"] + backends
+    headers = "\n".join(_th(head) for head in headers)
+
+    # Build the rows
     rows = []
-    for name, stats in comparisons.items():
-        # Start the row off with the name of the function
-        # Link to the table of images created by each backend
-        link = f'<a href="{name}.html">'
-        row = [f"<td>{link}{name}</a></td>"]
+    for benchmark_name, comparisons in comparisons.items():
+        # Start the row off with the name of the benchmark
+        # Link to the benchmark output comparison page
+        row = [_td(_link(f"{benchmark_name}.html", benchmark_name))]
+
+        # Add column entries for the BenchComparisons, ordered by backend
         for bend in backends:
-            # Each backend stat includes a CSS class for table styling
-            stat, klass = stats[bend]
-            row.append(f'<td class="{klass}">{stat}</td>')
-        # Concat all the <td>'s into a single string
-        rows.append("".join(row))
-    # Concat all the <tr>'s into a multiline string.
-    rows = "\n".join(f"<tr>{row}</tr>" for row in rows)
+            comp = comparisons[bend]
+            row.append(f'<td class="{comp.css_class}">{comp.value}</td>')
 
-    # Headers
-    headers = ["Draw Function"] + backends
-    headers = "\n".join(f"<th>{head}</th>" for head in headers)
+        # Concat all the columns into a single table row
+        rows.append(_tr("".join(row)))
+    rows = "\n".join(rows)
 
     # Smash it all together in the template
     return _TABLE_TEMPLATE.format(headers=headers, rows=rows)
 
 
-def _build_function_page(benchmark_name, results, outdir):
+def _build_output_comparison_page(benchmark_name, backend_results, outdir):
     """ Build a page which shows backend outputs next to each other.
     """
+    # Headers
+    headers = ("Backend", "Output", "Timing")
+    headers = "".join(_th(name) for name in headers)
+
     # Build the rows
-    backends = []
-    output_tds, stat_tds = "", ""
-    for backend_name, result in results.items():
-        if result is None or "skip" in result:
+    rows = []
+    for backend_name, result in backend_results.items():
+        # If no file was output, skip
+        if not result.output:
             continue
 
-        backends.append(backend_name)
-        output_tds += f"<td>{_format_output(result)}</td>"
-        stat_tds += f"<td>{_format_stats(result['times'])}</td>"
-
-    rows = f"<tr>{output_tds}</tr>\n<tr>{stat_tds}</tr>"
-
-    # Headers
-    headers = "\n".join(f"<th>{name}</th>" for name in backends)
+        # A row is [Backend | Output | Timing]
+        output = _format_output(result)
+        timing = _format_timing(result)
+        rows.append(_tr(f"{_td(backend_name)}{_td(output)}{_td(timing)}"))
+    rows = "\n".join(rows)
 
     table = _TABLE_TEMPLATE.format(headers=headers, rows=rows)
     content = _IMAGE_PAGE_TEMPLATE.format(
@@ -159,46 +182,40 @@ def _build_function_page(benchmark_name, results, outdir):
         fp.write(content)
 
 
-def _format_benchmark(results, baseline):
-    """ Convert stats for backend benchmark runs into data for a table row.
-    """
-    basevalue = results[baseline]["times"]["mean"]
-    formatted = {}
-    for name, result in results.items():
-        if result is not None:
-            stats = result.get("times", {})
-            if stats:
-                relvalue = basevalue / stats["mean"]
-                formatted[name] = (f"{relvalue:0.2f}", "valid")
-            else:
-                if "skip" in result:
-                    # Benchmark was skipped
-                    formatted[name] = ("\N{HEAVY MINUS SIGN}", "skipped")
-                else:
-                    # No times, but the backend succeeded
-                    formatted[name] = ("\N{HEAVY CHECK MARK}", "valid")
-        else:
-            formatted[name] = ("\N{HEAVY BALLOT X}", "invalid")
-
-    return formatted
-
-
 def _format_output(result):
     """ Convert the output from a single benchmark run into an image embed or
     link.
     """
-    if result["format"] in ("png", "svg"):
-        return f'<img src="{result["filename"]}" />'
+    if result.output_format in (".png", ".svg"):
+        return _img(result.output)
     else:
-        return f'<a href="{result["filename"]}">download</a>'
+        return _link(result.output, "download")
 
 
-def _format_stats(stats):
+def _format_timing(result):
     """ Convert timing stats for a single benchmark run into a table.
     """
-    rows = [
-        f"<tr><td>{key.capitalize()}</td><td>{value:0.4f}</td></tr>"
-        for key, value in stats.items()
-    ]
-    rows = "\n".join(rows)
-    return f"<p>Timings:</p><table>{rows}</table>"
+    if result.timing is None:
+        return ""
+    return result.timing.to_html()
+
+
+# HTML utils
+def _img(src):
+    return f'<img src="{src}" />'
+
+
+def _link(target, text):
+    return f'<a href="{target}">{text}</a>'
+
+
+def _td(data, **attrs):
+    return f"<td>{data}</td>"
+
+
+def _th(data):
+    return f"<th>{data}</th>"
+
+
+def _tr(data):
+    return f"<tr>{data}</tr>"