ARROW-4827: [C++] Implement benchmark comparison #4141
Status: Closed

fsaintjacques wants to merge 31 commits into apache:master from fsaintjacques:ARROW-4827-benchmark-comparison
Commits (31, all authored by fsaintjacques):
- 712d2ed initial commit
- a5ad76d Fix syntax
- a38f49c checkpoint
- 2c0d512 Checkpoint
- 703cf98 commit
- c85661c Add documentation
- 2a81744 Ooops.
- 21b2e14 Add doc and fix bugs
- d6733b6 Formatting
- bc111b2 Removes copied stuff
- 1b02839 Rename --cxx_flags to --cxx-flags
- a281ae8 Various language fixes
- 7696202 Add doc for bin attribute.
- 90578af Add --cmake-extras to build command
- d9692bc Fix splitlines
- 96f9997 Add gitignore entry
- 1949f74 Supports HEAD revisions
- 8845e3e Remove empty __init__.py
- c371921 Fix flake8 warnings
- 71b10e9 Disable python in benchmarks
- 048ba0e Add verbose_third_party
- e676289 Typo
- 2825467 Add --cmake-extras to benchmark-diff command
- dc031bd Support conda toolchain
- 280c93b Update gitignore
- 514e8e4 Introduce RegressionSetArgs
- d8e3c1c Review
- 2a953f1 Missing files
- ee39a1f Move cpp_runner_from_rev_or_path in CppRunner
- e95baf3 Add comments and move stuff
- a047ae4 Satisfy flake8
**New file (+122 lines): benchmark comparison logic**

```python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Define a global regression threshold as 5%. This is purely subjective and
# flawed. This does not track cumulative regression.
DEFAULT_THRESHOLD = 0.05


class BenchmarkComparator:
    """ Compares two benchmarks.

    Encodes the logic of comparing two benchmarks and deciding whether the
    contender induces a regression.
    """

    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD,
                 suite_name=None):
        self.contender = contender
        self.baseline = baseline
        self.threshold = threshold
        self.suite_name = suite_name

    @property
    def name(self):
        return self.baseline.name

    @property
    def less_is_better(self):
        return self.baseline.less_is_better

    @property
    def unit(self):
        return self.baseline.unit

    @property
    def change(self):
        """ Relative change of the contender over the baseline. """
        new = self.contender.value
        old = self.baseline.value

        # A zero baseline admits no meaningful relative change.
        if old == 0:
            return 0.0

        return float(new - old) / abs(old)

    @property
    def confidence(self):
        """ Indicate if a comparison of benchmarks should be trusted. """
        # Placeholder: comparisons are always trusted for now.
        return True

    @property
    def regression(self):
        # Normalize the sign so that a positive adjusted change is always
        # a degradation, then compare against the threshold.
        change = self.change
        adjusted_change = change if self.less_is_better else -change
        return (self.confidence and adjusted_change > self.threshold)

    def compare(self, comparator=None):
        return {
            "benchmark": self.name,
            "change": self.change,
            "regression": self.regression,
            "baseline": self.baseline.value,
            "contender": self.contender.value,
            "unit": self.unit,
            "less_is_better": self.less_is_better,
        }

    def __call__(self, **kwargs):
        return self.compare(**kwargs)


def pairwise_compare(contender, baseline):
    """ Yield (name, (contender, baseline)) pairs for benchmarks found in
    both inputs, matched by name. """
    dict_contender = {e.name: e for e in contender}
    dict_baseline = {e.name: e for e in baseline}

    for name in (dict_contender.keys() & dict_baseline.keys()):
        yield name, (dict_contender[name], dict_baseline[name])


class RunnerComparator:
    """ Compares suites/benchmarks from runners.

    It is up to the caller to ensure that the runners are compatible
    (both from the same language implementation).
    """

    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD):
        self.contender = contender
        self.baseline = baseline
        self.threshold = threshold

    def comparisons(self, suite_filter=None, benchmark_filter=None):
        """ Yield a BenchmarkComparator for every benchmark found in both
        runners, optionally restricted by suite and benchmark filters. """
        contender = self.contender.suites(suite_filter, benchmark_filter)
        baseline = self.baseline.suites(suite_filter, benchmark_filter)
        suites = pairwise_compare(contender, baseline)

        for suite_name, (suite_cont, suite_base) in suites:
            benchmarks = pairwise_compare(
                suite_cont.benchmarks, suite_base.benchmarks)

            for bench_name, (bench_cont, bench_base) in benchmarks:
                yield BenchmarkComparator(bench_cont, bench_base,
                                          threshold=self.threshold,
                                          suite_name=suite_name)
```
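To make the decision rule concrete, here is a minimal sketch (not part of the PR; `FakeBenchmark` is a hypothetical stand-in for the benchmark objects a runner would produce). A contender throughput of 90 against a baseline of 100 is a -10% change; since higher is better here, the sign is flipped and 10% exceeds the 5% default threshold:

```python
# Hypothetical stand-in for the objects a runner produces;
# BenchmarkComparator only needs name/unit/less_is_better/value.
from collections import namedtuple

FakeBenchmark = namedtuple("FakeBenchmark", "name unit less_is_better value")

baseline = FakeBenchmark("SumKernel", "bytes_per_second", False, 100.0)
contender = FakeBenchmark("SumKernel", "bytes_per_second", False, 90.0)

result = BenchmarkComparator(contender, baseline).compare()
print(result["change"])      # -0.1  ((90 - 100) / |100|)
print(result["regression"])  # True  (throughput dropped more than 5%)
```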
**New file (+46 lines): core benchmark model**

```python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pandas as pd


class Benchmark:
    def __init__(self, name, unit, less_is_better, values, stats=None):
        # Note: `stats` is currently unused; statistics are always
        # recomputed from `values`.
        self.name = name
        self.unit = unit
        self.less_is_better = less_is_better
        self.values = pd.Series(values)
        self.statistics = self.values.describe()

    @property
    def value(self):
        # describe() reports the median under the "50%" key.
        median = "50%"
        return float(self.statistics[median])

    def __repr__(self):
        return f"Benchmark[name={self.name},value={self.value}]"


class BenchmarkSuite:
    def __init__(self, name, benchmarks):
        self.name = name
        self.benchmarks = benchmarks

    def __repr__(self):
        name = self.name
        benchmarks = self.benchmarks
        return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]"
```
**New file (+162 lines): google benchmark integration**

```python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from itertools import filterfalse, groupby, tee
import json
import subprocess

from .core import Benchmark
from ..utils.command import Command


def partition(pred, iterable):
    # Adapted from the itertools recipes in Python's documentation.
    t1, t2 = tee(iterable)
    return list(filter(pred, t1)), list(filterfalse(pred, t2))


class GoogleBenchmarkCommand(Command):
    """ Run a google benchmark binary.

    This assumes the binary supports the standard command line options,
    notably `--benchmark_filter`, `--benchmark_format`, etc.
    """

    def __init__(self, benchmark_bin, benchmark_filter=None):
        self.bin = benchmark_bin
        self.benchmark_filter = benchmark_filter

    def list_benchmarks(self):
        argv = ["--benchmark_list_tests"]
        if self.benchmark_filter:
            argv.append(f"--benchmark_filter={self.benchmark_filter}")
        result = self.run(*argv, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
        return str.splitlines(result.stdout.decode("utf-8"))

    def results(self):
        argv = ["--benchmark_format=json", "--benchmark_repetitions=20"]

        if self.benchmark_filter:
            argv.append(f"--benchmark_filter={self.benchmark_filter}")

        return json.loads(self.run(*argv, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE).stdout)


class GoogleBenchmarkObservation:
    """ Represents one run of a single (google c++) benchmark.

    Observations are found when running with `--benchmark_repetitions`.
    Sadly, the format mixes values and aggregates, e.g.

    RegressionSumKernel/32768/0          1 us  1 us  25.8077GB/s
    RegressionSumKernel/32768/0          1 us  1 us  25.7066GB/s
    RegressionSumKernel/32768/0          1 us  1 us  25.1481GB/s
    RegressionSumKernel/32768/0          1 us  1 us  25.846GB/s
    RegressionSumKernel/32768/0          1 us  1 us  25.6453GB/s
    RegressionSumKernel/32768/0_mean     1 us  1 us  25.6307GB/s
    RegressionSumKernel/32768/0_median   1 us  1 us  25.7066GB/s
    RegressionSumKernel/32768/0_stddev   0 us  0 us  288.046MB/s

    As of benchmark v1.4.1 (2019-04-24), the only way to differentiate an
    actual run from the aggregates is to match on the benchmark name: the
    aggregates have `_$agg_name` appended. This class encapsulates the
    logic to separate runs from aggregates. This is hopefully avoided in
    benchmark's master version with a separate json attribute.
    """

    def __init__(self, name, real_time, cpu_time, time_unit, size=None,
                 bytes_per_second=None, **kwargs):
        self._name = name
        self.real_time = real_time
        self.cpu_time = cpu_time
        self.time_unit = time_unit
        self.size = size
        self.bytes_per_second = bytes_per_second

    @property
    def is_agg(self):
        """ Indicate if the observation is a run or an aggregate. """
        suffixes = ["_mean", "_median", "_stddev"]
        return any(self._name.endswith(suffix) for suffix in suffixes)

    @property
    def is_realtime(self):
        """ Indicate if the preferred value is realtime instead of cputime. """
        return self.name.find("/realtime") != -1

    @property
    def name(self):
        name = self._name
        return name.rsplit("_", maxsplit=1)[0] if self.is_agg else name

    @property
    def time(self):
        return self.real_time if self.is_realtime else self.cpu_time

    @property
    def value(self):
        """ Return the benchmark value. """
        return self.bytes_per_second if self.size else self.time

    @property
    def unit(self):
        return "bytes_per_second" if self.size else self.time_unit

    def __repr__(self):
        return f"{self.value}"


class GoogleBenchmark(Benchmark):
    """ A set of GoogleBenchmarkObservations. """

    def __init__(self, name, runs):
        """ Initialize a GoogleBenchmark.

        Parameters
        ----------
        name: str
            Name of the benchmark
        runs: list(GoogleBenchmarkObservation)
            Repetitions of GoogleBenchmarkObservation run.
        """
        self.name = name
        # Exclude google benchmark aggregate artifacts.
        _, runs = partition(lambda b: b.is_agg, runs)
        self.runs = sorted(runs, key=lambda b: b.value)
        unit = self.runs[0].unit
        # If `size` is found in the json dict, then the benchmark is
        # reported in bytes per second, where more is better.
        less_is_better = self.runs[0].size is None
        values = [b.value for b in self.runs]
        super().__init__(name, unit, less_is_better, values)

    def __repr__(self):
        return f"GoogleBenchmark[name={self.name},runs={self.runs}]"

    @classmethod
    def from_json(cls, payload):
        def group_key(x):
            return x.name

        benchmarks = map(lambda x: GoogleBenchmarkObservation(**x), payload)
        groups = groupby(sorted(benchmarks, key=group_key), group_key)
        return [cls(k, list(bs)) for k, bs in groups]
```
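To see how the grouping and aggregate filtering fit together, here is a hypothetical payload shaped like the `benchmarks` array of google benchmark's JSON output (all values invented for the sketch):

```python
payload = [
    {"name": "RegressionSumKernel/32768/0", "real_time": 1.0,
     "cpu_time": 1.0, "time_unit": "us", "size": 32768,
     "bytes_per_second": 25.8e9},
    {"name": "RegressionSumKernel/32768/0", "real_time": 1.1,
     "cpu_time": 1.1, "time_unit": "us", "size": 32768,
     "bytes_per_second": 25.1e9},
    # Aggregate rows like this one are stripped by the is_agg partition.
    {"name": "RegressionSumKernel/32768/0_mean", "real_time": 1.05,
     "cpu_time": 1.05, "time_unit": "us", "size": 32768,
     "bytes_per_second": 25.45e9},
]

benchmarks = GoogleBenchmark.from_json(payload)
# One GoogleBenchmark with two runs: the `_mean` row groups under the
# same stripped name, then is discarded; `size` is present, so values
# are bytes_per_second and higher is better (less_is_better=False).
print(benchmarks)
```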
Review comment: It'd be useful to provide some progress output as each test is run, so users know nothing is hung. Maybe benchmarks could be run one at a time, with messages naming each?
Reply: Feel free to commit, but it would require some more thinking:

1. Rework how to capture results from google benchmark (right now from stdout). We can use `--benchmark_output`; then we'll get "progress" in stdout.
2. archery's stdout is now clobbered with this result, so we'd redirect the previous point's output either into stderr or into the logger.

I'm not very satisfied with either answer. Note that in all cases, you can get some feedback with `--debug`.
Review comment: One way to do it would be:

Re stdout clobbering: the output already seems clobbered by things like `ninja: no work to do.` Maybe it would be better to provide the option to specify filenames for the comparison (and/or benchmark) output json, rather than rely on stdio?
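For reference, google benchmark can already write results to a file, which would reconcile both concerns: stdout keeps the console reporter as a progress indicator while the JSON lands on disk. A minimal sketch of that direction, not code from the PR (`run_with_progress` and its parameters are hypothetical; the flag is spelled `--benchmark_out`, with `--benchmark_out_format` selecting JSON):

```python
# Sketch of the file-based capture discussed above. The console
# reporter keeps writing to stdout, which serves as the progress
# output; the JSON results go to a temporary file instead.
import json
import subprocess
import tempfile


def run_with_progress(benchmark_bin, repetitions=20):
    with tempfile.NamedTemporaryFile(suffix=".json") as out:
        argv = [
            benchmark_bin,
            f"--benchmark_out={out.name}",
            "--benchmark_out_format=json",
            f"--benchmark_repetitions={repetitions}",
        ]
        subprocess.run(argv, check=True)  # progress streams to stdout
        out.seek(0)
        return json.load(out)
```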