From 712d2ed3ca2c4b31bd6f095ebe9a1072a8901bf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 2 Apr 2019 12:12:38 -0400
Subject: [PATCH 01/31] initial commit

---
 dev/archery/archery/__init__.py      |   0
 dev/archery/archery/benchmark.py     |  34 ++++++
 dev/archery/archery/cli.py           | 148 +++++++++++++++++++++++
 dev/archery/archery/lang/cpp.py      | 111 ++++++++++++++++++
 dev/archery/archery/utils/cmake.py   | 168 +++++++++++++++++++++++++++
 dev/archery/archery/utils/command.py |  44 +++++++
 dev/archery/archery/utils/git.py     |  51 ++++++++
 dev/archery/archery/utils/logger.py  |  21 ++++
 8 files changed, 577 insertions(+)
 create mode 100644 dev/archery/archery/__init__.py
 create mode 100644 dev/archery/archery/benchmark.py
 create mode 100644 dev/archery/archery/cli.py
 create mode 100644 dev/archery/archery/lang/cpp.py
 create mode 100644 dev/archery/archery/utils/cmake.py
 create mode 100644 dev/archery/archery/utils/command.py
 create mode 100644 dev/archery/archery/utils/git.py
 create mode 100644 dev/archery/archery/utils/logger.py

diff --git a/dev/archery/archery/__init__.py b/dev/archery/archery/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/dev/archery/archery/benchmark.py b/dev/archery/archery/benchmark.py
new file mode 100644
index 00000000000..a901c640922
--- /dev/null
+++ b/dev/archery/archery/benchmark.py
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+
+from .utils.git import GitClone, GitCheckout
+
+
+class BenchmarkSuite:
+    def __init__(self, git_repo, root, revision):
+        self.root = root
+        self.clone_dir = os.path.join(root, "arrow-src")
+        self.revision = revision
+        self.git_repo = git_repo
+        self.setup()
+
+    def setup(self):
+    # Prepare local clone and checkout the proper revision.
+        GitClone("--local", self.git_repo, self.clone_dir)
+        GitCheckout("-b", self.revision, git_dir=self.clone_dir)
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
new file mode 100644
index 00000000000..60a84de243b
--- /dev/null
+++ b/dev/archery/archery/cli.py
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import click
+import logging
+import os
+import tempfile
+
+from .lang.cpp import CppCMakeDefinition, CppConfiguration, CppBenchmarkSuite
+from .utils.git import GitClone, GitCheckout
+from .utils.logger import logger
+
+
+@click.group(name="archery")
+@click.option("--debug", count=True)
+def cli(debug):
+    if debug:
+        logger.setLevel(logging.DEBUG)
+
+
+def contains_arrow_sources(path):
+    cpp_path = os.path.join(path, "cpp")
+    cmake_path = os.path.join(cpp_path, "CMakeLists.txt")
+    return os.path.exists(cmake_path):
+
+def validate_arrow_sources(ctx, param, path):
+    """ Ensure a directory contains Arrow cpp sources. """
+    if not contains_arrow_sources(path):
+        raise click.BadParameter(f"No Arrow C++ sources found in {path}.")
+    return path
+
+
+def resolve_arrow_sources():
+    """ Infer Arrow sources directory from various method.
+
+    The following guesses are done in order:
+
+    1. Checks if the environment variable `ARROW_SRC` is defined and use this.
+       No validation is performed.
+
+    2. Checks if the current working directory (cwd) contains is an Arrow
+       source directory. If successful return this.
+
+    3. Checks if this file (cli.py) is still in the original source repository.
+       If so, returns the relative path to the source directory.
+    """
+    # Explicit via environment variable
+    arrow_env = os.environ.get("ARROW_SRC")
+    if arrow_env:
+        return arrow_env
+
+    # Implicit via cwd
+    arrow_cwd = os.getcwd()
+    if contains_arrow_sources(arrow_cwd):
+        return arrow_cwd
+
+    # Implicit via archery ran from sources, find relative sources from
+    this_dir = os.path.dirname(os.path.realpath(__file__))
+    arrow_via_archery = os.path.join(this_dir, "..", "..", "..")
+    if contains_arrow_sources(arrow_via_archery)
+        return arrow_via_archery
+
+    return None
+
+
+source_dir_type = click.Path(exists=True, dir_okay=True, file_okay=False,
+                             readable=True, resolve_path=True)
+build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True)
+# Supported build types
+build_type = click.Choice(["debug", "relwithdebinfo", "release"],
+                          case_sensitive=False)
+# Supported warn levels
+warn_level_type = click.Choice(["everything", "checkin", "production"],
+                               case_sensitive=False)
+
+
+@cli.command()
+# toolchain
+@click.option("--cc", help="C compiler.")
+@click.option("--cxx", help="C++ compiler.")
+@click.option("--cxx_flags", help="C++ compiler flags.")
+@click.option("--build-type", default="release", type=build_type,
+              help="CMake's CMAKE_BUILD_TYPE")
+@click.option("--warn-level", default="production", type=warn_level_type,
+              help="Controls compiler warnings -W(no-)error.")
+# components
+@click.option("--with-tests", default=True, type=bool,
+              help="Build with tests.")
+@click.option("--with-benchmarks", default=False, type=bool,
+              help="Build with benchmarks.")
+@click.option("--with-python", default=True, type=bool,
+              help="Build with python extension.")
+@click.option("--with-parquet", default=False, type=bool,
+              help="Build with parquet file support.")
+@click.option("--with-gandiva", default=False, type=bool,
+              help="Build with Gandiva expression compiler support.")
+@click.option("--with-plasma", default=False, type=bool,
+              help="Build with Plasma object store support.")
+@click.option("--with-flight", default=False, type=bool,
+              help="Build with Flight rpc support.")
+# misc
+@click.option("-f", "--force", help="Delete existing build directory if found.")
+@click.option("--build-and-test", type=bool, default=False, help="")
+@click.argument("build_dir", type=build_dir_type)
+@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources,
+                callback=validate_arrow_sources, required=False)
+def build(src_dir, build_dir, force, build_and_test, **kwargs):
+    # Arrow's cpp cmake configuration
+    conf = CppConfiguration(**kwargs)
+    # This is a closure around cmake invocation, e.g. calling `def.build()`
+    # yields a directory ready to be run with the generator
+    cpp_path = os.path.join(src_dir, "cpp")
+    cmake_def = CppCMakeDefinition(cpp_path, conf)
+    # Create build directory
+    build = cmake_def.build(build_dir, force=force)
+    if build_and_test:
+        build.all()
+
+
+@cli.command()
+@click.argument("rev_a")
+@click.argument("rev_b")
+@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources,
+                callback=validate_arrow_sources, required=False)
+def benchmark(rev_a, rev_b, src_dir):
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        root_a = os.path.join(tmp_dir, rev_a)
+        bench_a = CppBenchmarkSuite(src_dir, root_a, rev_a)
+
+        root_b = os.path.join(tmp_dir, rev_b)
+        bench_b = CppBenchmarkSuite(src_dir, root_b, rev_b)
+
+if __name__ == "__main__":
+    cli()
diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
new file mode 100644
index 00000000000..a4c458911eb
--- /dev/null
+++ b/dev/archery/archery/lang/cpp.py
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+
+from ..benchmark import BenchmarkSuite
+from ..utils.cmake import CMakeDefinition
+
+
+def thrutifier(value):
+    return "ON" if value else "OFF"
+
+
+def or_else(value, default):
+    return value if value else default
+
+
+class CppConfiguration:
+    def __init__(self,
+                 # toolchain
+                 cc=None, cxx=None, cxx_flags=None,
+                 build_type=None, warn_level=None,
+                 # components
+                 with_tests=True, with_benchmarks=False, with_python=True,
+                 with_parquet=False, with_gandiva=False, with_plasma=False,
+                 with_flight=False):
+        self.cc = cc
+        self.cxx = cxx
+        self.cxx_flags = cxx_flags
+
+        self.build_type = build_type
+        self.warn_level = warn_level
+
+        self.with_tests = with_tests
+        self.with_benchmarks = with_benchmarks
+        self.with_python = with_python
+        self.with_parquet = with_parquet
+        self.with_gandiva = with_gandiva
+        self.with_plasma = with_plasma
+        self.with_flight = with_flight
+
+    def _gen_defs(self):
+        if self.cxx_flags:
+            yield ("ARROW_CXXFLAGS", self.cxx_flags)
+
+        yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug"))
+        yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production"))
+
+        yield ("ARROW_BUILD_TESTS", thrutifier(self.with_tests))
+        yield ("ARROW_BUILD_BENCHMARKS", thrutifier(self.with_benchmarks))
+
+        yield ("ARROW_PYTHON", thrutifier(self.with_python))
+        yield ("ARROW_PARQUET", thrutifier(self.with_parquet))
+        yield ("ARROW_GANDIVA", thrutifier(self.with_gandiva))
+        yield ("ARROW_PLASMA", thrutifier(self.with_plasma))
+        yield ("ARROW_FLIGHT", thrutifier(self.with_flight))
+
+    @property
+    def definitions(self):
+        return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()]
+
+    @property
+    def environment(self):
+        env = os.environ.copy()
+
+        if self.cc:
+            env["CC"] = self.cc
+
+        if self.cxx:
+            env["CXX"] = self.cxx
+
+        return env
+
+
+class CppCMakeDefinition(CMakeDefinition):
+    def __init__(self, source, conf, **kwargs):
+        self.configuration = conf
+        super().__init__(source, **kwargs,
+                         definitions=conf.definitions, env=conf.environment)
+
+
+default_benchmark_conf = CppConfiguration(
+    build_type="release", with_tests=True, with_benchmarks=True)
+
+
+class CppBenchmarkSuite(BenchmarkSuite):
+    def __init__(self, *args, conf=None, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.conf = conf if conf else default_benchmark_conf
+        assert self.conf.build_type is not "debug"
+        self.build_dir = os.path.join(self.root, "build")
+        self.cmake_def = CppCMakeDefinition(
+            os.path.join(self.clone_dir, "cpp"), self.conf)
+        self.build = self.cmake_def.build(self.build_dir)
+        # Build sources
+        self.build.all()
diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py
new file mode 100644
index 00000000000..920d21ee6ed
--- /dev/null
+++ b/dev/archery/archery/utils/cmake.py
@@ -0,0 +1,168 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+from shutil import rmtree, which
+
+from .command import Command
+
+
+class CMake(Command):
+    def __init__(self, cmake_bin=None):
+        self.bin = cmake_bin if cmake_bin else os.environ.get("CMAKE", "cmake")
+
+    @staticmethod
+    def default_generator():
+        """ Infer default generator.
+
+        Gives precedence to ninja if there exists an executable named `ninja`
+        in the search path.
+        """
+        found_ninja = which("ninja")
+        return "Ninja" if found_ninja else "Make"
+
+
+cmake = CMake()
+
+
+class CMakeDefinition:
+    """ CMakeDefinition captures the cmake invocation arguments.
+
+    It allows creating build directories with the same definition, e.g.
+    ```
+    build_1 = cmake_def.build("/tmp/build-1")
+    build_2 = cmake_def.build("/tmp/build-2")
+
+    ...
+
+    build1.all()
+    build2.all()
+    """
+
+    def __init__(self, source, build_type="release", generator=None,
+                 definitions=None, env=None):
+        """ Initialize a CMakeDefinition
+
+        Parameters
+        ----------
+        source : str
+                 Source directory where the top-level CMakeLists.txt is
+                 located. This is usually the root of the project.
+        generator : str, optional
+        definitions: list(str), optional
+        env : dict(str,str), optional
+              Environment to use when invoking cmake. This can be required to
+              work around cmake deficiencies, e.g. CC and CXX.
+        """
+        self.source = os.path.abspath(source)
+        self.generator = generator if generator else cmake.default_generator()
+        self.definitions = definitions if definitions else []
+        self.env = env
+
+    @property
+    def arguments(self):
+        """" Return the arguments to cmake invocation. """
+        arguments = [
+            f"-G{self.generator}",
+        ] + self.definitions + [
+            self.source
+        ]
+        return arguments
+
+    def build(self, build_dir, force=False, **kwargs):
+        """ Invoke cmake into a build directory.
+
+        Parameters
+        ----------
+        build_dir : str
+                    Directory in which the CMake build will be instanciated.
+        force : bool
+                If the build folder exists, delete it before. Otherwise if it's
+                present, an error will be returned.
+        """
+        if os.path.exists(build_dir):
+            # Extra safety to ensure we're deleting a build folder.
+            if not CMakeBuild.is_build_dir(build_dir):
+                raise FileExistsError(f"{build_dir} exists but is not a cmake build, delete manually.")
+            if not force:
+                raise FileExistsError(f"{build_dir} exists pass force to delete")
+            rmtree(build_dir)
+
+        os.mkdir(build_dir)
+
+        cmake(*self.arguments, cwd=build_dir, env=self.env)
+        return CMakeBuild(self, build_dir, **kwargs)
+
+    def __repr__(self):
+        return f"CMakeDefinition[source={self.source}]"
+
+
+class CMakeBuild(Command):
+    """ CMakeBuild represents a build directory initialized by cmake.
+
+    The build instance can be used to build/test/install. It alleviates the
+    user to know which generator is used.
+    """
+
+    def __init__(self, definition, build_dir):
+        """ Initialize a CMakeBuild.
+
+        The caller must ensure that cmake was invoked in the build directory.
+
+        Parameters
+        ----------
+        definition : CMakeDefinition
+                     The definition to build from.
+        build_dir : str
+                    The build directory to setup into.
+        """
+        self.definition = definition
+        self.build_dir = os.path.abspath(build_dir)
+
+    @property
+    def bin(self):
+        return self.definition.generator.lower()
+
+    def run(self, *argv, verbose=False, **kwargs):
+        extra = []
+        if verbose:
+            is_make = self.bin is "make"
+            extra.append("VERBOSE=1" if is_make else "-v")
+        # Commands must be ran under the build directory
+        super().run(*extra, *argv, **kwargs, cwd=self.build_dir)
+        return self
+
+    def all(self):
+        return self.run("all")
+
+    def clean(self):
+        return self.run("clean")
+
+    def install(self):
+        return self.run("install")
+
+    def test(self):
+        return self.run("test")
+
+    @staticmethod
+    def is_build_dir(path):
+        cmake_cache = os.path.join(path, "CMakeCache.txt")
+        cmake_files = os.path.join(path, "CMakeFiles")
+        return os.path.exists(cmake_cache) and os.path.exists(cmake_files)
+
+    def __repr__(self):
+        return f"CMakeBuild[build={self.build_dir},definition={self.definition}]"
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
new file mode 100644
index 00000000000..214de92b7ad
--- /dev/null
+++ b/dev/archery/archery/utils/command.py
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import shutil
+import subprocess
+
+from .logger import logger
+
+
+def find_exec(executable):
+    if os.path.exists(executable):
+        return executable
+
+    return shutil.which(executable)
+
+
+class Command:
+    def bin(self):
+        raise NotImplementedError("Command must implement bin() method")
+
+    def run(self, *argv, **kwargs):
+        invocation = [find_exec(self.bin)]
+        invocation.extend(argv)
+
+        logger.debug(f"Executing `{' '.join(invocation)}`")
+        return subprocess.run(invocation, **kwargs)
+
+    def __call__(self, *argv, **kwargs):
+        self.run(*argv, **kwargs)
diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py
new file mode 100644
index 00000000000..4e0e8fd74d0
--- /dev/null
+++ b/dev/archery/archery/utils/git.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+
+from .command import Command
+
+
+class Git(Command):
+    def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs):
+        self.bin = git_bin if git_bin else os.environ.get("GIT", "git")
+        self.git_dir = git_dir
+        self.run(*argv, **kwargs)
+
+    @property
+    def global_opts(self):
+        if self.git_dir:
+            # For some reason, git does not accept `-Cdir`
+            yield "-C"
+            yield self.git_dir
+
+
+class GitSubCommand(Git):
+    def run(self, *argv, **kwargs):
+        super().run(*self.global_opts, self.cmd, *argv, **kwargs)
+
+
+class GitClone(GitSubCommand):
+    @property
+    def cmd(self):
+        return "clone"
+
+
+class GitCheckout(GitSubCommand):
+    @property
+    def cmd(self):
+        return "checkout"
diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py
new file mode 100644
index 00000000000..ab97204decb
--- /dev/null
+++ b/dev/archery/archery/utils/logger.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("archery")

From a5ad76d111eaec7ae6c3be7553d4bf1bffa440b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 2 Apr 2019 13:33:15 -0400
Subject: [PATCH 02/31] Fix syntax

---
 dev/archery/archery/cli.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 60a84de243b..90ba14478cf 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -35,7 +35,8 @@ def cli(debug):
 def contains_arrow_sources(path):
     cpp_path = os.path.join(path, "cpp")
     cmake_path = os.path.join(cpp_path, "CMakeLists.txt")
-    return os.path.exists(cmake_path):
+    return os.path.exists(cmake_path)
+
 
 def validate_arrow_sources(ctx, param, path):
     """ Ensure a directory contains Arrow cpp sources. """
@@ -71,7 +72,7 @@ def resolve_arrow_sources():
     # Implicit via archery ran from sources, find relative sources from
     this_dir = os.path.dirname(os.path.realpath(__file__))
     arrow_via_archery = os.path.join(this_dir, "..", "..", "..")
-    if contains_arrow_sources(arrow_via_archery)
+    if contains_arrow_sources(arrow_via_archery):
         return arrow_via_archery
 
     return None
@@ -113,22 +114,26 @@ def resolve_arrow_sources():
 @click.option("--with-flight", default=False, type=bool,
               help="Build with Flight rpc support.")
 # misc
-@click.option("-f", "--force", help="Delete existing build directory if found.")
-@click.option("--build-and-test", type=bool, default=False, help="")
+@click.option("-f", "--force",
+              help="Delete existing build directory if found.")
+@click.option("--targets", type=str, multiple=True,
+              help="Generator targets to run")
 @click.argument("build_dir", type=build_dir_type)
 @click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources,
                 callback=validate_arrow_sources, required=False)
-def build(src_dir, build_dir, force, build_and_test, **kwargs):
+def build(src_dir, build_dir, force, targets, **kwargs):
+    src_cpp = os.path.join(src_dir, "cpp")
+
     # Arrow's cpp cmake configuration
     conf = CppConfiguration(**kwargs)
     # This is a closure around cmake invocation, e.g. calling `def.build()`
     # yields a directory ready to be run with the generator
-    cpp_path = os.path.join(src_dir, "cpp")
-    cmake_def = CppCMakeDefinition(cpp_path, conf)
+    cmake_def = CppCMakeDefinition(src_cpp, conf)
     # Create build directory
     build = cmake_def.build(build_dir, force=force)
-    if build_and_test:
-        build.all()
+
+    for target in targets:
+        build.run(target)
 
 
 @cli.command()
@@ -144,5 +149,6 @@ def benchmark(rev_a, rev_b, src_dir):
         root_b = os.path.join(tmp_dir, rev_b)
         bench_b = CppBenchmarkSuite(src_dir, root_b, rev_b)
 
+
 if __name__ == "__main__":
     cli()

From a38f49cd92c6d8e36b85fc6316d311b7d325496a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Fri, 5 Apr 2019 11:15:37 -0400
Subject: [PATCH 03/31] checkpoint

---
 dev/archery/archery/benchmark.py          |  34 ---
 dev/archery/archery/benchmark/__init__.py |   0
 dev/archery/archery/benchmark/core.py     |  80 +++++++
 dev/archery/archery/benchmark/google.py   | 269 ++++++++++++++++++++++
 dev/archery/archery/benchmark/runner.py   |  80 +++++++
 dev/archery/archery/cli.py                | 163 +++++++------
 dev/archery/archery/lang/cpp.py           |  24 +-
 dev/archery/archery/utils/cmake.py        |  28 ++-
 dev/archery/archery/utils/command.py      |  10 +-
 dev/archery/archery/utils/git.py          |   4 +-
 dev/archery/archery/utils/source.py       | 128 ++++++++++
 11 files changed, 682 insertions(+), 138 deletions(-)
 delete mode 100644 dev/archery/archery/benchmark.py
 create mode 100644 dev/archery/archery/benchmark/__init__.py
 create mode 100644 dev/archery/archery/benchmark/core.py
 create mode 100644 dev/archery/archery/benchmark/google.py
 create mode 100644 dev/archery/archery/benchmark/runner.py
 create mode 100644 dev/archery/archery/utils/source.py

diff --git a/dev/archery/archery/benchmark.py b/dev/archery/archery/benchmark.py
deleted file mode 100644
index a901c640922..00000000000
--- a/dev/archery/archery/benchmark.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-
-from .utils.git import GitClone, GitCheckout
-
-
-class BenchmarkSuite:
-    def __init__(self, git_repo, root, revision):
-        self.root = root
-        self.clone_dir = os.path.join(root, "arrow-src")
-        self.revision = revision
-        self.git_repo = git_repo
-        self.setup()
-
-    def setup(self):
-    # Prepare local clone and checkout the proper revision.
-        GitClone("--local", self.git_repo, self.clone_dir)
-        GitCheckout("-b", self.revision, git_dir=self.clone_dir)
diff --git a/dev/archery/archery/benchmark/__init__.py b/dev/archery/archery/benchmark/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py
new file mode 100644
index 00000000000..ccd2630d444
--- /dev/null
+++ b/dev/archery/archery/benchmark/core.py
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+class Statistics:
+    def __init__(self, min, max, mean, stddev, q1, median, q3):
+        self.min = min
+        self.max = max
+        self.mean = mean
+        self.stddev = stddev
+        self.q1 = q1
+        self.median = median
+        self.q3 = q3
+
+    @staticmethod
+    def from_values(values):
+        return None
+
+
+class Benchmark:
+    def __init__(self, name, values, stats=None):
+        self.name = name
+        self.values = values
+        self.statistics = stats if stats else Statistics.from_values(values)
+        self.less_is_better = less_is_better
+
+    @property
+    def value(self):
+        return self.statistics.median
+
+
+def changes(old, new):
+    if old == 0 and new == 0:
+        return 0.0
+    if old == 0:
+        return float(new - old) / (float(old + new) / 2)
+    return float(new - old) / abs(old)
+
+
+class BenchmarkSuite:
+    def __init__(self, name, benchmarks):
+        self.name = name
+        self.benchmarks = benchmarks
+
+
+def default_comparator(old, new, threshold=0.05):
+    # negative change is better, positive is tolerable until it exceeds
+    # threshold
+    return changes(old, new) < threshold
+
+
+class BenchmarkComparator:
+    def __init__(self, baseline, contender):
+        self.baseline = baseline
+        self.contender = contender
+
+    def compare(self, comparator=None):
+        comparator = comparator if comparator else default_comparator
+        return comparator(self.baseline.value, self.contender.value)
+
+    def confidence(self):
+        """ Indicate if a comparison of benchmarks should be trusted. """
+        return True
+
+    def __call__(self, **kwargs):
+        return self.compare(**kwargs)
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
new file mode 100644
index 00000000000..15a65f819ec
--- /dev/null
+++ b/dev/archery/archery/benchmark/google.py
@@ -0,0 +1,269 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cmp_to_key, lru_cache
+from itertools import filterfalse, groupby, tee
+import json
+import six
+from socket import gethostname
+import subprocess
+import sys
+from uuid import getnode
+
+from ..utils.command import Command
+
+
+def memoize(fn):
+    return lru_cache(maxsize=1)(fn)
+
+
+def partition(pred, iterable):
+    # adapted from python's examples
+    t1, t2 = tee(iterable)
+    return list(filter(pred, t1)), list(filterfalse(pred, t2))
+
+
+# Taken from merge_arrow_pr.py
+def run_cmd(cmd):
+    if isinstance(cmd, six.string_types):
+        cmd = cmd.split(' ')
+
+    try:
+        output = subprocess.check_output(cmd)
+    except subprocess.CalledProcessError as e:
+        # this avoids hiding the stdout / stderr of failed processes
+        print('Command failed: %s' % cmd)
+        print('With output:')
+        print('--------------')
+        print(e.output)
+        print('--------------')
+        raise e
+
+    if isinstance(output, six.binary_type):
+        output = output.decode('utf-8')
+    return output.rstrip()
+
+
+class GoogleBenchmarkCommand(Command):
+    def __init__(self, benchmark_bin):
+        self.bin = benchmark_bin
+
+    def list_benchmarks(self, benchmark_filter=None):
+        argv = ["--benchmark_list_tests"]
+        if benchmark_filter:
+            argv.append(f"--benchmark_filter={benchmark_filter}")
+        result = self.run(*argv, stdout=subprocess.PIPE)
+        return [b for b in result.stdout]
+
+
+class Context:
+    """ Represents the runtime environment """
+
+    def __init__(self, date=None, executable=None, **kwargs):
+        self.date = date
+        self.executable = executable
+
+    @property
+    def host(self):
+        host = {
+            "hostname": gethostname(),
+            # Not sure if we should leak this.
+            "mac_address": getnode(),
+        }
+        return host
+
+    @property
+    def git(self):
+        head = run_cmd("git rev-parse HEAD")
+        # %ai: author date, ISO 8601-like format
+        fmt = "%ai"
+        timestamp = run_cmd(f"git log -1 --pretty='{fmt}' {head}")
+        branch = run_cmd("git rev-parse --abbrev-ref HEAD")
+        git_info = {
+            "git_commit_timestamp": timestamp,
+            "git_hash": head,
+            "git_branch": branch,
+        }
+        return git_info
+
+    @property
+    def toolchain(self):
+        # TODO parse local CMake generated info to extract compile flags and
+        # arrow features
+        deps = {}
+        toolchain = {
+            "language_implementation_version": "c++11",
+            "dependencies": deps,
+        }
+        return toolchain
+
+    def as_arrow(self):
+        ctx = {
+            "benchmark_language": "C++",
+            "run_timestamp": self.date,
+        }
+
+        for extra in (self.host, self.git, self.toolchain):
+            ctx.update(extra)
+
+        return ctx
+
+    @classmethod
+    def from_json(cls, version, payload):
+        return cls(**payload)
+
+
+class GoogleBenchmarkObservation:
+    """ Represents one run of a single benchmark. """
+
+    def __init__(self, version, **kwargs):
+        self._name = kwargs.get("name")
+        self.version = version
+        self.real_time = kwargs.get("real_time")
+        self.cpu_time = kwargs.get("cpu_time")
+        self.time_unit = kwargs.get("time_unit")
+        self.size = kwargs.get("size")
+        self.bytes_per_second = kwargs.get("bytes_per_second")
+
+    @property
+    def is_mean(self):
+        return self._name.endswith("_mean")
+
+    @property
+    def is_median(self):
+        return self._name.endswith("_median")
+
+    @property
+    def is_stddev(self):
+        return self._name.endswith("_stddev")
+
+    @property
+    def is_agg(self):
+        return self.is_mean or self.is_median or self.is_stddev
+
+    @property
+    def is_realtime(self):
+        return self.name.find("/realtime") != -1
+
+    @property
+    @memoize
+    def name(self):
+        name = self._name
+        return name.rsplit("_", maxsplit=1)[0] if self.is_agg else name
+
+    @property
+    def value(self):
+        """ Return the benchmark value."""
+        if self.size:
+            return self.bytes_per_second
+        return self.real_time if self.is_realtime else self.cpu_time
+
+    @property
+    def unit(self):
+        if self.size:
+            return "bytes_per_second"
+        return self.time_unit
+
+    def __str__(self):
+        return f"BenchmarkObservation[name={self.name}]"
+
+
+class BenchmarkSuite:
+    def __init__(self, name, version, runs):
+        self.name = name
+        self.version = version
+        # exclude google benchmark aggregate artifacts
+        aggs, runs = partition(lambda b: b.is_agg, runs)
+        self.runs = sorted(runs, key=lambda b: b.value)
+        self.values = [b.value for b in self.runs]
+        self.aggregates = aggs
+
+    @property
+    def n_runs(self):
+        return len(self.runs)
+
+    @property
+    @memoize
+    def mean(self):
+        maybe_mean = [b for b in self.aggregates if b.is_mean]
+        if maybe_mean:
+            return maybe_mean[0].value
+        # fallback
+        return sum(self.values) / self.n_runs
+
+    @property
+    @memoize
+    def median(self):
+        maybe_median = [b for b in self.aggregates if b.is_median]
+        if maybe_median:
+            return maybe_median[0].value
+        # fallback
+        return self.runs[int(self.n_runs / 2)].value
+
+    @property
+    @memoize
+    def stddev(self):
+        maybe_stddev = [b for b in self.aggregates if b.is_stddev]
+        if maybe_stddev:
+            return maybe_stddev[0].value
+
+        sum_diff = sum([(val - self.mean)**2 for val in self.values])
+        return (sum_diff / (self.n_runs - 1))**0.5 if self.n_runs > 1 else 0.0
+
+    @property
+    def min(self):
+        return self.values[0]
+
+    @property
+    def max(self):
+        return self.values[-1]
+
+    def quartile(self, q):
+        return self.values[int(q * self.n_runs / 4)]
+
+    @property
+    def q1(self):
+        return self.quartile(1)
+
+    @property
+    def q3(self):
+        return self.quartile(3)
+
+    @property
+    def parameters(self):
+        """ Extract parameters from Benchmark's name"""
+        def parse_param(idx, param):
+            k_v = param.split(":")
+            # nameless parameter are transformed into positional names
+            name = k_v[0] if len(k_v) > 1 else f"arg{idx}"
+            return name, k_v[-1]
+
+        params = enumerate(self.name.split("/")[1:])
+        named_params = [parse_param(idx, p) for idx, p in params if p]
+        return {k: v for k, v in named_params}
+
+    def __str__(self):
+        return f"BenchmarkSuite[name={self.name},runs={len(self.runs)}]"
+
+    @classmethod
+    def from_json(cls, version, payload):
+        def group_key(x):
+            return x.name
+
+        benchmarks = map(lambda x: BenchmarkObservation(version, **x), payload)
+        groups = groupby(sorted(benchmarks, key=group_key), group_key)
+        return [cls(k, version, list(bs)) for k, bs in groups]
diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
new file mode 100644
index 00000000000..5fb1f19086a
--- /dev/null
+++ b/dev/archery/archery/benchmark/runner.py
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import glob
+import os
+import re
+import subprocess
+
+from .core import Benchmark, BenchmarkSuite
+from .google import GoogleBenchmarkCommand
+from ..utils.logger import logger
+
+
+def regex_filter(re_expr):
+    if re_expr is None:
+        return lambda s: True
+
+    re_comp = re.compile(re_expr)
+    return lambda s: re_comp.search(s)
+
+
+class BenchmarkRunner:
+    def suites(self, suite_filter=None, benchmark_filter=None):
+        raise NotImplementedError("BenchmarkRunner must implement suites")
+
+
+class CppBenchmarkRunner(BenchmarkRunner):
+    def __init__(self, build):
+        """ Initialize a CppBenchmarkRunner. """
+        self.build = build
+        # Ensure build is ready to run benchmarks
+        self.build.all()
+
+    @property
+    def suites_binaries(self):
+        """ Returns a list of benchmark binaries for this build. """
+        glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark")
+        return {os.path.basename(b): b for b in glob.glob(glob_expr)}
+
+    def benchmarks(self, suite_bin, benchmark_filter):
+        """ Returns the resulting benchmarks for a given suite. """
+        suite_cmd = GoogleBenchmarkCommand(suite_bin)
+        benchmark_names = suite_cmd.list_benchmarks(benchmark_filter)
+
+        if not benchmark_names:
+            return []
+
+        return benchmark_names
+
+    def suites(self, suite_filter=None, benchmark_filter=None):
+        suite_matcher = regex_filter(suite_filter)
+
+        suite_and_binaries = self.suites_binaries
+        for suite_name in suite_and_binaries:
+            if suite_matcher(suite_name):
+                suite_bin = suite_and_binaries[suite_name]
+                benchmarks = self.benchmarks(suite_bin,
+                                             benchmark_filter=benchmark_filter)
+
+                # Filter may exclude all benchmarks
+                if not benchmarks:
+                    continue
+
+                yield BenchmarkSuite(suite_name, benchmarks)
+            else:
+                logger.debug(f"Ignoring suite {suite_name}")
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 90ba14478cf..426673fee7f 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -16,70 +17,37 @@
 # under the License.
 
 import click
+from contextlib import contextmanager
 import logging
 import os
-import tempfile
+import re
+from tempfile import mkdtemp, TemporaryDirectory
 
-from .lang.cpp import CppCMakeDefinition, CppConfiguration, CppBenchmarkSuite
-from .utils.git import GitClone, GitCheckout
+# from .benchmark import GoogleCompareBenchmark
+from .benchmark.runner import CppBenchmarkRunner
+from .lang.cpp import CppCMakeDefinition, CppConfiguration
+from .utils.git import Git
 from .utils.logger import logger
+from .utils.source import ArrowSources
 
 
-@click.group(name="archery")
+@click.group()
 @click.option("--debug", count=True)
-def cli(debug):
+def archery(debug):
+    """ Apache Arrow developer utilities. """
     if debug:
         logger.setLevel(logging.DEBUG)
 
 
-def contains_arrow_sources(path):
-    cpp_path = os.path.join(path, "cpp")
-    cmake_path = os.path.join(cpp_path, "CMakeLists.txt")
-    return os.path.exists(cmake_path)
-
-
-def validate_arrow_sources(ctx, param, path):
+def validate_arrow_sources(ctx, param, src):
     """ Ensure a directory contains Arrow cpp sources. """
-    if not contains_arrow_sources(path):
-        raise click.BadParameter(f"No Arrow C++ sources found in {path}.")
-    return path
-
-
-def resolve_arrow_sources():
-    """ Infer Arrow sources directory from various method.
+    if isinstance(src, str):
+        if not ArrowSources.valid(src):
+            raise click.BadParameter(f"No Arrow C++ sources found in {src}.")
+        src = ArrowSources(src)
+    return src
 
-    The following guesses are done in order:
 
-    1. Checks if the environment variable `ARROW_SRC` is defined and use this.
-       No validation is performed.
-
-    2. Checks if the current working directory (cwd) contains is an Arrow
-       source directory. If successful return this.
-
-    3. Checks if this file (cli.py) is still in the original source repository.
-       If so, returns the relative path to the source directory.
-    """
-    # Explicit via environment variable
-    arrow_env = os.environ.get("ARROW_SRC")
-    if arrow_env:
-        return arrow_env
-
-    # Implicit via cwd
-    arrow_cwd = os.getcwd()
-    if contains_arrow_sources(arrow_cwd):
-        return arrow_cwd
-
-    # Implicit via archery ran from sources, find relative sources from
-    this_dir = os.path.dirname(os.path.realpath(__file__))
-    arrow_via_archery = os.path.join(this_dir, "..", "..", "..")
-    if contains_arrow_sources(arrow_via_archery):
-        return arrow_via_archery
-
-    return None
-
-
-source_dir_type = click.Path(exists=True, dir_okay=True, file_okay=False,
-                             readable=True, resolve_path=True)
 build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True)
 # Supported build types
 build_type = click.Choice(["debug", "relwithdebinfo", "release"],
@@ -89,7 +57,10 @@ def resolve_arrow_sources():
                                case_sensitive=False)
 
 
-@cli.command()
+@archery.command(short_help="Initialize an Arrow C++ build")
+@click.option("--src", default=ArrowSources.find(),
+              callback=validate_arrow_sources,
+              help="Specify Arrow source directory")
 # toolchain
 @click.option("--cc", help="C compiler.")
 @click.option("--cxx", help="C++ compiler.")
@@ -114,21 +85,18 @@ def resolve_arrow_sources():
 @click.option("--with-flight", default=False, type=bool,
               help="Build with Flight rpc support.")
 # misc
-@click.option("-f", "--force",
+@click.option("-f", "--force", type=bool, is_flag=True, default=False,
               help="Delete existing build directory if found.")
 @click.option("--targets", type=str, multiple=True,
               help="Generator targets to run")
 @click.argument("build_dir", type=build_dir_type)
-@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources,
-                callback=validate_arrow_sources, required=False)
-def build(src_dir, build_dir, force, targets, **kwargs):
-    src_cpp = os.path.join(src_dir, "cpp")
-
+def build(src, build_dir, force, targets, **kwargs):
+    """ Build. """
     # Arrow's cpp cmake configuration
     conf = CppConfiguration(**kwargs)
     # This is a closure around cmake invocation, e.g. calling `def.build()`
     # yields a directory ready to be run with the generator
-    cmake_def = CppCMakeDefinition(src_cpp, conf)
+    cmake_def = CppCMakeDefinition(src.cpp, conf)
     # Create build directory
     build = cmake_def.build(build_dir, force=force)
 
@@ -136,19 +104,74 @@ def build(src_dir, build_dir, force, targets, **kwargs):
         build.run(target)
 
 
-@cli.command()
-@click.argument("rev_a")
-@click.argument("rev_b")
-@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources,
-                callback=validate_arrow_sources, required=False)
-def benchmark(rev_a, rev_b, src_dir):
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        root_a = os.path.join(tmp_dir, rev_a)
-        bench_a = CppBenchmarkSuite(src_dir, root_a, rev_a)
+@contextmanager
+def tmpdir(preserve, prefix="arrow-bench-"):
+    if preserve:
+        yield mkdtemp(prefix=prefix)
+    else:
+        with TemporaryDirectory(prefix=prefix) as tmp:
+            yield tmp
+
+
+DEFAULT_BENCHMARK_CONF = CppConfiguration(
+    build_type="release", with_tests=True, with_benchmarks=True)
+
+
+def arrow_cpp_benchmark_runner(src, root, rev):
+    root_rev = os.path.join(root, rev)
+    os.mkdir(root_rev)
+
+    root_src = os.path.join(root_rev, "arrow")
+    # Possibly checkout the sources at given revision
+    src_rev = src if rev == Git.WORKSPACE else src.at_revision(rev, root_src)
+
+    # TODO: find a way to pass custom configuration without cluttering the cli
+    # Ideally via a configuration file that can be shared with the
+    # `build` sub-command.
+    cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF)
+    build = cmake_def.build(os.path.join(root_rev, "build"))
+    return CppBenchmarkRunner(build)
+
+
+DEFAULT_BENCHMARK_FILTER = "^Regression"
+
+
+@archery.command(short_help="Run the C++ benchmark suite")
+@click.option("--src", default=ArrowSources.find(),
+              callback=validate_arrow_sources,
+              help="Specify Arrow source directory")
+@click.option("--preserve", type=bool, default=False, is_flag=True,
+              help="Preserve temporary workspace directory for investigation.")
+@click.option("--suite-filter", type=str, default=None,
+              help="Regex filtering benchmark suites.")
+@click.option("--benchmark-filter", type=str, default=DEFAULT_BENCHMARK_FILTER,
+              help="Regex filtering benchmark suites.")
+@click.argument("contender_rev", default=Git.WORKSPACE, required=False)
+@click.argument("baseline_rev", default="master", required=False)
+def benchmark(src, preserve, suite_filter,  benchmark_filter,
+              contender_rev, baseline_rev):
+    """ Benchmark Arrow C++ implementation.
+    """
+    with tmpdir(preserve) as root:
+        runner_cont = arrow_cpp_benchmark_runner(src, root, contender_rev)
+        runner_base = arrow_cpp_benchmark_runner(src, root, baseline_rev)
+
+        suites_cont = {s.name: s for s in runner_cont.suites(suite_filter,
+                                                             benchmark_filter)}
+        suites_base = {s.name: s for s in runner_base.suites(suite_filter,
+                                                             benchmark_filter)}
+
+        for name in suites_cont.keys() & suites_base.keys():
+            logger.debug(f"Comparing {name}")
+            contender_bin = suites_cont[name]
+            baseline_bin = suites_base[name]
 
-        root_b = os.path.join(tmp_dir, rev_b)
-        bench_b = CppBenchmarkSuite(src_dir, root_b, rev_b)
+            # Note that compare.py requires contender and baseline inverted.
+            # GoogleCompareBenchmark("benchmarks", "--display_aggregates_only",
+            #                       baseline_bin, contender_bin,
+            #                       f"--benchmark_filter={benchmark_filter}",
+            #                       "--benchmark_repetitions=20")
 
 
 if __name__ == "__main__":
-    cli()
+    archery()
diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index a4c458911eb..136dadc6b4e 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -15,9 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import glob
 import os
 
-from ..benchmark import BenchmarkSuite
+from ..benchmark.runner import BenchmarkRunner
 from ..utils.cmake import CMakeDefinition
 
 
@@ -90,22 +91,5 @@ class CppCMakeDefinition(CMakeDefinition):
     def __init__(self, source, conf, **kwargs):
         self.configuration = conf
         super().__init__(source, **kwargs,
-                         definitions=conf.definitions, env=conf.environment)
-
-
-default_benchmark_conf = CppConfiguration(
-    build_type="release", with_tests=True, with_benchmarks=True)
-
-
-class CppBenchmarkSuite(BenchmarkSuite):
-    def __init__(self, *args, conf=None, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        self.conf = conf if conf else default_benchmark_conf
-        assert self.conf.build_type is not "debug"
-        self.build_dir = os.path.join(self.root, "build")
-        self.cmake_def = CppCMakeDefinition(
-            os.path.join(self.clone_dir, "cpp"), self.conf)
-        self.build = self.cmake_def.build(self.build_dir)
-        # Build sources
-        self.build.all()
+                         definitions=conf.definitions, env=conf.environment,
+                         build_type=conf.build_type)
diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py
index 920d21ee6ed..b2e7c75b511 100644
--- a/dev/archery/archery/utils/cmake.py
+++ b/dev/archery/archery/utils/cmake.py
@@ -69,6 +69,7 @@ def __init__(self, source, build_type="release", generator=None,
               work around cmake deficiencies, e.g. CC and CXX.
         """
         self.source = os.path.abspath(source)
+        self.build_type = build_type
         self.generator = generator if generator else cmake.default_generator()
         self.definitions = definitions if definitions else []
         self.env = env
@@ -97,15 +98,16 @@ def build(self, build_dir, force=False, **kwargs):
         if os.path.exists(build_dir):
             # Extra safety to ensure we're deleting a build folder.
             if not CMakeBuild.is_build_dir(build_dir):
-                raise FileExistsError(f"{build_dir} exists but is not a cmake build, delete manually.")
+                raise FileExistsError(f"{build_dir} is not a cmake build")
             if not force:
-                raise FileExistsError(f"{build_dir} exists pass force to delete")
+                raise FileExistsError(f"{build_dir} exists use force=True")
             rmtree(build_dir)
 
         os.mkdir(build_dir)
 
         cmake(*self.arguments, cwd=build_dir, env=self.env)
-        return CMakeBuild(self, build_dir, **kwargs)
+        return CMakeBuild(build_dir, self.generator.lower(), self.build_type,
+                          definition=self, **kwargs)
 
     def __repr__(self):
         return f"CMakeDefinition[source={self.source}]"
@@ -118,7 +120,7 @@ class CMakeBuild(Command):
     user to know which generator is used.
     """
 
-    def __init__(self, definition, build_dir):
+    def __init__(self, build_dir, generator, build_type, definition=None):
         """ Initialize a CMakeBuild.
 
         The caller must ensure that cmake was invoked in the build directory.
@@ -130,18 +132,19 @@ def __init__(self, definition, build_dir):
         build_dir : str
                     The build directory to setup into.
         """
-        self.definition = definition
         self.build_dir = os.path.abspath(build_dir)
+        self.bin = generator
+        self.build_type = build_type
+        self.definition = definition
 
     @property
-    def bin(self):
-        return self.definition.generator.lower()
+    def binaries_dir(self):
+        return os.path.join(self.build_dir, self.build_type)
 
     def run(self, *argv, verbose=False, **kwargs):
         extra = []
         if verbose:
-            is_make = self.bin is "make"
-            extra.append("VERBOSE=1" if is_make else "-v")
+            extra.append("-v" if self.bin.endswith("ninja") else "VERBOSE=1")
         # Commands must be ran under the build directory
         super().run(*extra, *argv, **kwargs, cwd=self.build_dir)
         return self
@@ -165,4 +168,9 @@ def is_build_dir(path):
         return os.path.exists(cmake_cache) and os.path.exists(cmake_files)
 
     def __repr__(self):
-        return f"CMakeBuild[build={self.build_dir},definition={self.definition}]"
+        return ("CMakeBuild["
+                "build = {},"
+                "build_type = {},"
+                "definition = {}]".format(self.build_dir,
+                                          self.build_type,
+                                          self.definition))
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 214de92b7ad..77b2230bd6c 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -33,12 +33,16 @@ class Command:
     def bin(self):
         raise NotImplementedError("Command must implement bin() method")
 
-    def run(self, *argv, **kwargs):
+    def run(self, *argv, raise_on_failure=True, **kwargs):
         invocation = [find_exec(self.bin)]
         invocation.extend(argv)
 
-        logger.debug(f"Executing `{' '.join(invocation)}`")
-        return subprocess.run(invocation, **kwargs)
+        logger.debug(f"Executing `{invocation}`")
+        result = subprocess.run(invocation, **kwargs)
+        if raise_on_failure:
+            result.check_returncode()
+
+        return result
 
     def __call__(self, *argv, **kwargs):
         self.run(*argv, **kwargs)
diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py
index 4e0e8fd74d0..18c52bb07cb 100644
--- a/dev/archery/archery/utils/git.py
+++ b/dev/archery/archery/utils/git.py
@@ -21,6 +21,8 @@
 
 
 class Git(Command):
+    WORKSPACE = "WORKSPACE"
+
     def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs):
         self.bin = git_bin if git_bin else os.environ.get("GIT", "git")
         self.git_dir = git_dir
@@ -29,7 +31,7 @@ def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs):
     @property
     def global_opts(self):
         if self.git_dir:
-            # For some reason, git does not accept `-Cdir`
+            # For some reason, git does not accept `-Cdir`  or `-C=dir`
             yield "-C"
             yield self.git_dir
 
diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py
new file mode 100644
index 00000000000..24e22c552cd
--- /dev/null
+++ b/dev/archery/archery/utils/source.py
@@ -0,0 +1,128 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+
+from .git import Git, GitCheckout, GitClone
+
+
+class ArrowSources:
+    """ ArrowSources is a companion class representing a directory containing
+    Apache Arrow's sources.
+    """
+
+    def __init__(self, path):
+        """ Initialize an ArrowSources
+
+        The caller must ensure that path is valid arrow source directory (can
+        be checked with ArrowSources.valid
+
+        Parameters
+        ----------
+        path : src
+        """
+        assert isinstance(path, str) and ArrowSources.valid(path)
+        self.path = path
+
+    @property
+    def cpp(self):
+        """ Returns the cpp directory of an Arrow sources. """
+        return os.path.join(self.path, "cpp")
+
+    @property
+    def python(self):
+        """ Returns the python directory of an Arrow sources. """
+        return os.path.join(self.path, "python")
+
+    @property
+    def git_backed(self):
+        """ Indicate if the sources are backed by git. """
+        git_path = os.path.join(self.path, ".git")
+        return os.path.exists(git_path)
+
+    def at_revision(self, revision, clone_dir):
+        """ Return a copy of the current sources for a specified git revision.
+
+        This method may return the current object if no checkout is required.
+        The caller is responsible to remove the cloned repository directory.
+
+        Parameters
+        ----------
+        revision : str
+                   Revision to checkout sources at.
+        clone_dir : str
+                    Path to checkout the local clone.
+        """
+        if not self.git_backed:
+            raise ValueError(f"{self} is not backed by git")
+
+        # A local clone is required to leave the current sources intact such
+        # that build depending on said sources are not invalidated (or worse
+        # slightly affected when re-invoking the generator).
+        GitClone("--local", self.path, clone_dir)
+        GitCheckout("-b", revision, git_dir=clone_dir)
+
+        return ArrowSources(clone_dir)
+
+    @staticmethod
+    def valid(src):
+        """ Indicate if current sources are valid. """
+        if isinstance(src, ArrowSources):
+            return True
+        if isinstance(src, str):
+            cpp_path = os.path.join(src, "cpp")
+            cmake_path = os.path.join(cpp_path, "CMakeLists.txt")
+            return os.path.exists(cmake_path)
+        return False
+
+    @staticmethod
+    def find(path=None):
+        """ Infer Arrow sources directory from various method.
+
+        The following guesses are done in order until a valid match is found:
+
+        1. Checks the given optional parameter.
+
+        2. Checks if the environment variable `ARROW_SRC` is defined and use
+           this.
+
+        3. Checks if the current working directory (cwd) contains is an Arrow
+           source directory.
+
+        4. Checks if this file (cli.py) is still in the original source
+           repository. If so, returns the relative path to the source
+           directory.
+        """
+
+        # Explicit via environment
+        env = os.environ.get("ARROW_SRC")
+
+        # Implicit via cwd
+        cwd = os.getcwd()
+
+        # Implicit via current file
+        this_dir = os.path.dirname(os.path.realpath(__file__))
+        this = os.path.join(this_dir, "..", "..", "..", "..")
+
+        for p in [path, env, cwd, this]:
+            if ArrowSources.valid(p):
+                return ArrowSources(p)
+
+        return None
+
+    def __repr__(self):
+        return f"ArrowSources[{self.path}]"

From 2c0d512f89bccc8f6d0e9e951a834a4154ace4a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Fri, 5 Apr 2019 22:19:56 -0400
Subject: [PATCH 04/31] Checkpoint

---
 dev/archery/archery/benchmark/core.py   |  19 ++-
 dev/archery/archery/benchmark/google.py | 172 +++++++-----------------
 dev/archery/archery/benchmark/runner.py |  44 +++---
 dev/archery/archery/cli.py              | 151 +++++++++++++++------
 dev/archery/archery/utils/cmake.py      |  26 ++++
 dev/archery/archery/utils/command.py    |  14 ++
 dev/archery/archery/utils/git.py        |  65 ++++++---
 dev/archery/archery/utils/source.py     |   6 +-
 8 files changed, 286 insertions(+), 211 deletions(-)

diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py
index ccd2630d444..751e9773def 100644
--- a/dev/archery/archery/benchmark/core.py
+++ b/dev/archery/archery/benchmark/core.py
@@ -56,6 +56,11 @@ def __init__(self, name, benchmarks):
         self.name = name
         self.benchmarks = benchmarks
 
+    def __repr__(self):
+        name = self.name
+        benchmarks = self.benchmarks
+        return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]"
+
 
 def default_comparator(old, new, threshold=0.05):
     # negative change is better, positive is tolerable until it exceeds
@@ -63,14 +68,22 @@ def default_comparator(old, new, threshold=0.05):
     return changes(old, new) < threshold
 
 
+def changes(old, new):
+    if old == 0 and new == 0:
+        return 0.0
+    if old == 0:
+        return float(new - old) / (float(old + new) / 2)
+    return float(new - old) / abs(old)
+
+
 class BenchmarkComparator:
-    def __init__(self, baseline, contender):
-        self.baseline = baseline
+    def __init__(self, contender, baseline):
         self.contender = contender
+        self.baseline = baseline
 
     def compare(self, comparator=None):
         comparator = comparator if comparator else default_comparator
-        return comparator(self.baseline.value, self.contender.value)
+        return changes(self.baseline.value, self.contender.value)
 
     def confidence(self):
         """ Indicate if a comparison of benchmarks should be trusted. """
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index 15a65f819ec..a330a523f1e 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -15,20 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from functools import cmp_to_key, lru_cache
 from itertools import filterfalse, groupby, tee
 import json
-import six
-from socket import gethostname
 import subprocess
-import sys
-from uuid import getnode
 
+from .core import Benchmark
 from ..utils.command import Command
-
-
-def memoize(fn):
-    return lru_cache(maxsize=1)(fn)
+from ..utils.git import git
 
 
 def partition(pred, iterable):
@@ -37,40 +30,28 @@ def partition(pred, iterable):
     return list(filter(pred, t1)), list(filterfalse(pred, t2))
 
 
-# Taken from merge_arrow_pr.py
-def run_cmd(cmd):
-    if isinstance(cmd, six.string_types):
-        cmd = cmd.split(' ')
-
-    try:
-        output = subprocess.check_output(cmd)
-    except subprocess.CalledProcessError as e:
-        # this avoids hiding the stdout / stderr of failed processes
-        print('Command failed: %s' % cmd)
-        print('With output:')
-        print('--------------')
-        print(e.output)
-        print('--------------')
-        raise e
-
-    if isinstance(output, six.binary_type):
-        output = output.decode('utf-8')
-    return output.rstrip()
-
-
 class GoogleBenchmarkCommand(Command):
-    def __init__(self, benchmark_bin):
+    def __init__(self, benchmark_bin, benchmark_filter=None):
         self.bin = benchmark_bin
+        self.benchmark_filter = benchmark_filter
 
-    def list_benchmarks(self, benchmark_filter=None):
+    def list_benchmarks(self):
         argv = ["--benchmark_list_tests"]
-        if benchmark_filter:
-            argv.append(f"--benchmark_filter={benchmark_filter}")
+        if self.benchmark_filter:
+            argv.append(f"--benchmark_filter={self.benchmark_filter}")
         result = self.run(*argv, stdout=subprocess.PIPE)
         return [b for b in result.stdout]
 
+    def results(self):
+        argv = ["--benchmark_format=json", "--benchmark_repetitions=20"]
+
+        if self.benchmark_filter:
+            argv.append(f"--benchmark_filter={self.benchmark_filter}")
+
+        return json.loads(self.run(*argv, stdout=subprocess.PIPE).stdout)
 
-class Context:
+
+class GoogleContext:
     """ Represents the runtime environment """
 
     def __init__(self, date=None, executable=None, **kwargs):
@@ -88,11 +69,10 @@ def host(self):
 
     @property
     def git(self):
-        head = run_cmd("git rev-parse HEAD")
+        head = git.head()
         # %ai: author date, ISO 8601-like format
-        fmt = "%ai"
-        timestamp = run_cmd(f"git log -1 --pretty='{fmt}' {head}")
-        branch = run_cmd("git rev-parse --abbrev-ref HEAD")
+        timestamp = git.log("-1", "--pretty='%ai'", head)
+        branch = git.current_branch(),
         git_info = {
             "git_commit_timestamp": timestamp,
             "git_hash": head,
@@ -123,125 +103,59 @@ def as_arrow(self):
         return ctx
 
     @classmethod
-    def from_json(cls, version, payload):
+    def from_json(cls, payload):
         return cls(**payload)
 
 
-class GoogleBenchmarkObservation:
+class BenchmarkObservation:
     """ Represents one run of a single benchmark. """
 
-    def __init__(self, version, **kwargs):
+    def __init__(self, **kwargs):
         self._name = kwargs.get("name")
-        self.version = version
         self.real_time = kwargs.get("real_time")
         self.cpu_time = kwargs.get("cpu_time")
         self.time_unit = kwargs.get("time_unit")
         self.size = kwargs.get("size")
         self.bytes_per_second = kwargs.get("bytes_per_second")
 
-    @property
-    def is_mean(self):
-        return self._name.endswith("_mean")
-
-    @property
-    def is_median(self):
-        return self._name.endswith("_median")
-
-    @property
-    def is_stddev(self):
-        return self._name.endswith("_stddev")
-
     @property
     def is_agg(self):
-        return self.is_mean or self.is_median or self.is_stddev
+        suffixes = ["_mean", "_median", "_stddev"]
+        return any(map(lambda x: self._name.endswith(x), suffixes))
 
     @property
     def is_realtime(self):
         return self.name.find("/realtime") != -1
 
     @property
-    @memoize
     def name(self):
         name = self._name
         return name.rsplit("_", maxsplit=1)[0] if self.is_agg else name
 
+    @property
+    def time(self):
+        return self.real_time if self.is_realtime else self.cpu_time
+
     @property
     def value(self):
         """ Return the benchmark value."""
-        if self.size:
-            return self.bytes_per_second
-        return self.real_time if self.is_realtime else self.cpu_time
+        return self.bytes_per_second if self.size else self.time
 
     @property
     def unit(self):
-        if self.size:
-            return "bytes_per_second"
-        return self.time_unit
+        return "bytes_per_second" if self.size else self.time_unit
 
-    def __str__(self):
-        return f"BenchmarkObservation[name={self.name}]"
+    def __repr__(self):
+        return f"{self.value}"
 
 
-class BenchmarkSuite:
-    def __init__(self, name, version, runs):
+class GoogleBenchmark(Benchmark):
+    def __init__(self, name, runs):
         self.name = name
-        self.version = version
         # exclude google benchmark aggregate artifacts
-        aggs, runs = partition(lambda b: b.is_agg, runs)
+        _, runs = partition(lambda b: b.is_agg, runs)
         self.runs = sorted(runs, key=lambda b: b.value)
-        self.values = [b.value for b in self.runs]
-        self.aggregates = aggs
-
-    @property
-    def n_runs(self):
-        return len(self.runs)
-
-    @property
-    @memoize
-    def mean(self):
-        maybe_mean = [b for b in self.aggregates if b.is_mean]
-        if maybe_mean:
-            return maybe_mean[0].value
-        # fallback
-        return sum(self.values) / self.n_runs
-
-    @property
-    @memoize
-    def median(self):
-        maybe_median = [b for b in self.aggregates if b.is_median]
-        if maybe_median:
-            return maybe_median[0].value
-        # fallback
-        return self.runs[int(self.n_runs / 2)].value
-
-    @property
-    @memoize
-    def stddev(self):
-        maybe_stddev = [b for b in self.aggregates if b.is_stddev]
-        if maybe_stddev:
-            return maybe_stddev[0].value
-
-        sum_diff = sum([(val - self.mean)**2 for val in self.values])
-        return (sum_diff / (self.n_runs - 1))**0.5 if self.n_runs > 1 else 0.0
-
-    @property
-    def min(self):
-        return self.values[0]
-
-    @property
-    def max(self):
-        return self.values[-1]
-
-    def quartile(self, q):
-        return self.values[int(q * self.n_runs / 4)]
-
-    @property
-    def q1(self):
-        return self.quartile(1)
-
-    @property
-    def q3(self):
-        return self.quartile(3)
+        super().__init__(name, [b.value for b in self.runs])
 
     @property
     def parameters(self):
@@ -256,14 +170,18 @@ def parse_param(idx, param):
         named_params = [parse_param(idx, p) for idx, p in params if p]
         return {k: v for k, v in named_params}
 
-    def __str__(self):
-        return f"BenchmarkSuite[name={self.name},runs={len(self.runs)}]"
+    @property
+    def less_is_better(self):
+        return self.runs[0].size is None
+
+    def __repr__(self):
+        return f"GoogleBenchmark[name={self.name},runs={self.runs}]"
 
     @classmethod
-    def from_json(cls, version, payload):
+    def from_json(cls, payload):
         def group_key(x):
             return x.name
 
-        benchmarks = map(lambda x: BenchmarkObservation(version, **x), payload)
+        benchmarks = map(lambda x: BenchmarkObservation(**x), payload)
         groups = groupby(sorted(benchmarks, key=group_key), group_key)
-        return [cls(k, version, list(bs)) for k, bs in groups]
+        return [cls(k, list(bs)) for k, bs in groups]
diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
index 5fb1f19086a..13786020f4d 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -21,14 +21,13 @@
 import subprocess
 
 from .core import Benchmark, BenchmarkSuite
-from .google import GoogleBenchmarkCommand
+from .google import GoogleBenchmarkCommand, GoogleBenchmark, GoogleContext
 from ..utils.logger import logger
 
 
 def regex_filter(re_expr):
     if re_expr is None:
         return lambda s: True
-
     re_comp = re.compile(re_expr)
     return lambda s: re_comp.search(s)
 
@@ -42,39 +41,46 @@ class CppBenchmarkRunner(BenchmarkRunner):
     def __init__(self, build):
         """ Initialize a CppBenchmarkRunner. """
         self.build = build
-        # Ensure build is ready to run benchmarks
-        self.build.all()
 
     @property
     def suites_binaries(self):
         """ Returns a list of benchmark binaries for this build. """
+        # Ensure build is up-to-date to run benchmarks
+        self.build()
+        # Not the best method, but works for now
         glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark")
         return {os.path.basename(b): b for b in glob.glob(glob_expr)}
 
-    def benchmarks(self, suite_bin, benchmark_filter):
+    def suite(self, name, suite_bin, benchmark_filter):
         """ Returns the resulting benchmarks for a given suite. """
-        suite_cmd = GoogleBenchmarkCommand(suite_bin)
-        benchmark_names = suite_cmd.list_benchmarks(benchmark_filter)
+        suite_cmd = GoogleBenchmarkCommand(suite_bin, benchmark_filter)
 
+        # Ensure there will be data
+        benchmark_names = suite_cmd.list_benchmarks()
         if not benchmark_names:
-            return []
+            return None
 
-        return benchmark_names
+        results = suite_cmd.results()
+        benchmarks = GoogleBenchmark.from_json(results.get("benchmarks"))
+        return BenchmarkSuite(name, benchmarks)
 
     def suites(self, suite_filter=None, benchmark_filter=None):
+        """ Returns all suite for a runner. """
         suite_matcher = regex_filter(suite_filter)
 
         suite_and_binaries = self.suites_binaries
         for suite_name in suite_and_binaries:
-            if suite_matcher(suite_name):
-                suite_bin = suite_and_binaries[suite_name]
-                benchmarks = self.benchmarks(suite_bin,
-                                             benchmark_filter=benchmark_filter)
+            if not suite_matcher(suite_name):
+                logger.debug(f"Ignoring suite {suite_name}")
+                continue
 
-                # Filter may exclude all benchmarks
-                if not benchmarks:
-                    continue
+            suite_bin = suite_and_binaries[suite_name]
+            suite = self.suite(suite_name, suite_bin,
+                               benchmark_filter=benchmark_filter)
 
-                yield BenchmarkSuite(suite_name, benchmarks)
-            else:
-                logger.debug(f"Ignoring suite {suite_name}")
+            # Filter may exclude all benchmarks
+            if not suite:
+                logger.debug(f"Suite {suite_name} executed but no results")
+                continue
+
+            yield suite
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 426673fee7f..b27357a582d 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -23,9 +23,10 @@
 import re
 from tempfile import mkdtemp, TemporaryDirectory
 
-# from .benchmark import GoogleCompareBenchmark
+from .benchmark.core import BenchmarkComparator
 from .benchmark.runner import CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
+from .utils.cmake import CMakeBuild
 from .utils.git import Git
 from .utils.logger import logger
 from .utils.source import ArrowSources
@@ -33,8 +34,12 @@
 
 @click.group()
 @click.option("--debug", count=True)
-def archery(debug):
+@click.pass_context
+def archery(ctx, debug):
     """ Apache Arrow developer utilities. """
+    # Ensure ctx.obj exists
+    ctx.ensure_object(dict)
+    ctx.obj["DEBUG"] = debug
     if debug:
         logger.setLevel(logging.DEBUG)
 
@@ -58,7 +63,7 @@ def validate_arrow_sources(ctx, param, src):
 
 
 @archery.command(short_help="Initialize an Arrow C++ build")
-@click.option("--src", default=ArrowSources.find(),
+@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(),
               callback=validate_arrow_sources,
               help="Specify Arrow source directory")
 # toolchain
@@ -90,7 +95,8 @@ def validate_arrow_sources(ctx, param, src):
 @click.option("--targets", type=str, multiple=True,
               help="Generator targets to run")
 @click.argument("build_dir", type=build_dir_type)
-def build(src, build_dir, force, targets, **kwargs):
+@click.pass_context
+def build(ctx, src, build_dir, force, targets, **kwargs):
     """ Build. """
     # Arrow's cpp cmake configuration
     conf = CppConfiguration(**kwargs)
@@ -117,61 +123,130 @@ def tmpdir(preserve, prefix="arrow-bench-"):
     build_type="release", with_tests=True, with_benchmarks=True)
 
 
-def arrow_cpp_benchmark_runner(src, root, rev):
-    root_rev = os.path.join(root, rev)
-    os.mkdir(root_rev)
+def cpp_runner_from_rev_or_path(src, root, rev_or_path):
+    build = None
+    if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path):
+        build = CMakeBuild.from_path(rev_or_path)
+    else:
+        root_rev = os.path.join(root, rev_or_path)
+        os.mkdir(root_rev)
+
+        # Possibly checkout the sources at given revision
+        src_rev = src
+        if rev_or_path != Git.WORKSPACE:
+            root_src = os.path.join(root_rev, "arrow")
+            src_rev = src.at_revision(rev_or_path, root_src)
 
-    root_src = os.path.join(root_rev, "arrow")
-    # Possibly checkout the sources at given revision
-    src_rev = src if rev == Git.WORKSPACE else src.at_revision(rev, root_src)
+        # TODO: find a way to pass custom configuration without cluttering
+        # the cli. Ideally via a configuration file that can be shared with the
+        # `build` sub-command.
+        cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF)
+        build = cmake_def.build(os.path.join(root_rev, "build"))
 
-    # TODO: find a way to pass custom configuration without cluttering the cli
-    # Ideally via a configuration file that can be shared with the
-    # `build` sub-command.
-    cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF)
-    build = cmake_def.build(os.path.join(root_rev, "build"))
     return CppBenchmarkRunner(build)
 
 
 DEFAULT_BENCHMARK_FILTER = "^Regression"
 
 
-@archery.command(short_help="Run the C++ benchmark suite")
-@click.option("--src", default=ArrowSources.find(),
+@archery.group()
+@click.pass_context
+def benchmark(ctx):
+    """ Arrow benchmarking.
+
+    Use the diff sub-command to benchmake revisions, and/or build directories.
+    """
+    pass
+
+
+@benchmark.command(name="diff", short_help="Run the C++ benchmark suite")
+@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(),
               callback=validate_arrow_sources,
               help="Specify Arrow source directory")
-@click.option("--preserve", type=bool, default=False, is_flag=True,
-              help="Preserve temporary workspace directory for investigation.")
-@click.option("--suite-filter", type=str, default=None,
+@click.option("--suite-filter", metavar="<regex>", type=str, default=None,
               help="Regex filtering benchmark suites.")
-@click.option("--benchmark-filter", type=str, default=DEFAULT_BENCHMARK_FILTER,
+@click.option("--benchmark-filter", metavar="<regex>", type=str,
+              default=DEFAULT_BENCHMARK_FILTER,
               help="Regex filtering benchmark suites.")
-@click.argument("contender_rev", default=Git.WORKSPACE, required=False)
-@click.argument("baseline_rev", default="master", required=False)
-def benchmark(src, preserve, suite_filter,  benchmark_filter,
-              contender_rev, baseline_rev):
-    """ Benchmark Arrow C++ implementation.
+@click.option("--preserve", type=bool, default=False, is_flag=True,
+              help="Preserve temporary workspace directory for investigation.")
+@click.argument("contender", metavar="[<contender>", default=Git.WORKSPACE,
+                required=False)
+@click.argument("baseline", metavar="[<baseline>]]", default="master",
+                required=False)
+@click.pass_context
+def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
+                   contender, baseline):
+    """ Compare (diff) benchmark runs.
+
+    This command acts like git-diff but for benchmark results.
+
+    The caller can optionally specify both the contender and the baseline. If
+    unspecified, the contender will default to the current workspace (like git)
+    and the baseline will default to master.
+
+    Each target (contender or baseline) can either be a git object reference
+    (commit, tag, special values like HEAD) or a cmake build directory. This
+    allow comparing git commits, and/or different compilers and/or compiler
+    flags.
+
+    When a commit is referenced, a local clone of the arrow sources (specified
+    via --src) is performed and the proper branch is created. This is done in
+    a temporary directory which can be left intact with the `---preserve` flag.
+
+    The special token "WORKSPACE" is reserved to specify the current git
+    workspace. This imply that no clone will be performed.
+
+    Examples:
+
+    \b
+    # Compare workspace (contender) with master (baseline)
+    archery benchmark diff
+
+    \b
+    # Compare master (contender) with latest version (baseline)
+    export LAST=$(git tag -l "apache-arrow-[0-9]*" | sort -rV | head -1)
+    archery benchmark diff master "$LAST"
+
+    \b
+    # Compare g++7 (contender) with clang++-7 (baseline) builds
+    archery build --with-benchmarks=true \\
+            --cc=gcc7 --cxx=g++7 gcc7-build
+    archery build --with-benchmarks=true \\
+            --cc=clang-7 --cxx=clang++-7 clang7-build
+    archery benchmark diff gcc7-build clang7-build
+
+    \b
+    # Compare default but only scoped to the suites matching `aggregate` and
+    # the benchmarks matching `Kernel`.
+    archery benchmark diff --suite-filter=aggregate --benchmark-filter=Kernel
     """
     with tmpdir(preserve) as root:
-        runner_cont = arrow_cpp_benchmark_runner(src, root, contender_rev)
-        runner_base = arrow_cpp_benchmark_runner(src, root, baseline_rev)
+        runner_cont = cpp_runner_from_rev_or_path(src, root, contender)
+        runner_base = cpp_runner_from_rev_or_path(src, root, baseline)
 
         suites_cont = {s.name: s for s in runner_cont.suites(suite_filter,
                                                              benchmark_filter)}
         suites_base = {s.name: s for s in runner_base.suites(suite_filter,
                                                              benchmark_filter)}
 
-        for name in suites_cont.keys() & suites_base.keys():
-            logger.debug(f"Comparing {name}")
-            contender_bin = suites_cont[name]
-            baseline_bin = suites_base[name]
+        for suite_name in suites_cont.keys() & suites_base.keys():
+            logger.debug(f"Comparing {suite_name}")
+
+            suite_cont = {
+                b.name: b for b in suites_cont[suite_name].benchmarks}
+            suite_base = {
+                b.name: b for b in suites_base[suite_name].benchmarks}
+
+            for bench_name in suite_cont.keys() & suite_base.keys():
+                logger.debug(f"Comparing {bench_name}")
+
+                bench_cont = suite_cont[bench_name]
+                bench_base = suite_base[bench_name]
 
-            # Note that compare.py requires contender and baseline inverted.
-            # GoogleCompareBenchmark("benchmarks", "--display_aggregates_only",
-            #                       baseline_bin, contender_bin,
-            #                       f"--benchmark_filter={benchmark_filter}",
-            #                       "--benchmark_repetitions=20")
+                comp = BenchmarkComparator(bench_cont, bench_base)
+                print(comp.compare())
 
 
 if __name__ == "__main__":
-    archery()
+    archery(obj={})
diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py
index b2e7c75b511..a549f07b8b4 100644
--- a/dev/archery/archery/utils/cmake.py
+++ b/dev/archery/archery/utils/cmake.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import os
+import re
 from shutil import rmtree, which
 
 from .command import Command
@@ -113,6 +114,9 @@ def __repr__(self):
         return f"CMakeDefinition[source={self.source}]"
 
 
+CMAKE_BUILD_TYPE_RE = re.compile("CMAKE_BUILD_TYPE:STRING=([a-zA-Z]+)")
+
+
 class CMakeBuild(Command):
     """ CMakeBuild represents a build directory initialized by cmake.
 
@@ -132,6 +136,7 @@ def __init__(self, build_dir, generator, build_type, definition=None):
         build_dir : str
                     The build directory to setup into.
         """
+        assert CMakeBuild.is_build_dir(build_dir)
         self.build_dir = os.path.abspath(build_dir)
         self.bin = generator
         self.build_type = build_type
@@ -167,6 +172,27 @@ def is_build_dir(path):
         cmake_files = os.path.join(path, "CMakeFiles")
         return os.path.exists(cmake_cache) and os.path.exists(cmake_files)
 
+    @staticmethod
+    def from_path(path):
+        if not CMakeBuild.is_build_dir(path):
+            raise ValueError(f"Not a valid CMakeBuild path: {path}")
+
+        generator = "make"
+        if os.path.exists(os.path.join(path, "build.ninja")):
+            generator = "ninja"
+
+        build_type = None
+        # Infer build_type by looking at CMakeCache.txt and looking for a magic
+        # definition
+        cmake_cache_path = os.path.join(path, "CMakeCache.txt")
+        with open(cmake_cache_path, "r") as cmake_cache:
+            candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read())
+            if len(candidates) != 1:
+                ValueError("Could not chose build_type from {candidates}")
+            build_type = candidates[0].lower()
+
+        return CMakeBuild(path, generator, build_type)
+
     def __repr__(self):
         return ("CMakeBuild["
                 "build = {},"
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 77b2230bd6c..a9f2bb7a393 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -29,6 +29,20 @@ def find_exec(executable):
     return shutil.which(executable)
 
 
+# Decorator running a command and returning stdout
+class capture_stdout:
+    def __init__(self, strip=False):
+        self.strip = strip
+
+    def __call__(self, f):
+        def strip_it(x):
+            return x.strip() if self.strip else x
+
+        def wrapper(*argv, **kwargs):
+            return strip_it(fn(*argv, **kwargs, stdout=subprocess.PIPE).stdout)
+        return wrapper
+
+
 class Command:
     def bin(self):
         raise NotImplementedError("Command must implement bin() method")
diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py
index 18c52bb07cb..2d62eeb31bd 100644
--- a/dev/archery/archery/utils/git.py
+++ b/dev/archery/archery/utils/git.py
@@ -17,37 +17,60 @@
 
 import os
 
-from .command import Command
+from .command import Command, capture_stdout
+
+
+# Decorator prepending argv with the git sub-command found with the method
+# name.
+def git_cmd(fn):
+    # function name is the subcommand
+    sub_cmd = fn.__name__.replace("_", "-")
+
+    def wrapper(self, *argv, **kwargs):
+        return fn(self, sub_cmd, *argv, **kwargs)
+    return wrapper
 
 
 class Git(Command):
     WORKSPACE = "WORKSPACE"
+    HEAD = "HEAD"
 
-    def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs):
+    def __init__(self, git_bin=None):
         self.bin = git_bin if git_bin else os.environ.get("GIT", "git")
-        self.git_dir = git_dir
-        self.run(*argv, **kwargs)
 
-    @property
-    def global_opts(self):
-        if self.git_dir:
-            # For some reason, git does not accept `-Cdir`  or `-C=dir`
-            yield "-C"
-            yield self.git_dir
+    def run_cmd(self, cmd, *argv, git_dir=None, **kwargs):
+        """ Inject flags before sub-command in argv. """
+        opts = []
+        if git_dir and isinstance(git_dir, str):
+            opts.extend(("-C", git_dir))
+
+        return self.run(*opts, cmd, *argv, **kwargs)
+
+    @git_cmd
+    def clone(self, *argv, **kwargs):
+        return self.run_cmd(*argv, **kwargs)
+
+    @git_cmd
+    def checkout(self, *argv, **kwargs):
+        return self.run_cmd(*argv, **kwargs)
 
+    @git_cmd
+    def log(self, *argv, **kwargs):
+        return self.run_cmd(*argv, **kwargs)
 
-class GitSubCommand(Git):
-    def run(self, *argv, **kwargs):
-        super().run(*self.global_opts, self.cmd, *argv, **kwargs)
+    @git_cmd
+    def rev_parse(self, *argv, **kwargs):
+        print(self.head())
+        return self.run_cmd(*argv, **kwargs)
 
+    @capture_stdout(strip=True)
+    def head(self, **kwargs):
+        """ Return commit pointed by HEAD. """
+        return self.rev_parse(Git.HEAD, **kwargs)
 
-class GitClone(GitSubCommand):
-    @property
-    def cmd(self):
-        return "clone"
+    @capture_stdout(strip=True)
+    def current_branch(self, **kwargs):
+        return self.rev_parse("--abbrev-ref", Git.HEAD, **kwargs)
 
 
-class GitCheckout(GitSubCommand):
-    @property
-    def cmd(self):
-        return "checkout"
+git = Git()
diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py
index 24e22c552cd..8f75b8f2675 100644
--- a/dev/archery/archery/utils/source.py
+++ b/dev/archery/archery/utils/source.py
@@ -17,7 +17,7 @@
 
 import os
 
-from .git import Git, GitCheckout, GitClone
+from .git import git
 
 
 class ArrowSources:
@@ -73,8 +73,8 @@ def at_revision(self, revision, clone_dir):
         # A local clone is required to leave the current sources intact such
         # that build depending on said sources are not invalidated (or worse
         # slightly affected when re-invoking the generator).
-        GitClone("--local", self.path, clone_dir)
-        GitCheckout("-b", revision, git_dir=clone_dir)
+        git.clone("--local", self.path, clone_dir)
+        git.checkout("-b", revision, git_dir=clone_dir)
 
         return ArrowSources(clone_dir)
 

From 703cf987abe44e40b6e6179b9c17d395fe2c9529 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 9 Apr 2019 09:12:48 -0400
Subject: [PATCH 05/31] commit

---
 dev/archery/archery/benchmark/core.py   | 50 ++++++++++++++++------
 dev/archery/archery/benchmark/google.py | 10 ++++-
 dev/archery/archery/cli.py              | 57 ++++++++++++++++++-------
 dev/archery/archery/utils/command.py    | 12 +++++-
 dev/archery/archery/utils/logger.py     |  4 ++
 dev/archery/archery/utils/source.py     |  2 +-
 6 files changed, 103 insertions(+), 32 deletions(-)

diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py
index 751e9773def..3cb4d9568ba 100644
--- a/dev/archery/archery/benchmark/core.py
+++ b/dev/archery/archery/benchmark/core.py
@@ -28,7 +28,21 @@ def __init__(self, min, max, mean, stddev, q1, median, q3):
 
     @staticmethod
     def from_values(values):
-        return None
+        sorted(values)
+        n = len(values)
+        mean = sum(values) / len(values)
+        sum_diff = sum([(val - mean)**2 for val in values])
+        stddev = (sum_diff / (n - 1))**0.5 if n > 1 else 0.0
+
+        return Statistics(
+            values[0],
+            values[-1],
+            mean,
+            stddev,
+            values[int(n/4)],
+            values[int(n/2)],
+            values[int((3*n)/4)],
+        )
 
 
 class Benchmark:
@@ -36,19 +50,18 @@ def __init__(self, name, values, stats=None):
         self.name = name
         self.values = values
         self.statistics = stats if stats else Statistics.from_values(values)
-        self.less_is_better = less_is_better
 
     @property
     def value(self):
         return self.statistics.median
 
+    @property
+    def unit(self):
+        return None
 
-def changes(old, new):
-    if old == 0 and new == 0:
-        return 0.0
-    if old == 0:
-        return float(new - old) / (float(old + new) / 2)
-    return float(new - old) / abs(old)
+    @property
+    def less_is_better(self):
+        return True
 
 
 class BenchmarkSuite:
@@ -62,10 +75,10 @@ def __repr__(self):
         return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]"
 
 
-def default_comparator(old, new, threshold=0.05):
+def regress(change, threshold=0.05):
     # negative change is better, positive is tolerable until it exceeds
     # threshold
-    return changes(old, new) < threshold
+    return change > threshold
 
 
 def changes(old, new):
@@ -82,8 +95,21 @@ def __init__(self, contender, baseline):
         self.baseline = baseline
 
     def compare(self, comparator=None):
-        comparator = comparator if comparator else default_comparator
-        return changes(self.baseline.value, self.contender.value)
+        base = self.baseline.value
+        cont = self.contender.value
+        change = changes(base, cont)
+        adjusted_change = change
+        if not self.baseline.less_is_better:
+            adjusted_change = change * -1.0
+        return {
+            "benchmark": self.baseline.name,
+            "change": change,
+            "regression": regress(adjusted_change),
+            "baseline": base,
+            "contender": cont,
+            "unit": self.baseline.unit,
+            "less_is_better": self.baseline.less_is_better,
+        }
 
     def confidence(self):
         """ Indicate if a comparison of benchmarks should be trusted. """
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index a330a523f1e..ce2d060ccb2 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -39,7 +39,8 @@ def list_benchmarks(self):
         argv = ["--benchmark_list_tests"]
         if self.benchmark_filter:
             argv.append(f"--benchmark_filter={self.benchmark_filter}")
-        result = self.run(*argv, stdout=subprocess.PIPE)
+        result = self.run(*argv, stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE)
         return [b for b in result.stdout]
 
     def results(self):
@@ -48,7 +49,8 @@ def results(self):
         if self.benchmark_filter:
             argv.append(f"--benchmark_filter={self.benchmark_filter}")
 
-        return json.loads(self.run(*argv, stdout=subprocess.PIPE).stdout)
+        return json.loads(self.run(*argv, stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE).stdout)
 
 
 class GoogleContext:
@@ -170,6 +172,10 @@ def parse_param(idx, param):
         named_params = [parse_param(idx, p) for idx, p in params if p]
         return {k: v for k, v in named_params}
 
+    @property
+    def unit(self):
+        return self.runs[0].unit
+
     @property
     def less_is_better(self):
         return self.runs[0].size is None
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index b27357a582d..07936e32179 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -23,23 +23,34 @@
 import re
 from tempfile import mkdtemp, TemporaryDirectory
 
+from .utils.logger import logger
+import archery.utils.logger as log
+
 from .benchmark.core import BenchmarkComparator
 from .benchmark.runner import CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
 from .utils.cmake import CMakeBuild
 from .utils.git import Git
-from .utils.logger import logger
 from .utils.source import ArrowSources
 
 
 @click.group()
-@click.option("--debug", count=True)
+@click.option("--debug", type=bool, is_flag=True, default=False,
+              help="Increase logging with debugging output.")
+@click.option("--quiet", type=bool, is_flag=True, default=False,
+              help="Silence executed commands.")
 @click.pass_context
-def archery(ctx, debug):
+def archery(ctx, debug, quiet):
     """ Apache Arrow developer utilities. """
     # Ensure ctx.obj exists
     ctx.ensure_object(dict)
-    ctx.obj["DEBUG"] = debug
+
+    print("Init")
+    print(id(log.quiet))
+    print(log.quiet)
+    log.quiet = quiet
+    print(log.quiet)
+
     if debug:
         logger.setLevel(logging.DEBUG)
 
@@ -160,16 +171,17 @@ def benchmark(ctx):
 
 
 @benchmark.command(name="diff", short_help="Run the C++ benchmark suite")
-@click.option("--src", metavar="<arrow_src>", default=ArrowSources.find(),
+@click.option("--src", metavar="<arrow_src>", show_default=True,
+              default=ArrowSources.find(),
               callback=validate_arrow_sources,
               help="Specify Arrow source directory")
-@click.option("--suite-filter", metavar="<regex>", type=str, default=None,
+@click.option("--suite-filter", metavar="<regex>", show_default=True,
+              type=str, default=None, help="Regex filtering benchmark suites.")
+@click.option("--benchmark-filter", metavar="<regex>", show_default=True,
+              type=str, default=DEFAULT_BENCHMARK_FILTER,
               help="Regex filtering benchmark suites.")
-@click.option("--benchmark-filter", metavar="<regex>", type=str,
-              default=DEFAULT_BENCHMARK_FILTER,
-              help="Regex filtering benchmark suites.")
-@click.option("--preserve", type=bool, default=False, is_flag=True,
-              help="Preserve temporary workspace directory for investigation.")
+@click.option("--preserve", type=bool, default=False, show_default=True,
+              is_flag=True, help="Preserve workspace for investigation.")
 @click.argument("contender", metavar="[<contender>", default=Git.WORKSPACE,
                 required=False)
 @click.argument("baseline", metavar="[<baseline>]]", default="master",
@@ -201,25 +213,33 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
 
     \b
     # Compare workspace (contender) with master (baseline)
+    \b
     archery benchmark diff
 
     \b
     # Compare master (contender) with latest version (baseline)
+    \b
     export LAST=$(git tag -l "apache-arrow-[0-9]*" | sort -rV | head -1)
+    \b
     archery benchmark diff master "$LAST"
 
     \b
     # Compare g++7 (contender) with clang++-7 (baseline) builds
+    \b
     archery build --with-benchmarks=true \\
             --cc=gcc7 --cxx=g++7 gcc7-build
+    \b
     archery build --with-benchmarks=true \\
             --cc=clang-7 --cxx=clang++-7 clang7-build
+    \b
     archery benchmark diff gcc7-build clang7-build
 
     \b
-    # Compare default but only scoped to the suites matching `aggregate` and
-    # the benchmarks matching `Kernel`.
-    archery benchmark diff --suite-filter=aggregate --benchmark-filter=Kernel
+    # Compare default targets but scoped to the suites matching
+    # `^arrow-compute-aggregate` and benchmarks matching `(Sum|Mean)Kernel`.
+    \b
+    archery benchmark diff --suite-filter="^arrow-compute-aggregate" \\
+            --benchmark-filter="(Sum|Mean)Kernel"
     """
     with tmpdir(preserve) as root:
         runner_cont = cpp_runner_from_rev_or_path(src, root, contender)
@@ -244,8 +264,13 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
                 bench_cont = suite_cont[bench_name]
                 bench_base = suite_base[bench_name]
 
-                comp = BenchmarkComparator(bench_cont, bench_base)
-                print(comp.compare())
+                comparator = BenchmarkComparator(bench_cont, bench_base)
+                comparison = comparator()
+
+                comparison["suite"] = suite_name
+                comparison["benchmark"] = bench_name
+
+                print(comparison)
 
 
 if __name__ == "__main__":
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index a9f2bb7a393..6a8955b5adc 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -19,7 +19,7 @@
 import shutil
 import subprocess
 
-from .logger import logger
+from .logger import logger, quiet
 
 
 def find_exec(executable):
@@ -51,6 +51,16 @@ def run(self, *argv, raise_on_failure=True, **kwargs):
         invocation = [find_exec(self.bin)]
         invocation.extend(argv)
 
+        print("Within command module")
+        print(id(quiet))
+        print(quiet)
+
+        if "stdout" not in kwargs and quiet:
+            kwargs["stdout"] = subprocess.PIPE
+
+        if "stderr" not in kwargs and quiet:
+            kwargs["stderr"] = subprocess.PIPE
+
         logger.debug(f"Executing `{invocation}`")
         result = subprocess.run(invocation, **kwargs)
         if raise_on_failure:
diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py
index ab97204decb..951221ddfc8 100644
--- a/dev/archery/archery/utils/logger.py
+++ b/dev/archery/archery/utils/logger.py
@@ -17,5 +17,9 @@
 
 import logging
 
+
+quiet = False
+
+""" Global logger. """
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("archery")
diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py
index 8f75b8f2675..6ff05fb5498 100644
--- a/dev/archery/archery/utils/source.py
+++ b/dev/archery/archery/utils/source.py
@@ -125,4 +125,4 @@ def find(path=None):
         return None
 
     def __repr__(self):
-        return f"ArrowSources[{self.path}]"
+        return f"{self.path}"

From c85661cf3a6a773377564f2ab62682fcd53a6693 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 10:42:54 -0400
Subject: [PATCH 06/31] Add documentation

---
 .../compute/kernels/aggregate-benchmark.cc    |   4 +-
 dev/archery/archery/cli.py                    |  60 ++++++---
 dev/archery/archery/utils/command.py          |  35 +++--
 dev/archery/archery/utils/logger.py           |  11 +-
 dev/archery/setup.py                          |  40 ++++++
 docs/source/developers/benchmarks.rst         | 123 ++++++++++++++++++
 6 files changed, 230 insertions(+), 43 deletions(-)
 create mode 100644 dev/archery/setup.py
 create mode 100644 docs/source/developers/benchmarks.rst

diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
index e81f87947b0..bea2656d73e 100644
--- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
@@ -309,7 +309,7 @@ BENCHMARK_TEMPLATE(BenchSum, SumBitmapNaive<int64_t>)->Apply(BenchmarkSetArgs);
 BENCHMARK_TEMPLATE(BenchSum, SumBitmapReader<int64_t>)->Apply(BenchmarkSetArgs);
 BENCHMARK_TEMPLATE(BenchSum, SumBitmapVectorizeUnroll<int64_t>)->Apply(BenchmarkSetArgs);
 
-static void BenchSumKernel(benchmark::State& state) {
+static void RegressionSumKernel(benchmark::State& state) {
   const int64_t array_size = state.range(0) / sizeof(int64_t);
   const double null_percent = static_cast<double>(state.range(1)) / 100.0;
   auto rand = random::RandomArrayGenerator(1923);
@@ -328,7 +328,7 @@ static void BenchSumKernel(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t));
 }
 
-BENCHMARK(BenchSumKernel)->Apply(BenchmarkSetArgs);
+BENCHMARK(RegressionSumKernel)->Apply(SetArgs);
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 07936e32179..9e271843de9 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -18,39 +18,38 @@
 
 import click
 from contextlib import contextmanager
+import json
 import logging
 import os
 import re
 from tempfile import mkdtemp, TemporaryDirectory
 
-from .utils.logger import logger
-import archery.utils.logger as log
 
 from .benchmark.core import BenchmarkComparator
 from .benchmark.runner import CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
 from .utils.cmake import CMakeBuild
 from .utils.git import Git
+from .utils.logger import logger, ctx as log_ctx
 from .utils.source import ArrowSources
 
 
 @click.group()
 @click.option("--debug", type=bool, is_flag=True, default=False,
               help="Increase logging with debugging output.")
-@click.option("--quiet", type=bool, is_flag=True, default=False,
+@click.option("-q", "--quiet", type=bool, is_flag=True, default=False,
               help="Silence executed commands.")
 @click.pass_context
 def archery(ctx, debug, quiet):
-    """ Apache Arrow developer utilities. """
+    """ Apache Arrow developer utilities.
+
+    See sub-commands help with `archery <cmd> --help`.
+
+    """
     # Ensure ctx.obj exists
     ctx.ensure_object(dict)
 
-    print("Init")
-    print(id(log.quiet))
-    print(log.quiet)
-    log.quiet = quiet
-    print(log.quiet)
-
+    log_ctx.quiet = quiet
     if debug:
         logger.setLevel(logging.DEBUG)
 
@@ -78,8 +77,8 @@ def validate_arrow_sources(ctx, param, src):
               callback=validate_arrow_sources,
               help="Specify Arrow source directory")
 # toolchain
-@click.option("--cc", help="C compiler.")
-@click.option("--cxx", help="C++ compiler.")
+@click.option("--cc", metavar="<compiler>", help="C compiler.")
+@click.option("--cxx", metavar="<compiler>", help="C++ compiler.")
 @click.option("--cxx_flags", help="C++ compiler flags.")
 @click.option("--build-type", default="release", type=build_type,
               help="CMake's CMAKE_BUILD_TYPE")
@@ -108,7 +107,27 @@ def validate_arrow_sources(ctx, param, src):
 @click.argument("build_dir", type=build_dir_type)
 @click.pass_context
 def build(ctx, src, build_dir, force, targets, **kwargs):
-    """ Build. """
+    """ Initialize a C++ build directory.
+
+    The build command creates a directory initialized with Arrow's cpp source
+    cmake configuration. It can also optionally invoke the generator to test
+    the build (and used in scripts).
+
+    Note that archery will carry the caller environment. It will also not touch
+    an existing directory, one must use the `--force` option to remove the
+    existing directory.
+
+    Examples:
+
+    \b
+    # Initialize build with clang7 and avx2 support in directory `clang7-build`
+    \b
+    archery build --cc=clang-7 --cxx=clang++-7 --cxx_flags=-mavx2 clang7-build
+
+    \b
+    # Builds and run test
+    archery build --targets=all --targets=test build
+    """
     # Arrow's cpp cmake configuration
     conf = CppConfiguration(**kwargs)
     # This is a closure around cmake invocation, e.g. calling `def.build()`
@@ -187,7 +206,7 @@ def benchmark(ctx):
 @click.argument("baseline", metavar="[<baseline>]]", default="master",
                 required=False)
 @click.pass_context
-def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
+def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
                    contender, baseline):
     """ Compare (diff) benchmark runs.
 
@@ -227,9 +246,11 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
     # Compare g++7 (contender) with clang++-7 (baseline) builds
     \b
     archery build --with-benchmarks=true \\
-            --cc=gcc7 --cxx=g++7 gcc7-build
+            --cxx_flags=-ftree-vectorize \\
+            --cc=gcc-7 --cxx=g++-7 gcc7-build
     \b
     archery build --with-benchmarks=true \\
+            --cxx_flags=-flax-vector-conversions \\
             --cc=clang-7 --cxx=clang++-7 clang7-build
     \b
     archery benchmark diff gcc7-build clang7-build
@@ -242,6 +263,9 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
             --benchmark-filter="(Sum|Mean)Kernel"
     """
     with tmpdir(preserve) as root:
+        logger.debug(f"Comparing {contender} (contender) with "
+                     f"{baseline} (baseline)")
+
         runner_cont = cpp_runner_from_rev_or_path(src, root, contender)
         runner_base = cpp_runner_from_rev_or_path(src, root, baseline)
 
@@ -250,7 +274,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
         suites_base = {s.name: s for s in runner_base.suites(suite_filter,
                                                              benchmark_filter)}
 
-        for suite_name in suites_cont.keys() & suites_base.keys():
+        for suite_name in sorted(suites_cont.keys() & suites_base.keys()):
             logger.debug(f"Comparing {suite_name}")
 
             suite_cont = {
@@ -258,7 +282,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
             suite_base = {
                 b.name: b for b in suites_base[suite_name].benchmarks}
 
-            for bench_name in suite_cont.keys() & suite_base.keys():
+            for bench_name in sorted(suite_cont.keys() & suite_base.keys()):
                 logger.debug(f"Comparing {bench_name}")
 
                 bench_cont = suite_cont[bench_name]
@@ -270,7 +294,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter,  benchmark_filter,
                 comparison["suite"] = suite_name
                 comparison["benchmark"] = bench_name
 
-                print(comparison)
+                print(json.dumps(comparison))
 
 
 if __name__ == "__main__":
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 6a8955b5adc..4e05487221e 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -19,14 +19,12 @@
 import shutil
 import subprocess
 
-from .logger import logger, quiet
+from .logger import logger, ctx
 
 
 def find_exec(executable):
-    if os.path.exists(executable):
-        return executable
-
-    return shutil.which(executable)
+    exec_exists = os.path.exists(executable)
+    return executable if exec_exists else shutil.which(executable)
 
 
 # Decorator running a command and returning stdout
@@ -39,7 +37,9 @@ def strip_it(x):
             return x.strip() if self.strip else x
 
         def wrapper(*argv, **kwargs):
-            return strip_it(fn(*argv, **kwargs, stdout=subprocess.PIPE).stdout)
+            # Ensure stdout is captured
+            kwargs["stdout"] = subprocess.PIPE
+            return strip_it(fn(*argv, **kwargs).stdout)
         return wrapper
 
 
@@ -47,26 +47,21 @@ class Command:
     def bin(self):
         raise NotImplementedError("Command must implement bin() method")
 
-    def run(self, *argv, raise_on_failure=True, **kwargs):
+    def run(self, *argv, **kwargs):
         invocation = [find_exec(self.bin)]
         invocation.extend(argv)
 
-        print("Within command module")
-        print(id(quiet))
-        print(quiet)
+        for key in ["stdout", "stderr"]:
+            # Preserve caller intention, otherwise silence
+            if key not in kwargs and ctx.quiet:
+                kwargs["key"] = subprocess.PIPE
 
-        if "stdout" not in kwargs and quiet:
-            kwargs["stdout"] = subprocess.PIPE
-
-        if "stderr" not in kwargs and quiet:
-            kwargs["stderr"] = subprocess.PIPE
+        # Prefer safe by default
+        if "check" not in kwargs:
+            kwargs["check"] = True
 
         logger.debug(f"Executing `{invocation}`")
-        result = subprocess.run(invocation, **kwargs)
-        if raise_on_failure:
-            result.check_returncode()
-
-        return result
+        return subprocess.run(invocation, **kwargs)
 
     def __call__(self, *argv, **kwargs):
         self.run(*argv, **kwargs)
diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py
index 951221ddfc8..0a4a99edb2f 100644
--- a/dev/archery/archery/utils/logger.py
+++ b/dev/archery/archery/utils/logger.py
@@ -17,9 +17,14 @@
 
 import logging
 
-
-quiet = False
-
 """ Global logger. """
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("archery")
+
+
+class LoggingContext:
+    def __init__(self, quiet=False):
+        self.quiet = quiet
+
+
+ctx = LoggingContext()
diff --git a/dev/archery/setup.py b/dev/archery/setup.py
new file mode 100644
index 00000000000..54f1c14966e
--- /dev/null
+++ b/dev/archery/setup.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+from setuptools import setup
+
+
+if sys.version_info < (3, 5):
+    sys.exit('Python < 3.5 is not supported due to subprocess.run')
+
+
+setup(
+    name='archery',
+    version="0.1.0",
+    description='Apache Arrow Developers Tools',
+    url='http://github.com/apache/arrow',
+    maintainer='Arrow Developers',
+    maintainer_email='dev@arrow.apache.org',
+    packages=['archery'],
+    install_requires=['click'],
+    entry_points='''
+        [console_scripts]
+        archery=archery.cli:archery
+    ''',
+)
diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
new file mode 100644
index 00000000000..64428a15984
--- /dev/null
+++ b/docs/source/developers/benchmarks.rst
@@ -0,0 +1,123 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _benchmarks:
+
+**********
+Benchmarks
+**********
+
+Archery
+=======
+
+``archery`` is a python library and command line utility made to interact with
+Arrow's sources. The main feature is the benchmarking process.
+
+Installation
+~~~~~~~~~~~~
+
+The simplest way to install archery is with pip from the top-level directory.
+It is recommended to use the ``-e,--editable`` flag such that pip don't copy
+the module files but use the actual sources.
+
+.. code-block:: shell
+  pip install -e dev/archery
+  archery --help
+
+Comparison
+==========
+
+One desire with benchmarks is to detect performance regressions. Thus,
+``archery`` implements a benchmark comparison facility via the ``benchmark
+diff`` command.
+
+In the default invocation, it will compare the current source (known as the
+current workspace in git) with local master branch.
+
+For more information, invoke the ``archery benchmark diff --help`` command for
+multiple examples of invocation.
+
+Iterating efficiently
+~~~~~~~~~~~~~~~~~~~~~
+
+Iterating with benchmarks development can be a tedious process due to long
+build time and long run times. ``archery benchmark diff`` provides 2 methods
+to reduce this overhead.
+
+First, the benchmark command supports comparing existing
+build directories, This can be paired with the ``--preserve`` flag to
+avoid rebuilding sources from zero.
+
+.. code-block:: shell
+
+  # First invocation clone and checkouts in a temporary directory. The
+  # directory is preserved with --preserve
+  archery benchmark diff --preserve
+
+  # Modify C++ sources
+
+  # Re-run benchmark in the previously created build directory.
+  archery benchmark diff /tmp/arrow-bench*/{WORKSPACE,master}/build
+
+Second, the benchmark command supports filtering suites (``--suite-filter``)
+and benchmarks (``--benchmark-filter``), both options supports regular
+expressions.
+
+.. code-block:: shell
+
+  # Taking over a previous run, but only filtering for benchmarks matching
+  # `Kernel` and suite matching `compute-aggregate`.
+  archery benchmark diff                                       \
+    --suite-filter=compute-aggregate --benchmark-filter=Kernel \
+    /tmp/arrow-bench*/{WORKSPACE,master}/build
+
+Both method can be combined.
+
+Regression detection
+====================
+
+Writing a benchmark
+~~~~~~~~~~~~~~~~~~~
+
+1. The benchmark command will filter (by default) benchmarks with the regular
+   expression ``^Regression``. This way, not all benchmarks are run by default.
+   Thus, if you want your benchmark to be verified for regression
+   automatically, the name must match.
+
+2. The benchmark command will run with the ``--benchmark_repetitions=K``
+   options for statistical stability. Thus, a benchmark should not override the
+   repetitions in the (C++) benchmark's arguments definition.
+
+3. Due to #2, a benchmark should run sufficiently fast. Often, when the input
+   does not fit in memory (L2/L3), the benchmark will be memory bound instead
+   of CPU bound. In this case, the input can be downsized.
+
+Scripting
+=========
+
+``archery`` is written as a python library with a command line frontend. The
+library can be imported to automate some tasks.
+
+Some invocation of the command line interface can be quite verbose due to build
+output. This can be controlled/avoided with the ``--quiet`` option, e.g.
+
+.. code-block:: shell
+
+  archery --quiet benchmark diff --benchmark-filter=Kernel
+  {"benchmark": "BenchSumKernel/32768/0", "change": -0.6498, "regression": true, ...
+  {"benchmark": "BenchSumKernel/32768/1", "change": 0.01553, "regression": false, ...
+  ...

From 2a81744cfb4aed1c437be01ab382d7377d200976 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 10:44:09 -0400
Subject: [PATCH 07/31] Ooops.

---
 dev/archery/archery/utils/command.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 4e05487221e..15a4a3411ab 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -54,7 +54,7 @@ def run(self, *argv, **kwargs):
         for key in ["stdout", "stderr"]:
             # Preserve caller intention, otherwise silence
             if key not in kwargs and ctx.quiet:
-                kwargs["key"] = subprocess.PIPE
+                kwargs[key] = subprocess.PIPE
 
         # Prefer safe by default
         if "check" not in kwargs:

From 21b2e14fcf9ffdaffadb286bf0ee22a44150a26c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 10:55:23 -0400
Subject: [PATCH 08/31] Add doc and fix bugs

---
 cpp/src/arrow/compute/kernels/aggregate-benchmark.cc |  2 +-
 dev/archery/archery/benchmark/core.py                | 10 +++++++---
 dev/archery/archery/cli.py                           |  7 +++++--
 docs/source/developers/benchmarks.rst                |  4 ++--
 docs/source/developers/index.rst                     |  1 +
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
index bea2656d73e..6d35dd8f477 100644
--- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
@@ -328,7 +328,7 @@ static void RegressionSumKernel(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t));
 }
 
-BENCHMARK(RegressionSumKernel)->Apply(SetArgs);
+BENCHMARK(RegressionSumKernel)->Apply(BenchmarkSetArgs);
 
 }  // namespace compute
 }  // namespace arrow
diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py
index 3cb4d9568ba..133945b49d0 100644
--- a/dev/archery/archery/benchmark/core.py
+++ b/dev/archery/archery/benchmark/core.py
@@ -75,7 +75,7 @@ def __repr__(self):
         return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]"
 
 
-def regress(change, threshold=0.05):
+def regress(change, threshold):
     # negative change is better, positive is tolerable until it exceeds
     # threshold
     return change > threshold
@@ -89,10 +89,14 @@ def changes(old, new):
     return float(new - old) / abs(old)
 
 
+DEFAULT_THRESHOLD = 0.05
+
+
 class BenchmarkComparator:
-    def __init__(self, contender, baseline):
+    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD):
         self.contender = contender
         self.baseline = baseline
+        self.threshold = threshold
 
     def compare(self, comparator=None):
         base = self.baseline.value
@@ -104,7 +108,7 @@ def compare(self, comparator=None):
         return {
             "benchmark": self.baseline.name,
             "change": change,
-            "regression": regress(adjusted_change),
+            "regression": regress(adjusted_change, self.threshold),
             "baseline": base,
             "contender": cont,
             "unit": self.baseline.unit,
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 9e271843de9..ad02e30b058 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -201,13 +201,15 @@ def benchmark(ctx):
               help="Regex filtering benchmark suites.")
 @click.option("--preserve", type=bool, default=False, show_default=True,
               is_flag=True, help="Preserve workspace for investigation.")
+@click.option("--threshold", type=float, default=0.05, show_default=True,
+              help="Regression failure threshold in percentage.")
 @click.argument("contender", metavar="[<contender>", default=Git.WORKSPACE,
                 required=False)
 @click.argument("baseline", metavar="[<baseline>]]", default="master",
                 required=False)
 @click.pass_context
 def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
-                   contender, baseline):
+                   threshold, contender, baseline):
     """ Compare (diff) benchmark runs.
 
     This command acts like git-diff but for benchmark results.
@@ -288,7 +290,8 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
                 bench_cont = suite_cont[bench_name]
                 bench_base = suite_base[bench_name]
 
-                comparator = BenchmarkComparator(bench_cont, bench_base)
+                comparator = BenchmarkComparator(bench_cont, bench_base,
+                                                 threshold=threshold)
                 comparison = comparator()
 
                 comparison["suite"] = suite_name
diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
index 64428a15984..bb9e32eb79a 100644
--- a/docs/source/developers/benchmarks.rst
+++ b/docs/source/developers/benchmarks.rst
@@ -99,8 +99,8 @@ Writing a benchmark
    automatically, the name must match.
 
 2. The benchmark command will run with the ``--benchmark_repetitions=K``
-   options for statistical stability. Thus, a benchmark should not override the
-   repetitions in the (C++) benchmark's arguments definition.
+   options for statistical significance. Thus, a benchmark should not override
+   the repetitions in the (C++) benchmark's arguments definition.
 
 3. Due to #2, a benchmark should run sufficiently fast. Often, when the input
    does not fit in memory (L2/L3), the benchmark will be memory bound instead
diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst
index a58f96997a6..d30963893cf 100644
--- a/docs/source/developers/index.rst
+++ b/docs/source/developers/index.rst
@@ -22,4 +22,5 @@
    cpp
    python
    integration
+   benchmarks
    documentation

From d6733b6f403a0afcf5ef751a8dfee6f97fc300a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 11:53:04 -0400
Subject: [PATCH 09/31] Formatting

---
 docs/source/developers/benchmarks.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
index bb9e32eb79a..0d69e9d645b 100644
--- a/docs/source/developers/benchmarks.rst
+++ b/docs/source/developers/benchmarks.rst
@@ -35,6 +35,7 @@ It is recommended to use the ``-e,--editable`` flag such that pip don't copy
 the module files but use the actual sources.
 
 .. code-block:: shell
+
   pip install -e dev/archery
   archery --help
 

From bc111b2d38134fd7d26ddb23a4a399a5a56972c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 13:38:33 -0400
Subject: [PATCH 10/31] Removes copied stuff

---
 dev/archery/archery/benchmark/google.py | 69 -------------------------
 dev/archery/archery/benchmark/runner.py |  2 +-
 2 files changed, 1 insertion(+), 70 deletions(-)

diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index ce2d060ccb2..9515d3fc6e3 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -53,62 +53,6 @@ def results(self):
                                    stderr=subprocess.PIPE).stdout)
 
 
-class GoogleContext:
-    """ Represents the runtime environment """
-
-    def __init__(self, date=None, executable=None, **kwargs):
-        self.date = date
-        self.executable = executable
-
-    @property
-    def host(self):
-        host = {
-            "hostname": gethostname(),
-            # Not sure if we should leak this.
-            "mac_address": getnode(),
-        }
-        return host
-
-    @property
-    def git(self):
-        head = git.head()
-        # %ai: author date, ISO 8601-like format
-        timestamp = git.log("-1", "--pretty='%ai'", head)
-        branch = git.current_branch(),
-        git_info = {
-            "git_commit_timestamp": timestamp,
-            "git_hash": head,
-            "git_branch": branch,
-        }
-        return git_info
-
-    @property
-    def toolchain(self):
-        # TODO parse local CMake generated info to extract compile flags and
-        # arrow features
-        deps = {}
-        toolchain = {
-            "language_implementation_version": "c++11",
-            "dependencies": deps,
-        }
-        return toolchain
-
-    def as_arrow(self):
-        ctx = {
-            "benchmark_language": "C++",
-            "run_timestamp": self.date,
-        }
-
-        for extra in (self.host, self.git, self.toolchain):
-            ctx.update(extra)
-
-        return ctx
-
-    @classmethod
-    def from_json(cls, payload):
-        return cls(**payload)
-
-
 class BenchmarkObservation:
     """ Represents one run of a single benchmark. """
 
@@ -159,19 +103,6 @@ def __init__(self, name, runs):
         self.runs = sorted(runs, key=lambda b: b.value)
         super().__init__(name, [b.value for b in self.runs])
 
-    @property
-    def parameters(self):
-        """ Extract parameters from Benchmark's name"""
-        def parse_param(idx, param):
-            k_v = param.split(":")
-            # nameless parameter are transformed into positional names
-            name = k_v[0] if len(k_v) > 1 else f"arg{idx}"
-            return name, k_v[-1]
-
-        params = enumerate(self.name.split("/")[1:])
-        named_params = [parse_param(idx, p) for idx, p in params if p]
-        return {k: v for k, v in named_params}
-
     @property
     def unit(self):
         return self.runs[0].unit
diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
index 13786020f4d..f9ef63e5251 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -21,7 +21,7 @@
 import subprocess
 
 from .core import Benchmark, BenchmarkSuite
-from .google import GoogleBenchmarkCommand, GoogleBenchmark, GoogleContext
+from .google import GoogleBenchmarkCommand, GoogleBenchmark
 from ..utils.logger import logger
 
 

From 1b028390c6ed4fb98cbb97a064fafd1d95d132ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 16:28:41 -0400
Subject: [PATCH 11/31] Rename --cxx_flags to --cxx-flags

---
 dev/archery/archery/cli.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index ad02e30b058..987c9b27b25 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -79,7 +79,7 @@ def validate_arrow_sources(ctx, param, src):
 # toolchain
 @click.option("--cc", metavar="<compiler>", help="C compiler.")
 @click.option("--cxx", metavar="<compiler>", help="C++ compiler.")
-@click.option("--cxx_flags", help="C++ compiler flags.")
+@click.option("--cxx-flags", help="C++ compiler flags.")
 @click.option("--build-type", default="release", type=build_type,
               help="CMake's CMAKE_BUILD_TYPE")
 @click.option("--warn-level", default="production", type=warn_level_type,
@@ -122,7 +122,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs):
     \b
     # Initialize build with clang7 and avx2 support in directory `clang7-build`
     \b
-    archery build --cc=clang-7 --cxx=clang++-7 --cxx_flags=-mavx2 clang7-build
+    archery build --cc=clang-7 --cxx=clang++-7 --cxx-flags=-mavx2 clang7-build
 
     \b
     # Builds and run test
@@ -248,11 +248,11 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
     # Compare g++7 (contender) with clang++-7 (baseline) builds
     \b
     archery build --with-benchmarks=true \\
-            --cxx_flags=-ftree-vectorize \\
+            --cxx-flags=-ftree-vectorize \\
             --cc=gcc-7 --cxx=g++-7 gcc7-build
     \b
     archery build --with-benchmarks=true \\
-            --cxx_flags=-flax-vector-conversions \\
+            --cxx-flags=-flax-vector-conversions \\
             --cc=clang-7 --cxx=clang++-7 clang7-build
     \b
     archery benchmark diff gcc7-build clang7-build

From a281ae8e6a6d51755d8adb2d552f299cc6565b0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 11 Apr 2019 16:37:38 -0400
Subject: [PATCH 12/31] Various language fixes

---
 dev/archery/archery/cli.py            |  4 ++--
 dev/archery/archery/lang/cpp.py       | 16 ++++++++--------
 dev/archery/archery/utils/source.py   |  8 ++++----
 docs/source/developers/benchmarks.rst | 10 +++++-----
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 987c9b27b25..1e0344ea586 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -110,8 +110,8 @@ def build(ctx, src, build_dir, force, targets, **kwargs):
     """ Initialize a C++ build directory.
 
     The build command creates a directory initialized with Arrow's cpp source
-    cmake configuration. It can also optionally invoke the generator to test
-    the build (and used in scripts).
+    cmake and configuration. It can also optionally invoke the generator to
+    test the build (and used in scripts).
 
     Note that archery will carry the caller environment. It will also not touch
     an existing directory, one must use the `--force` option to remove the
diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index 136dadc6b4e..a1fded5a095 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -22,7 +22,7 @@
 from ..utils.cmake import CMakeDefinition
 
 
-def thrutifier(value):
+def truthifier(value):
     return "ON" if value else "OFF"
 
 
@@ -61,14 +61,14 @@ def _gen_defs(self):
         yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug"))
         yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production"))
 
-        yield ("ARROW_BUILD_TESTS", thrutifier(self.with_tests))
-        yield ("ARROW_BUILD_BENCHMARKS", thrutifier(self.with_benchmarks))
+        yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests))
+        yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks))
 
-        yield ("ARROW_PYTHON", thrutifier(self.with_python))
-        yield ("ARROW_PARQUET", thrutifier(self.with_parquet))
-        yield ("ARROW_GANDIVA", thrutifier(self.with_gandiva))
-        yield ("ARROW_PLASMA", thrutifier(self.with_plasma))
-        yield ("ARROW_FLIGHT", thrutifier(self.with_flight))
+        yield ("ARROW_PYTHON", truthifier(self.with_python))
+        yield ("ARROW_PARQUET", truthifier(self.with_parquet))
+        yield ("ARROW_GANDIVA", truthifier(self.with_gandiva))
+        yield ("ARROW_PLASMA", truthifier(self.with_plasma))
+        yield ("ARROW_FLIGHT", truthifier(self.with_flight))
 
     @property
     def definitions(self):
diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py
index 6ff05fb5498..3ba4983eb98 100644
--- a/dev/archery/archery/utils/source.py
+++ b/dev/archery/archery/utils/source.py
@@ -29,7 +29,7 @@ def __init__(self, path):
         """ Initialize an ArrowSources
 
         The caller must ensure that path is valid arrow source directory (can
-        be checked with ArrowSources.valid
+        be checked with ArrowSources.valid)
 
         Parameters
         ----------
@@ -71,7 +71,7 @@ def at_revision(self, revision, clone_dir):
             raise ValueError(f"{self} is not backed by git")
 
         # A local clone is required to leave the current sources intact such
-        # that build depending on said sources are not invalidated (or worse
+        # that builds depending on said sources are not invalidated (or worse
         # slightly affected when re-invoking the generator).
         git.clone("--local", self.path, clone_dir)
         git.checkout("-b", revision, git_dir=clone_dir)
@@ -100,8 +100,8 @@ def find(path=None):
         2. Checks if the environment variable `ARROW_SRC` is defined and use
            this.
 
-        3. Checks if the current working directory (cwd) contains is an Arrow
-           source directory.
+        3. Checks if the current working directory (cwd) is an Arrow source
+           directory.
 
         4. Checks if this file (cli.py) is still in the original source
            repository. If so, returns the relative path to the source
diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
index 0d69e9d645b..b403392ffbc 100644
--- a/docs/source/developers/benchmarks.rst
+++ b/docs/source/developers/benchmarks.rst
@@ -31,8 +31,8 @@ Installation
 ~~~~~~~~~~~~
 
 The simplest way to install archery is with pip from the top-level directory.
-It is recommended to use the ``-e,--editable`` flag such that pip don't copy
-the module files but use the actual sources.
+It is recommended to use the ``-e,--editable`` flag so that pip don't copy
+the module files but uses the actual sources.
 
 .. code-block:: shell
 
@@ -42,7 +42,7 @@ the module files but use the actual sources.
 Comparison
 ==========
 
-One desire with benchmarks is to detect performance regressions. Thus,
+One goal with benchmarking is to detect performance regressions. To this end,
 ``archery`` implements a benchmark comparison facility via the ``benchmark
 diff`` command.
 
@@ -55,7 +55,7 @@ multiple examples of invocation.
 Iterating efficiently
 ~~~~~~~~~~~~~~~~~~~~~
 
-Iterating with benchmarks development can be a tedious process due to long
+Iterating with benchmark development can be a tedious process due to long
 build time and long run times. ``archery benchmark diff`` provides 2 methods
 to reduce this overhead.
 
@@ -86,7 +86,7 @@ expressions.
     --suite-filter=compute-aggregate --benchmark-filter=Kernel \
     /tmp/arrow-bench*/{WORKSPACE,master}/build
 
-Both method can be combined.
+Both methods can be combined.
 
 Regression detection
 ====================

From 7696202ba2f145acd1bf4e17871edd72728027cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Fri, 12 Apr 2019 10:14:27 -0400
Subject: [PATCH 13/31] Add doc for bin attribute.

---
 dev/archery/archery/utils/command.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 15a4a3411ab..32e8d9859a0 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -44,10 +44,14 @@ def wrapper(*argv, **kwargs):
 
 
 class Command:
-    def bin(self):
-        raise NotImplementedError("Command must implement bin() method")
+    """ A runnable command.
+
+    Class inheriting from the Command class must provide the bin
+    property/attribute.
+    """
 
     def run(self, *argv, **kwargs):
+        assert(hasattr(self, "bin"))
         invocation = [find_exec(self.bin)]
         invocation.extend(argv)
 

From 90578af61bf546c06f7f22fe96243f478430eeaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Fri, 12 Apr 2019 10:25:48 -0400
Subject: [PATCH 14/31] Add --cmake-extras to build command

---
 dev/archery/archery/cli.py      | 5 ++++-
 dev/archery/archery/lang/cpp.py | 6 ++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 1e0344ea586..fe11f65acb5 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -99,11 +99,14 @@ def validate_arrow_sources(ctx, param, src):
               help="Build with Plasma object store support.")
 @click.option("--with-flight", default=False, type=bool,
               help="Build with Flight rpc support.")
+@click.option("--cmake-extras", type=str, multiple=True,
+              help="Extra flags/options to pass to cmake invocation. "
+              "Can be stacked")
 # misc
 @click.option("-f", "--force", type=bool, is_flag=True, default=False,
               help="Delete existing build directory if found.")
 @click.option("--targets", type=str, multiple=True,
-              help="Generator targets to run")
+              help="Generator targets to run. Can be stacked.")
 @click.argument("build_dir", type=build_dir_type)
 @click.pass_context
 def build(ctx, src, build_dir, force, targets, **kwargs):
diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index a1fded5a095..240bd3e6d48 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -38,7 +38,7 @@ def __init__(self,
                  # components
                  with_tests=True, with_benchmarks=False, with_python=True,
                  with_parquet=False, with_gandiva=False, with_plasma=False,
-                 with_flight=False):
+                 with_flight=False, cmake_extras=None):
         self.cc = cc
         self.cxx = cxx
         self.cxx_flags = cxx_flags
@@ -53,6 +53,7 @@ def __init__(self,
         self.with_gandiva = with_gandiva
         self.with_plasma = with_plasma
         self.with_flight = with_flight
+        self.cmake_extras = cmake_extras
 
     def _gen_defs(self):
         if self.cxx_flags:
@@ -72,7 +73,8 @@ def _gen_defs(self):
 
     @property
     def definitions(self):
-        return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()]
+        extras = list(self.cmake_extras) if self.cmake_extras else []
+        return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()] + extras
 
     @property
     def environment(self):

From d9692bc8fd5813215defbf35be481673070dbf49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Mon, 15 Apr 2019 12:53:32 -0400
Subject: [PATCH 15/31] Fix splitlines

---
 dev/archery/archery/benchmark/google.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index 9515d3fc6e3..b3b71b417ab 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -41,7 +41,7 @@ def list_benchmarks(self):
             argv.append(f"--benchmark_filter={self.benchmark_filter}")
         result = self.run(*argv, stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
-        return [b for b in result.stdout]
+        return str.splitlines(result.stdout.decode("utf-8"))
 
     def results(self):
         argv = ["--benchmark_format=json", "--benchmark_repetitions=20"]

From 96f999748390d1d52f0712c788bd3ef1a2e1727a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Mon, 15 Apr 2019 13:02:39 -0400
Subject: [PATCH 16/31] Add gitignore entry

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 6bb237af98e..6d08d4607d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@ perf.data.old
 
 cpp/.idea/
 cpp/apidoc/xml/
+dev/archery/archery.egg-info/
 docs/example.gz
 docs/example1.dat
 docs/example3.dat

From 1949f749ce6f9bd43014d54ce3496b8ea9ed45b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 16 Apr 2019 10:25:00 -0400
Subject: [PATCH 17/31] Supports HEAD revisions

---
 dev/archery/archery/utils/source.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py
index 3ba4983eb98..45795a5ba6e 100644
--- a/dev/archery/archery/utils/source.py
+++ b/dev/archery/archery/utils/source.py
@@ -74,7 +74,7 @@ def at_revision(self, revision, clone_dir):
         # that builds depending on said sources are not invalidated (or worse
         # slightly affected when re-invoking the generator).
         git.clone("--local", self.path, clone_dir)
-        git.checkout("-b", revision, git_dir=clone_dir)
+        git.checkout(revision, git_dir=clone_dir)
 
         return ArrowSources(clone_dir)
 

From 8845e3e78095393e2ed9265cf1604ea4b904bb7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 16 Apr 2019 14:03:34 -0400
Subject: [PATCH 18/31] Remove empty __init__.py

---
 dev/archery/archery/__init__.py           | 0
 dev/archery/archery/benchmark/__init__.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 dev/archery/archery/__init__.py
 delete mode 100644 dev/archery/archery/benchmark/__init__.py

diff --git a/dev/archery/archery/__init__.py b/dev/archery/archery/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/dev/archery/archery/benchmark/__init__.py b/dev/archery/archery/benchmark/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000

From c3719214c64a4a0e7f9129bfba6a163afb67919a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 16 Apr 2019 14:39:06 -0400
Subject: [PATCH 19/31] Fix flake8 warnings

---
 dev/archery/archery/benchmark/google.py | 1 -
 dev/archery/archery/benchmark/runner.py | 3 +--
 dev/archery/archery/cli.py              | 1 -
 dev/archery/archery/lang/cpp.py         | 2 --
 dev/archery/archery/utils/command.py    | 2 +-
 5 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index b3b71b417ab..c3c836a329a 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -21,7 +21,6 @@
 
 from .core import Benchmark
 from ..utils.command import Command
-from ..utils.git import git
 
 
 def partition(pred, iterable):
diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
index f9ef63e5251..158f2763667 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -18,9 +18,8 @@
 import glob
 import os
 import re
-import subprocess
 
-from .core import Benchmark, BenchmarkSuite
+from .core import BenchmarkSuite
 from .google import GoogleBenchmarkCommand, GoogleBenchmark
 from ..utils.logger import logger
 
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index fe11f65acb5..6408aabbad1 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -21,7 +21,6 @@
 import json
 import logging
 import os
-import re
 from tempfile import mkdtemp, TemporaryDirectory
 
 
diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index 240bd3e6d48..c83acb22b9b 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -15,10 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import glob
 import os
 
-from ..benchmark.runner import BenchmarkRunner
 from ..utils.cmake import CMakeDefinition
 
 
diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py
index 32e8d9859a0..46d0066d23b 100644
--- a/dev/archery/archery/utils/command.py
+++ b/dev/archery/archery/utils/command.py
@@ -39,7 +39,7 @@ def strip_it(x):
         def wrapper(*argv, **kwargs):
             # Ensure stdout is captured
             kwargs["stdout"] = subprocess.PIPE
-            return strip_it(fn(*argv, **kwargs).stdout)
+            return strip_it(f(*argv, **kwargs).stdout)
         return wrapper
 
 

From 71b10e98addb6363ebc49ab2e40c9143b38be682 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 18 Apr 2019 14:11:14 -0400
Subject: [PATCH 20/31] Disable python in benchmarks

---
 dev/archery/archery/cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 6408aabbad1..135117cd5b8 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -152,7 +152,8 @@ def tmpdir(preserve, prefix="arrow-bench-"):
 
 
 DEFAULT_BENCHMARK_CONF = CppConfiguration(
-    build_type="release", with_tests=True, with_benchmarks=True)
+    build_type="release", with_tests=True, with_benchmarks=True,
+    with_python=False)
 
 
 def cpp_runner_from_rev_or_path(src, root, rev_or_path):

From 048ba0ede54bfb19b13152381be0a944a9381639 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 18 Apr 2019 14:21:57 -0400
Subject: [PATCH 21/31] Add verbose_third_party

---
 dev/archery/archery/lang/cpp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index c83acb22b9b..90822fe6122 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -32,7 +32,7 @@ class CppConfiguration:
     def __init__(self,
                  # toolchain
                  cc=None, cxx=None, cxx_flags=None,
-                 build_type=None, warn_level=None,
+                 build_type=None, warn_level=None, verbose_third_party=True
                  # components
                  with_tests=True, with_benchmarks=False, with_python=True,
                  with_parquet=False, with_gandiva=False, with_plasma=False,
@@ -43,6 +43,7 @@ def __init__(self,
 
         self.build_type = build_type
         self.warn_level = warn_level
+        self.verbose_third_party = verbose_third_party
 
         self.with_tests = with_tests
         self.with_benchmarks = with_benchmarks
@@ -59,6 +60,8 @@ def _gen_defs(self):
 
         yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug"))
         yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production"))
+        yield ("ARROW_VERBOSE_THIRDPARTY_BUILD",
+               truthifier(self.verbose_third_party))
 
         yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests))
         yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks))

From e6762899cbf48795e9211d32d70b368673b50321 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 18 Apr 2019 14:28:35 -0400
Subject: [PATCH 22/31] Typo

---
 dev/archery/archery/lang/cpp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index 90822fe6122..9af0ad8d446 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -32,7 +32,7 @@ class CppConfiguration:
     def __init__(self,
                  # toolchain
                  cc=None, cxx=None, cxx_flags=None,
-                 build_type=None, warn_level=None, verbose_third_party=True
+                 build_type=None, warn_level=None, verbose_third_party=True,
                  # components
                  with_tests=True, with_benchmarks=False, with_python=True,
                  with_parquet=False, with_gandiva=False, with_plasma=False,

From 28254676c27f255303966c5aaf5adb4aff74570c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Mon, 22 Apr 2019 08:53:25 -0400
Subject: [PATCH 23/31] Add --cmake-extras to benchmark-diff command

---
 dev/archery/archery/cli.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 135117cd5b8..65a335aec94 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -151,12 +151,7 @@ def tmpdir(preserve, prefix="arrow-bench-"):
             yield tmp
 
 
-DEFAULT_BENCHMARK_CONF = CppConfiguration(
-    build_type="release", with_tests=True, with_benchmarks=True,
-    with_python=False)
-
-
-def cpp_runner_from_rev_or_path(src, root, rev_or_path):
+def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf):
     build = None
     if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path):
         build = CMakeBuild.from_path(rev_or_path)
@@ -173,7 +168,7 @@ def cpp_runner_from_rev_or_path(src, root, rev_or_path):
         # TODO: find a way to pass custom configuration without cluttering
         # the cli. Ideally via a configuration file that can be shared with the
         # `build` sub-command.
-        cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF)
+        cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
         build = cmake_def.build(os.path.join(root_rev, "build"))
 
     return CppBenchmarkRunner(build)
@@ -206,13 +201,16 @@ def benchmark(ctx):
               is_flag=True, help="Preserve workspace for investigation.")
 @click.option("--threshold", type=float, default=0.05, show_default=True,
               help="Regression failure threshold in percentage.")
+@click.option("--cmake-extras", type=str, multiple=True,
+              help="Extra flags/options to pass to cmake invocation. "
+              "Can be stacked")
 @click.argument("contender", metavar="[<contender>", default=Git.WORKSPACE,
                 required=False)
 @click.argument("baseline", metavar="[<baseline>]]", default="master",
                 required=False)
 @click.pass_context
 def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
-                   threshold, contender, baseline):
+                   threshold, cmake_extras, contender, baseline):
     """ Compare (diff) benchmark runs.
 
     This command acts like git-diff but for benchmark results.
@@ -271,8 +269,12 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
         logger.debug(f"Comparing {contender} (contender) with "
                      f"{baseline} (baseline)")
 
-        runner_cont = cpp_runner_from_rev_or_path(src, root, contender)
-        runner_base = cpp_runner_from_rev_or_path(src, root, baseline)
+        conf = CppConfiguration(
+                build_type="release", with_tests=True, with_benchmarks=True,
+                with_python=False, cmake_extras=cmake_extras)
+
+        runner_cont = cpp_runner_from_rev_or_path(src, root, contender, conf)
+        runner_base = cpp_runner_from_rev_or_path(src, root, baseline, conf)
 
         suites_cont = {s.name: s for s in runner_cont.suites(suite_filter,
                                                              benchmark_filter)}

From dc031bde7fa9080a68694ad6f08f05252b46c8e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Tue, 23 Apr 2019 09:48:44 -0400
Subject: [PATCH 24/31] Support conda toolchain

---
 dev/archery/archery/lang/cpp.py | 41 +++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index 9af0ad8d446..669c6f59601 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -18,6 +18,7 @@
 import os
 
 from ..utils.cmake import CMakeDefinition
+from ..utils.logger import ctx
 
 
 def truthifier(value):
@@ -32,7 +33,8 @@ class CppConfiguration:
     def __init__(self,
                  # toolchain
                  cc=None, cxx=None, cxx_flags=None,
-                 build_type=None, warn_level=None, verbose_third_party=True,
+                 build_type=None, warn_level=None,
+                 install_prefix=None, use_conda=None,
                  # components
                  with_tests=True, with_benchmarks=False, with_python=True,
                  with_parquet=False, with_gandiva=False, with_plasma=False,
@@ -43,7 +45,8 @@ def __init__(self,
 
         self.build_type = build_type
         self.warn_level = warn_level
-        self.verbose_third_party = verbose_third_party
+        self._install_prefix = install_prefix
+        self._use_conda = use_conda
 
         self.with_tests = with_tests
         self.with_benchmarks = with_benchmarks
@@ -60,8 +63,13 @@ def _gen_defs(self):
 
         yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug"))
         yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production"))
-        yield ("ARROW_VERBOSE_THIRDPARTY_BUILD",
-               truthifier(self.verbose_third_party))
+
+        if not ctx.quiet:
+            yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON")
+
+        maybe_prefix = self.install_prefix
+        if maybe_prefix:
+            yield ("CMAKE_INSTALL_PREFIX", maybe_prefix)
 
         yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests))
         yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks))
@@ -72,6 +80,31 @@ def _gen_defs(self):
         yield ("ARROW_PLASMA", truthifier(self.with_plasma))
         yield ("ARROW_FLIGHT", truthifier(self.with_flight))
 
+        # Detect custom conda toolchain
+        if self.use_conda:
+            for d, v in [('CMAKE_AR', 'AR'), ('CMAKE_RANLIB', 'RANLIB')]:
+                v = os.environ.get(v)
+                if v:
+                    yield (d, v)
+
+    @property
+    def install_prefix(self):
+        if self._install_prefix:
+            return self._install_prefix
+
+        if self.use_conda:
+            return os.environ.get("CONDA_PREFIX")
+
+        return None
+
+    @property
+    def use_conda(self):
+        # If the user didn't specify a preference, guess via environment
+        if self._use_conda is None:
+            return os.environ.get("CONDA_PREFIX") is not None
+
+        return self._use_conda
+
     @property
     def definitions(self):
         extras = list(self.cmake_extras) if self.cmake_extras else []

From 280c93be4ea6954a06dd426975e95fcfcda6ab90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Wed, 24 Apr 2019 11:09:59 -0400
Subject: [PATCH 25/31] Update gitignore

---
 .gitignore        | 3 ++-
 python/.gitignore | 2 --
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6d08d4607d5..4a03020f950 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,12 +45,13 @@ perf.data.old
 
 cpp/.idea/
 cpp/apidoc/xml/
-dev/archery/archery.egg-info/
 docs/example.gz
 docs/example1.dat
 docs/example3.dat
 python/.eggs/
 python/doc/
+# Egg metadata
+*.egg-info
 
 .vscode
 .idea/
diff --git a/python/.gitignore b/python/.gitignore
index 3346aa62df4..8f08f9377bf 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -25,8 +25,6 @@ pyarrow/include
 build
 # setup.py dist directory
 dist
-# Egg metadata
-*.egg-info
 # Coverage
 .coverage
 coverage.xml

From 514e8e42819eb994c2446e65658ce295a618a29f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Wed, 24 Apr 2019 13:57:06 -0400
Subject: [PATCH 26/31] Introduce RegressionSetArgs

---
 cpp/src/arrow/compute/benchmark-util.h              | 13 +++++++++++++
 .../arrow/compute/kernels/aggregate-benchmark.cc    |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h
index 1678f8d51e2..865da6671e3 100644
--- a/cpp/src/arrow/compute/benchmark-util.h
+++ b/cpp/src/arrow/compute/benchmark-util.h
@@ -55,5 +55,18 @@ void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) {
       bench->Args({static_cast<ArgsType>(size), nulls});
 }
 
+void RegressionSetArgs(benchmark::internal::Benchmark* bench) {
+  // Benchmark changed its parameter type between releases from
+  // int to int64_t. As it doesn't have version macros, we need
+  // to apply C++ template magic.
+  using ArgsType =
+      typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;
+  bench->Unit(benchmark::kMicrosecond);
+
+  // Regressions should only bench L1 data for better stability
+  for (auto nulls : std::vector<ArgsType>({0, 1, 10, 50}))
+    bench->Args({static_cast<ArgsType>(kL1Size), nulls});
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
index 6d35dd8f477..bbc923f6ebd 100644
--- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc
@@ -328,7 +328,7 @@ static void RegressionSumKernel(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t));
 }
 
-BENCHMARK(RegressionSumKernel)->Apply(BenchmarkSetArgs);
+BENCHMARK(RegressionSumKernel)->Apply(RegressionSetArgs);
 
 }  // namespace compute
 }  // namespace arrow

From d8e3c1c85f288f3d3a8c13266c450ef6fba6263f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Wed, 24 Apr 2019 13:59:37 -0400
Subject: [PATCH 27/31] Review

---
 dev/archery/archery/benchmark/core.py   | 97 +++----------------------
 dev/archery/archery/benchmark/google.py | 77 +++++++++++++++-----
 dev/archery/archery/cli.py              | 71 ++++++++----------
 dev/archery/archery/lang/cpp.py         |  5 +-
 dev/archery/archery/utils/cmake.py      | 13 ++++
 dev/archery/archery/utils/git.py        |  7 +-
 dev/archery/archery/utils/source.py     | 15 +++-
 dev/archery/setup.py                    |  4 +-
 docs/source/developers/benchmarks.rst   |  3 +
 9 files changed, 133 insertions(+), 159 deletions(-)

diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py
index 133945b49d0..83bc2735dd9 100644
--- a/dev/archery/archery/benchmark/core.py
+++ b/dev/archery/archery/benchmark/core.py
@@ -15,53 +15,24 @@
 # specific language governing permissions and limitations
 # under the License.
 
-
-class Statistics:
-    def __init__(self, min, max, mean, stddev, q1, median, q3):
-        self.min = min
-        self.max = max
-        self.mean = mean
-        self.stddev = stddev
-        self.q1 = q1
-        self.median = median
-        self.q3 = q3
-
-    @staticmethod
-    def from_values(values):
-        sorted(values)
-        n = len(values)
-        mean = sum(values) / len(values)
-        sum_diff = sum([(val - mean)**2 for val in values])
-        stddev = (sum_diff / (n - 1))**0.5 if n > 1 else 0.0
-
-        return Statistics(
-            values[0],
-            values[-1],
-            mean,
-            stddev,
-            values[int(n/4)],
-            values[int(n/2)],
-            values[int((3*n)/4)],
-        )
+import pandas as pa
 
 
 class Benchmark:
-    def __init__(self, name, values, stats=None):
+    def __init__(self, name, unit, less_is_better, values, stats=None):
         self.name = name
-        self.values = values
-        self.statistics = stats if stats else Statistics.from_values(values)
+        self.unit = unit
+        self.less_is_better = less_is_better
+        self.values = pa.Series(values)
+        self.statistics = self.values.describe()
 
     @property
     def value(self):
-        return self.statistics.median
-
-    @property
-    def unit(self):
-        return None
+        median = "50%"
+        return float(self.statistics[median])
 
-    @property
-    def less_is_better(self):
-        return True
+    def __repr__(self):
+        return f"Benchmark[name={self.name},value={self.value}]"
 
 
 class BenchmarkSuite:
@@ -73,51 +44,3 @@ def __repr__(self):
         name = self.name
         benchmarks = self.benchmarks
         return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]"
-
-
-def regress(change, threshold):
-    # negative change is better, positive is tolerable until it exceeds
-    # threshold
-    return change > threshold
-
-
-def changes(old, new):
-    if old == 0 and new == 0:
-        return 0.0
-    if old == 0:
-        return float(new - old) / (float(old + new) / 2)
-    return float(new - old) / abs(old)
-
-
-DEFAULT_THRESHOLD = 0.05
-
-
-class BenchmarkComparator:
-    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD):
-        self.contender = contender
-        self.baseline = baseline
-        self.threshold = threshold
-
-    def compare(self, comparator=None):
-        base = self.baseline.value
-        cont = self.contender.value
-        change = changes(base, cont)
-        adjusted_change = change
-        if not self.baseline.less_is_better:
-            adjusted_change = change * -1.0
-        return {
-            "benchmark": self.baseline.name,
-            "change": change,
-            "regression": regress(adjusted_change, self.threshold),
-            "baseline": base,
-            "contender": cont,
-            "unit": self.baseline.unit,
-            "less_is_better": self.baseline.less_is_better,
-        }
-
-    def confidence(self):
-        """ Indicate if a comparison of benchmarks should be trusted. """
-        return True
-
-    def __call__(self, **kwargs):
-        return self.compare(**kwargs)
diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py
index c3c836a329a..d6efb77dc22 100644
--- a/dev/archery/archery/benchmark/google.py
+++ b/dev/archery/archery/benchmark/google.py
@@ -30,6 +30,12 @@ def partition(pred, iterable):
 
 
 class GoogleBenchmarkCommand(Command):
+    """ Run a google benchmark binary.
+
+    This assumes the binary supports the standard command line options,
+    notably `--benchmark_filter`, `--benchmark_format`, etc...
+    """
+
     def __init__(self, benchmark_bin, benchmark_filter=None):
         self.bin = benchmark_bin
         self.benchmark_filter = benchmark_filter
@@ -52,24 +58,48 @@ def results(self):
                                    stderr=subprocess.PIPE).stdout)
 
 
-class BenchmarkObservation:
-    """ Represents one run of a single benchmark. """
+class GoogleBenchmarkObservation:
+    """ Represents one run of a single (google c++) benchmark.
+
+    Observations are found when running with `--benchmark_repetitions`. Sadly,
+    the format mixes values and aggregates, e.g.
+
+    RegressionSumKernel/32768/0                 1 us          1 us  25.8077GB/s
+    RegressionSumKernel/32768/0                 1 us          1 us  25.7066GB/s
+    RegressionSumKernel/32768/0                 1 us          1 us  25.1481GB/s
+    RegressionSumKernel/32768/0                 1 us          1 us  25.846GB/s
+    RegressionSumKernel/32768/0                 1 us          1 us  25.6453GB/s
+    RegressionSumKernel/32768/0_mean            1 us          1 us  25.6307GB/s
+    RegressionSumKernel/32768/0_median          1 us          1 us  25.7066GB/s
+    RegressionSumKernel/32768/0_stddev          0 us          0 us  288.046MB/s
 
-    def __init__(self, **kwargs):
-        self._name = kwargs.get("name")
-        self.real_time = kwargs.get("real_time")
-        self.cpu_time = kwargs.get("cpu_time")
-        self.time_unit = kwargs.get("time_unit")
-        self.size = kwargs.get("size")
-        self.bytes_per_second = kwargs.get("bytes_per_second")
+    As from benchmark v1.4.1 (2019-04-24), the only way to differentiate an
+    actual run from the aggregates, is to match on the benchmark name. The
+    aggregates will be appended with `_$agg_name`.
+
+    This class encapsulate the logic to separate runs from aggregate . This is
+    hopefully avoided in benchmark's master version with a separate json
+    attribute.
+    """
+
+    def __init__(self, name, real_time, cpu_time, time_unit, size=None,
+                 bytes_per_second=None, **kwargs):
+        self._name = name
+        self.real_time = real_time
+        self.cpu_time = cpu_time
+        self.time_unit = time_unit
+        self.size = size
+        self.bytes_per_second = bytes_per_second
 
     @property
     def is_agg(self):
+        """ Indicate if the observation is a run or an aggregate. """
         suffixes = ["_mean", "_median", "_stddev"]
         return any(map(lambda x: self._name.endswith(x), suffixes))
 
     @property
     def is_realtime(self):
+        """ Indicate if the preferred value is realtime instead of cputime. """
         return self.name.find("/realtime") != -1
 
     @property
@@ -95,20 +125,29 @@ def __repr__(self):
 
 
 class GoogleBenchmark(Benchmark):
+    """ A set of GoogleBenchmarkObservations. """
+
     def __init__(self, name, runs):
+        """ Initialize a GoogleBenchmark.
+
+        Parameters
+        ----------
+        name: str
+              Name of the benchmark
+        runs: list(GoogleBenchmarkObservation)
+              Repetitions of GoogleBenchmarkObservation run.
+
+        """
         self.name = name
         # exclude google benchmark aggregate artifacts
         _, runs = partition(lambda b: b.is_agg, runs)
         self.runs = sorted(runs, key=lambda b: b.value)
-        super().__init__(name, [b.value for b in self.runs])
-
-    @property
-    def unit(self):
-        return self.runs[0].unit
-
-    @property
-    def less_is_better(self):
-        return self.runs[0].size is None
+        unit = self.runs[0].unit
+        # If `size` is found in the json dict, then the benchmark is reported
+        # in bytes per second
+        less_is_better = self.runs[0].size is None
+        values = [b.value for b in self.runs]
+        super().__init__(name, unit, less_is_better, values)
 
     def __repr__(self):
         return f"GoogleBenchmark[name={self.name},runs={self.runs}]"
@@ -118,6 +157,6 @@ def from_json(cls, payload):
         def group_key(x):
             return x.name
 
-        benchmarks = map(lambda x: BenchmarkObservation(**x), payload)
+        benchmarks = map(lambda x: GoogleBenchmarkObservation(**x), payload)
         groups = groupby(sorted(benchmarks, key=group_key), group_key)
         return [cls(k, list(bs)) for k, bs in groups]
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 65a335aec94..c8929e7dd1b 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -21,14 +21,14 @@
 import json
 import logging
 import os
+import sys
 from tempfile import mkdtemp, TemporaryDirectory
 
-
-from .benchmark.core import BenchmarkComparator
+from .benchmark.compare import RunnerComparator
 from .benchmark.runner import CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
 from .utils.cmake import CMakeBuild
-from .utils.git import Git
+from .utils.codec import JsonEncoder
 from .utils.logger import logger, ctx as log_ctx
 from .utils.source import ArrowSources
 
@@ -152,6 +152,14 @@ def tmpdir(preserve, prefix="arrow-bench-"):
 
 
 def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf):
+    """ Returns a CppBenchmarkRunner from a path or a git revision.
+
+    First, it checks if `rev_or_path` points to a valid CMake build directory.
+    If so, it creates a CppBenchmarkRunner with this existing CMakeBuild.
+
+    Otherwise, it assumes `rev_or_path` is a revision and clone/checkout the
+    given revision and create a fresh CMakeBuild.
+    """
     build = None
     if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path):
         build = CMakeBuild.from_path(rev_or_path)
@@ -159,21 +167,18 @@ def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf):
         root_rev = os.path.join(root, rev_or_path)
         os.mkdir(root_rev)
 
-        # Possibly checkout the sources at given revision
-        src_rev = src
-        if rev_or_path != Git.WORKSPACE:
-            root_src = os.path.join(root_rev, "arrow")
-            src_rev = src.at_revision(rev_or_path, root_src)
-
-        # TODO: find a way to pass custom configuration without cluttering
-        # the cli. Ideally via a configuration file that can be shared with the
-        # `build` sub-command.
+        clone_dir = os.path.join(root_rev, "arrow")
+        # Possibly checkout the sources at given revision, no need to perform
+        # cleanup on cloned repository as root_rev is reclaimed.
+        src_rev, _ = src.at_revision(rev_or_path, clone_dir)
         cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
         build = cmake_def.build(os.path.join(root_rev, "build"))
 
     return CppBenchmarkRunner(build)
 
 
+# Running all benchmarks would be prohibitive. Benchmark who needs to be
+# monitored for regression should be named with this prefix.
 DEFAULT_BENCHMARK_FILTER = "^Regression"
 
 
@@ -204,8 +209,8 @@ def benchmark(ctx):
 @click.option("--cmake-extras", type=str, multiple=True,
               help="Extra flags/options to pass to cmake invocation. "
               "Can be stacked")
-@click.argument("contender", metavar="[<contender>", default=Git.WORKSPACE,
-                required=False)
+@click.argument("contender", metavar="[<contender>",
+                default=ArrowSources.WORKSPACE, required=False)
 @click.argument("baseline", metavar="[<baseline>]]", default="master",
                 required=False)
 @click.pass_context
@@ -219,7 +224,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
     unspecified, the contender will default to the current workspace (like git)
     and the baseline will default to master.
 
-    Each target (contender or baseline) can either be a git object reference
+    Each target (contender or baseline) can either be a git revision
     (commit, tag, special values like HEAD) or a cmake build directory. This
     allow comparing git commits, and/or different compilers and/or compiler
     flags.
@@ -270,39 +275,21 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
                      f"{baseline} (baseline)")
 
         conf = CppConfiguration(
-                build_type="release", with_tests=True, with_benchmarks=True,
-                with_python=False, cmake_extras=cmake_extras)
+            build_type="release", with_tests=True, with_benchmarks=True,
+            with_python=False, cmake_extras=cmake_extras)
 
         runner_cont = cpp_runner_from_rev_or_path(src, root, contender, conf)
         runner_base = cpp_runner_from_rev_or_path(src, root, baseline, conf)
 
-        suites_cont = {s.name: s for s in runner_cont.suites(suite_filter,
-                                                             benchmark_filter)}
-        suites_base = {s.name: s for s in runner_base.suites(suite_filter,
-                                                             benchmark_filter)}
-
-        for suite_name in sorted(suites_cont.keys() & suites_base.keys()):
-            logger.debug(f"Comparing {suite_name}")
-
-            suite_cont = {
-                b.name: b for b in suites_cont[suite_name].benchmarks}
-            suite_base = {
-                b.name: b for b in suites_base[suite_name].benchmarks}
-
-            for bench_name in sorted(suite_cont.keys() & suite_base.keys()):
-                logger.debug(f"Comparing {bench_name}")
-
-                bench_cont = suite_cont[bench_name]
-                bench_base = suite_base[bench_name]
-
-                comparator = BenchmarkComparator(bench_cont, bench_base,
-                                                 threshold=threshold)
-                comparison = comparator()
+        runner_comp = RunnerComparator(runner_cont, runner_base, threshold)
+        comparisons = runner_comp.comparisons(suite_filter, benchmark_filter)
 
-                comparison["suite"] = suite_name
-                comparison["benchmark"] = bench_name
+        regressions = 0
+        for comparator in comparisons:
+            regressions += comparator.regression
+            print(json.dumps(comparator, cls=JsonEncoder))
 
-                print(json.dumps(comparison))
+        sys.exit(regressions)
 
 
 if __name__ == "__main__":
diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py
index 669c6f59601..84b6346e14e 100644
--- a/dev/archery/archery/lang/cpp.py
+++ b/dev/archery/archery/lang/cpp.py
@@ -18,7 +18,6 @@
 import os
 
 from ..utils.cmake import CMakeDefinition
-from ..utils.logger import ctx
 
 
 def truthifier(value):
@@ -64,8 +63,8 @@ def _gen_defs(self):
         yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug"))
         yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production"))
 
-        if not ctx.quiet:
-            yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON")
+        # if not ctx.quiet:
+        #   yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON")
 
         maybe_prefix = self.install_prefix
         if maybe_prefix:
diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py
index a549f07b8b4..9fa5ca66f52 100644
--- a/dev/archery/archery/utils/cmake.py
+++ b/dev/archery/archery/utils/cmake.py
@@ -168,12 +168,25 @@ def test(self):
 
     @staticmethod
     def is_build_dir(path):
+        """ Indicate if a path is CMake build directory.
+
+        This method only checks for the existence of paths and does not do any
+        validation whatsoever.
+        """
         cmake_cache = os.path.join(path, "CMakeCache.txt")
         cmake_files = os.path.join(path, "CMakeFiles")
         return os.path.exists(cmake_cache) and os.path.exists(cmake_files)
 
     @staticmethod
     def from_path(path):
+        """ Instantiate a CMakeBuild from a path.
+
+        This is used to recover from an existing physical directory (created
+        with or without CMakeBuild).
+
+        Note that this method is not idempotent as the original definition will
+        be lost. Only some parameters are recovered (generator and build_type).
+        """
         if not CMakeBuild.is_build_dir(path):
             raise ValueError(f"Not a valid CMakeBuild path: {path}")
 
diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py
index 2d62eeb31bd..c6113524f1b 100644
--- a/dev/archery/archery/utils/git.py
+++ b/dev/archery/archery/utils/git.py
@@ -32,9 +32,6 @@ def wrapper(self, *argv, **kwargs):
 
 
 class Git(Command):
-    WORKSPACE = "WORKSPACE"
-    HEAD = "HEAD"
-
     def __init__(self, git_bin=None):
         self.bin = git_bin if git_bin else os.environ.get("GIT", "git")
 
@@ -66,11 +63,11 @@ def rev_parse(self, *argv, **kwargs):
     @capture_stdout(strip=True)
     def head(self, **kwargs):
         """ Return commit pointed by HEAD. """
-        return self.rev_parse(Git.HEAD, **kwargs)
+        return self.rev_parse("HEAD", **kwargs)
 
     @capture_stdout(strip=True)
     def current_branch(self, **kwargs):
-        return self.rev_parse("--abbrev-ref", Git.HEAD, **kwargs)
+        return self.rev_parse("--abbrev-ref", "HEAD", **kwargs)
 
 
 git = Git()
diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py
index 45795a5ba6e..12dc735ea0b 100644
--- a/dev/archery/archery/utils/source.py
+++ b/dev/archery/archery/utils/source.py
@@ -24,6 +24,10 @@ class ArrowSources:
     """ ArrowSources is a companion class representing a directory containing
     Apache Arrow's sources.
     """
+    # Note that WORKSPACE is a reserved git revision name by this module to
+    # reference the current git workspace. In other words, this indicates to
+    # ArrowSources.at_revision that no cloning/checkout is required.
+    WORKSPACE = "WORKSPACE"
 
     def __init__(self, path):
         """ Initialize an ArrowSources
@@ -60,6 +64,12 @@ def at_revision(self, revision, clone_dir):
         This method may return the current object if no checkout is required.
         The caller is responsible to remove the cloned repository directory.
 
+        The user can use the special WORKSPACE token to mean the current git
+        workspace (no checkout performed).
+
+        The second value of the returned tuple indicates if a clone was
+        performed.
+
         Parameters
         ----------
         revision : str
@@ -70,13 +80,16 @@ def at_revision(self, revision, clone_dir):
         if not self.git_backed:
             raise ValueError(f"{self} is not backed by git")
 
+        if revision == ArrowSources.WORKSPACE:
+            return self, False
+
         # A local clone is required to leave the current sources intact such
         # that builds depending on said sources are not invalidated (or worse
         # slightly affected when re-invoking the generator).
         git.clone("--local", self.path, clone_dir)
         git.checkout(revision, git_dir=clone_dir)
 
-        return ArrowSources(clone_dir)
+        return ArrowSources(clone_dir), True
 
     @staticmethod
     def valid(src):
diff --git a/dev/archery/setup.py b/dev/archery/setup.py
index 54f1c14966e..2cf692c005a 100644
--- a/dev/archery/setup.py
+++ b/dev/archery/setup.py
@@ -21,7 +21,7 @@
 
 
 if sys.version_info < (3, 5):
-    sys.exit('Python < 3.5 is not supported due to subprocess.run')
+    sys.exit('Python < 3.5 is not supported')
 
 
 setup(
@@ -32,7 +32,7 @@
     maintainer='Arrow Developers',
     maintainer_email='dev@arrow.apache.org',
     packages=['archery'],
-    install_requires=['click'],
+    install_requires=['click', 'pandas'],
     entry_points='''
         [console_scripts]
         archery=archery.cli:archery
diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst
index b403392ffbc..d0e6f1b8aa4 100644
--- a/docs/source/developers/benchmarks.rst
+++ b/docs/source/developers/benchmarks.rst
@@ -39,6 +39,9 @@ the module files but uses the actual sources.
   pip install -e dev/archery
   archery --help
 
+  # optional: enable bash/zsh autocompletion
+  eval "$(_ARCHERY_COMPLETE=source archery)"
+
 Comparison
 ==========
 

From 2a953f1808566da01bbb90faeabe8131ff55f902 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Wed, 24 Apr 2019 14:05:38 -0400
Subject: [PATCH 28/31] Missing files

---
 dev/archery/archery/benchmark/compare.py | 120 +++++++++++++++++++++++
 dev/archery/archery/utils/codec.py       |  43 ++++++++
 dev/archery/tests/test_benchmarks.py     |  39 ++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 dev/archery/archery/benchmark/compare.py
 create mode 100644 dev/archery/archery/utils/codec.py
 create mode 100644 dev/archery/tests/test_benchmarks.py

diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py
new file mode 100644
index 00000000000..74a2538157b
--- /dev/null
+++ b/dev/archery/archery/benchmark/compare.py
@@ -0,0 +1,120 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+DEFAULT_THRESHOLD = 0.05
+
+
+class BenchmarkComparator:
+    """ Compares two benchmarks.
+
+    Encodes the logic of comparing two benchmarks and taking a decision on
+    if it induce a regression.
+    """
+
+    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD,
+                 suite_name=None):
+        self.contender = contender
+        self.baseline = baseline
+        self.threshold = threshold
+        self.suite_name = suite_name
+
+    @property
+    def name(self):
+        return self.baseline.name
+
+    @property
+    def less_is_better(self):
+        return self.baseline.less_is_better
+
+    @property
+    def unit(self):
+        return self.baseline.unit
+
+    @property
+    def change(self):
+        new = self.contender.value
+        old = self.baseline.value
+
+        if old == 0 and new == 0:
+            return 0.0
+        if old == 0:
+            return 0.0
+
+        return float(new - old) / abs(old)
+
+    @property
+    def confidence(self):
+        """ Indicate if a comparison of benchmarks should be trusted. """
+        return True
+
+    @property
+    def regression(self):
+        change = self.change
+        adjusted_change = change if self.less_is_better else -change
+        return (self.confidence and adjusted_change > self.threshold)
+
+    def compare(self, comparator=None):
+        return {
+            "benchmark": self.name,
+            "change": self.change,
+            "regression": self.regression,
+            "baseline": self.baseline.value,
+            "contender": self.contender.value,
+            "unit": self.unit,
+            "less_is_better": self.less_is_better,
+        }
+
+    def __call__(self, **kwargs):
+        return self.compare(**kwargs)
+
+
+def pairwise_compare(contender, baseline):
+    dict_contender = {e.name: e for e in contender}
+    dict_baseline = {e.name: e for e in baseline}
+
+    for name in (dict_contender.keys() & dict_baseline.keys()):
+        yield name, (dict_contender[name], dict_baseline[name])
+
+
+class RunnerComparator:
+    """ Compares suites/benchmarks from runners.
+
+    It is up to the caller that ensure that runners are compatible (both from
+    the same language implementation).
+    """
+
+    def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD):
+        self.contender = contender
+        self.baseline = baseline
+        self.threshold = threshold
+
+    def comparisons(self, suite_filter=None, benchmark_filter=None):
+        """
+        """
+        contender = self.contender.suites(suite_filter, benchmark_filter)
+        baseline = self.baseline.suites(suite_filter, benchmark_filter)
+        suites = pairwise_compare(contender, baseline)
+
+        for suite_name, (suite_cont, suite_base) in suites:
+            benchmarks = pairwise_compare(
+                suite_cont.benchmarks, suite_base.benchmarks)
+
+            for bench_name, (bench_cont, bench_base) in benchmarks:
+                yield BenchmarkComparator(bench_cont, bench_base,
+                                          threshold=self.threshold,
+                                          suite_name=suite_name)
diff --git a/dev/archery/archery/utils/codec.py b/dev/archery/archery/utils/codec.py
new file mode 100644
index 00000000000..612f2df5605
--- /dev/null
+++ b/dev/archery/archery/utils/codec.py
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import json
+
+from ..benchmark.compare import BenchmarkComparator
+
+
+class JsonEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, BenchmarkComparator):
+            comparator = {
+                "benchmark": o.name,
+                "change": o.change,
+                "regression": o.regression,
+                "baseline": o.baseline.value,
+                "contender": o.contender.value,
+                "unit": o.unit,
+                "less_is_better": o.less_is_better,
+            }
+
+            suite_name = o.suite_name
+            if suite_name:
+                comparator["suite"] = suite_name
+
+            return comparator
+
+        return json.JSONEncoder.default(self, o)
diff --git a/dev/archery/tests/test_benchmarks.py b/dev/archery/tests/test_benchmarks.py
new file mode 100644
index 00000000000..d199a40a195
--- /dev/null
+++ b/dev/archery/tests/test_benchmarks.py
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from archery.benchmark.core import Benchmark
+from archery.benchmark.compare import BenchmarkComparator
+
+
+def test_benchmark_comparator():
+    unit = "micros"
+
+    assert not BenchmarkComparator(
+        Benchmark("contender", unit, True, [10]),
+        Benchmark("baseline", unit, True, [20])).regression
+
+    assert BenchmarkComparator(
+        Benchmark("contender", unit, False, [10]),
+        Benchmark("baseline", unit, False, [20])).regression
+
+    assert BenchmarkComparator(
+        Benchmark("contender", unit, True, [20]),
+        Benchmark("baseline", unit, True, [10])).regression
+
+    assert not BenchmarkComparator(
+        Benchmark("contender", unit, False, [20]),
+        Benchmark("baseline", unit, False, [10])).regression

From ee39a1feb742dc745fe9af7d9cfc3446c71e74a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Wed, 24 Apr 2019 15:40:58 -0400
Subject: [PATCH 29/31] Move cpp_runner_from_rev_or_path in CppRunner

---
 dev/archery/archery/benchmark/runner.py | 29 ++++++++++++++++++++++
 dev/archery/archery/cli.py              | 33 +++----------------------
 dev/archery/archery/utils/cmake.py      |  4 +--
 3 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
index 158f2763667..22d1621ab7d 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -21,6 +21,8 @@
 
 from .core import BenchmarkSuite
 from .google import GoogleBenchmarkCommand, GoogleBenchmark
+from ..lang.cpp import CppCMakeDefinition
+from ..utils.cmake import CMakeBuild
 from ..utils.logger import logger
 
 
@@ -83,3 +85,30 @@ def suites(self, suite_filter=None, benchmark_filter=None):
                 continue
 
             yield suite
+
+    @staticmethod
+    def from_rev_or_path(src, root, rev_or_path, cmake_conf):
+        """ Returns a CppBenchmarkRunner from a path or a git revision.
+
+        First, it checks if `rev_or_path` points to a valid CMake build
+        directory.  If so, it creates a CppBenchmarkRunner with this existing
+        CMakeBuild.
+
+        Otherwise, it assumes `rev_or_path` is a revision and clone/checkout
+        the given revision and create a fresh CMakeBuild.
+        """
+        build = None
+        if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path):
+            build = CMakeBuild.from_path(rev_or_path)
+        else:
+            root_rev = os.path.join(root, rev_or_path)
+            os.mkdir(root_rev)
+
+            clone_dir = os.path.join(root_rev, "arrow")
+            # Possibly checkout the sources at given revision, no need to
+            # perform cleanup on cloned repository as root_rev is reclaimed.
+            src_rev, _ = src.at_revision(rev_or_path, clone_dir)
+            cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
+            build = cmake_def.build(os.path.join(root_rev, "build"))
+
+        return CppBenchmarkRunner(build)
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index c8929e7dd1b..2f203123ec5 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -27,7 +27,6 @@
 from .benchmark.compare import RunnerComparator
 from .benchmark.runner import CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
-from .utils.cmake import CMakeBuild
 from .utils.codec import JsonEncoder
 from .utils.logger import logger, ctx as log_ctx
 from .utils.source import ArrowSources
@@ -151,32 +150,6 @@ def tmpdir(preserve, prefix="arrow-bench-"):
             yield tmp
 
 
-def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf):
-    """ Returns a CppBenchmarkRunner from a path or a git revision.
-
-    First, it checks if `rev_or_path` points to a valid CMake build directory.
-    If so, it creates a CppBenchmarkRunner with this existing CMakeBuild.
-
-    Otherwise, it assumes `rev_or_path` is a revision and clone/checkout the
-    given revision and create a fresh CMakeBuild.
-    """
-    build = None
-    if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path):
-        build = CMakeBuild.from_path(rev_or_path)
-    else:
-        root_rev = os.path.join(root, rev_or_path)
-        os.mkdir(root_rev)
-
-        clone_dir = os.path.join(root_rev, "arrow")
-        # Possibly checkout the sources at given revision, no need to perform
-        # cleanup on cloned repository as root_rev is reclaimed.
-        src_rev, _ = src.at_revision(rev_or_path, clone_dir)
-        cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf)
-        build = cmake_def.build(os.path.join(root_rev, "build"))
-
-    return CppBenchmarkRunner(build)
-
-
 # Running all benchmarks would be prohibitive. Benchmark who needs to be
 # monitored for regression should be named with this prefix.
 DEFAULT_BENCHMARK_FILTER = "^Regression"
@@ -278,8 +251,10 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter,
             build_type="release", with_tests=True, with_benchmarks=True,
             with_python=False, cmake_extras=cmake_extras)
 
-        runner_cont = cpp_runner_from_rev_or_path(src, root, contender, conf)
-        runner_base = cpp_runner_from_rev_or_path(src, root, baseline, conf)
+        runner_cont = CppBenchmarkRunner.from_rev_or_path(
+            src, root, contender, conf)
+        runner_base = CppBenchmarkRunner.from_rev_or_path(
+            src, root, baseline, conf)
 
         runner_comp = RunnerComparator(runner_cont, runner_base, threshold)
         comparisons = runner_comp.comparisons(suite_filter, benchmark_filter)
diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py
index 9fa5ca66f52..38aedab2d16 100644
--- a/dev/archery/archery/utils/cmake.py
+++ b/dev/archery/archery/utils/cmake.py
@@ -200,9 +200,7 @@ def from_path(path):
         cmake_cache_path = os.path.join(path, "CMakeCache.txt")
         with open(cmake_cache_path, "r") as cmake_cache:
             candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read())
-            if len(candidates) != 1:
-                ValueError("Could not chose build_type from {candidates}")
-            build_type = candidates[0].lower()
+            build_type = candidates[0].lower() if candidates else "release"
 
         return CMakeBuild(path, generator, build_type)
 

From e95baf3178eb847d934766bb4e746fd1459600ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Wed, 24 Apr 2019 20:28:48 -0400
Subject: [PATCH 30/31] Add comments and move stuff

---
 dev/archery/archery/benchmark/compare.py | 2 ++
 dev/archery/archery/cli.py               | 8 ++++++--
 dev/archery/archery/utils/logger.py      | 1 -
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py
index 74a2538157b..bf9811f2c8d 100644
--- a/dev/archery/archery/benchmark/compare.py
+++ b/dev/archery/archery/benchmark/compare.py
@@ -16,6 +16,8 @@
 # under the License.
 
 
+# Define a global regression threshold as 5%. This is purely subjective and
+# flawed. This does not track cumulative regression.
 DEFAULT_THRESHOLD = 0.05
 
 
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 2f203123ec5..8b4780af987 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -24,13 +24,16 @@
 import sys
 from tempfile import mkdtemp, TemporaryDirectory
 
-from .benchmark.compare import RunnerComparator
+from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD
 from .benchmark.runner import CppBenchmarkRunner
 from .lang.cpp import CppCMakeDefinition, CppConfiguration
 from .utils.codec import JsonEncoder
 from .utils.logger import logger, ctx as log_ctx
 from .utils.source import ArrowSources
 
+# Set default logging to INFO in command line.
+logging.basicConfig(level=logging.INFO)
+
 
 @click.group()
 @click.option("--debug", type=bool, is_flag=True, default=False,
@@ -177,7 +180,8 @@ def benchmark(ctx):
               help="Regex filtering benchmark suites.")
 @click.option("--preserve", type=bool, default=False, show_default=True,
               is_flag=True, help="Preserve workspace for investigation.")
-@click.option("--threshold", type=float, default=0.05, show_default=True,
+@click.option("--threshold", type=float, default=DEFAULT_THRESHOLD,
+              show_default=True,
               help="Regression failure threshold in percentage.")
 @click.option("--cmake-extras", type=str, multiple=True,
               help="Extra flags/options to pass to cmake invocation. "
diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py
index 0a4a99edb2f..9d0feda88e6 100644
--- a/dev/archery/archery/utils/logger.py
+++ b/dev/archery/archery/utils/logger.py
@@ -18,7 +18,6 @@
 import logging
 
 """ Global logger. """
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("archery")
 
 

From a047ae4ed4ae884b8c8e9721ba4ab4d389130166 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= <fsaintjacques@gmail.com>
Date: Thu, 25 Apr 2019 10:06:05 -0400
Subject: [PATCH 31/31] Satisfy flake8

---
 dev/archery/archery/benchmark/runner.py | 2 +-
 dev/archery/archery/cli.py              | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py
index 22d1621ab7d..7dc56bdf096 100644
--- a/dev/archery/archery/benchmark/runner.py
+++ b/dev/archery/archery/benchmark/runner.py
@@ -98,7 +98,7 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf):
         the given revision and create a fresh CMakeBuild.
         """
         build = None
-        if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path):
+        if CMakeBuild.is_build_dir(rev_or_path):
             build = CMakeBuild.from_path(rev_or_path)
         else:
             root_rev = os.path.join(root, rev_or_path)
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 8b4780af987..4fa88966080 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -20,7 +20,6 @@
 from contextlib import contextmanager
 import json
 import logging
-import os
 import sys
 from tempfile import mkdtemp, TemporaryDirectory