From 712d2ed3ca2c4b31bd6f095ebe9a1072a8901bf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 2 Apr 2019 12:12:38 -0400 Subject: [PATCH 01/31] initial commit --- dev/archery/archery/__init__.py | 0 dev/archery/archery/benchmark.py | 34 ++++++ dev/archery/archery/cli.py | 148 +++++++++++++++++++++++ dev/archery/archery/lang/cpp.py | 111 ++++++++++++++++++ dev/archery/archery/utils/cmake.py | 168 +++++++++++++++++++++++++++ dev/archery/archery/utils/command.py | 44 +++++++ dev/archery/archery/utils/git.py | 51 ++++++++ dev/archery/archery/utils/logger.py | 21 ++++ 8 files changed, 577 insertions(+) create mode 100644 dev/archery/archery/__init__.py create mode 100644 dev/archery/archery/benchmark.py create mode 100644 dev/archery/archery/cli.py create mode 100644 dev/archery/archery/lang/cpp.py create mode 100644 dev/archery/archery/utils/cmake.py create mode 100644 dev/archery/archery/utils/command.py create mode 100644 dev/archery/archery/utils/git.py create mode 100644 dev/archery/archery/utils/logger.py diff --git a/dev/archery/archery/__init__.py b/dev/archery/archery/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dev/archery/archery/benchmark.py b/dev/archery/archery/benchmark.py new file mode 100644 index 00000000000..a901c640922 --- /dev/null +++ b/dev/archery/archery/benchmark.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from .utils.git import GitClone, GitCheckout + + +class BenchmarkSuite: + def __init__(self, git_repo, root, revision): + self.root = root + self.clone_dir = os.path.join(root, "arrow-src") + self.revision = revision + self.git_repo = git_repo + self.setup() + + def setup(self): + # Prepare local clone and checkout the proper revision. + GitClone("--local", self.git_repo, self.clone_dir) + GitCheckout("-b", self.revision, git_dir=self.clone_dir) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py new file mode 100644 index 00000000000..60a84de243b --- /dev/null +++ b/dev/archery/archery/cli.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import click +import logging +import os +import tempfile + +from .lang.cpp import CppCMakeDefinition, CppConfiguration, CppBenchmarkSuite +from .utils.git import GitClone, GitCheckout +from .utils.logger import logger + + +@click.group(name="archery") +@click.option("--debug", count=True) +def cli(debug): + if debug: + logger.setLevel(logging.DEBUG) + + +def contains_arrow_sources(path): + cpp_path = os.path.join(path, "cpp") + cmake_path = os.path.join(cpp_path, "CMakeLists.txt") + return os.path.exists(cmake_path): + +def validate_arrow_sources(ctx, param, path): + """ Ensure a directory contains Arrow cpp sources. """ + if not contains_arrow_sources(path): + raise click.BadParameter(f"No Arrow C++ sources found in {path}.") + return path + + +def resolve_arrow_sources(): + """ Infer Arrow sources directory from various method. + + The following guesses are done in order: + + 1. Checks if the environment variable `ARROW_SRC` is defined and use this. + No validation is performed. + + 2. Checks if the current working directory (cwd) contains is an Arrow + source directory. If successful return this. + + 3. Checks if this file (cli.py) is still in the original source repository. + If so, returns the relative path to the source directory. + """ + # Explicit via environment variable + arrow_env = os.environ.get("ARROW_SRC") + if arrow_env: + return arrow_env + + # Implicit via cwd + arrow_cwd = os.getcwd() + if contains_arrow_sources(arrow_cwd): + return arrow_cwd + + # Implicit via archery ran from sources, find relative sources from + this_dir = os.path.dirname(os.path.realpath(__file__)) + arrow_via_archery = os.path.join(this_dir, "..", "..", "..") + if contains_arrow_sources(arrow_via_archery) + return arrow_via_archery + + return None + + +source_dir_type = click.Path(exists=True, dir_okay=True, file_okay=False, + readable=True, resolve_path=True) +build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True) +# Supported build types +build_type = click.Choice(["debug", "relwithdebinfo", "release"], + case_sensitive=False) +# Supported warn levels +warn_level_type = click.Choice(["everything", "checkin", "production"], + case_sensitive=False) + + +@cli.command() +# toolchain +@click.option("--cc", help="C compiler.") +@click.option("--cxx", help="C++ compiler.") +@click.option("--cxx_flags", help="C++ compiler flags.") +@click.option("--build-type", default="release", type=build_type, + help="CMake's CMAKE_BUILD_TYPE") +@click.option("--warn-level", default="production", type=warn_level_type, + help="Controls compiler warnings -W(no-)error.") +# components +@click.option("--with-tests", default=True, type=bool, + help="Build with tests.") +@click.option("--with-benchmarks", default=False, type=bool, + help="Build with benchmarks.") +@click.option("--with-python", default=True, type=bool, + help="Build with python extension.") +@click.option("--with-parquet", default=False, type=bool, + help="Build with parquet file support.") +@click.option("--with-gandiva", default=False, type=bool, + help="Build with Gandiva expression compiler support.") +@click.option("--with-plasma", default=False, type=bool, + help="Build with Plasma object store support.") +@click.option("--with-flight", default=False, type=bool, + help="Build with Flight rpc support.") +# misc +@click.option("-f", "--force", help="Delete existing build directory if found.") +@click.option("--build-and-test", type=bool, default=False, help="") +@click.argument("build_dir", type=build_dir_type) +@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources, + callback=validate_arrow_sources, required=False) +def build(src_dir, build_dir, force, build_and_test, **kwargs): + # Arrow's cpp cmake configuration + conf = CppConfiguration(**kwargs) + # This is a closure around cmake invocation, e.g. calling `def.build()` + # yields a directory ready to be run with the generator + cpp_path = os.path.join(src_dir, "cpp") + cmake_def = CppCMakeDefinition(cpp_path, conf) + # Create build directory + build = cmake_def.build(build_dir, force=force) + if build_and_test: + build.all() + + +@cli.command() +@click.argument("rev_a") +@click.argument("rev_b") +@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources, + callback=validate_arrow_sources, required=False) +def benchmark(rev_a, rev_b, src_dir): + with tempfile.TemporaryDirectory() as tmp_dir: + root_a = os.path.join(tmp_dir, rev_a) + bench_a = CppBenchmarkSuite(src_dir, root_a, rev_a) + + root_b = os.path.join(tmp_dir, rev_b) + bench_b = CppBenchmarkSuite(src_dir, root_b, rev_b) + +if __name__ == "__main__": + cli() diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py new file mode 100644 index 00000000000..a4c458911eb --- /dev/null +++ b/dev/archery/archery/lang/cpp.py @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from ..benchmark import BenchmarkSuite +from ..utils.cmake import CMakeDefinition + + +def thrutifier(value): + return "ON" if value else "OFF" + + +def or_else(value, default): + return value if value else default + + +class CppConfiguration: + def __init__(self, + # toolchain + cc=None, cxx=None, cxx_flags=None, + build_type=None, warn_level=None, + # components + with_tests=True, with_benchmarks=False, with_python=True, + with_parquet=False, with_gandiva=False, with_plasma=False, + with_flight=False): + self.cc = cc + self.cxx = cxx + self.cxx_flags = cxx_flags + + self.build_type = build_type + self.warn_level = warn_level + + self.with_tests = with_tests + self.with_benchmarks = with_benchmarks + self.with_python = with_python + self.with_parquet = with_parquet + self.with_gandiva = with_gandiva + self.with_plasma = with_plasma + self.with_flight = with_flight + + def _gen_defs(self): + if self.cxx_flags: + yield ("ARROW_CXXFLAGS", self.cxx_flags) + + yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug")) + yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) + + yield ("ARROW_BUILD_TESTS", thrutifier(self.with_tests)) + yield ("ARROW_BUILD_BENCHMARKS", thrutifier(self.with_benchmarks)) + + yield ("ARROW_PYTHON", thrutifier(self.with_python)) + yield ("ARROW_PARQUET", thrutifier(self.with_parquet)) + yield ("ARROW_GANDIVA", thrutifier(self.with_gandiva)) + yield ("ARROW_PLASMA", thrutifier(self.with_plasma)) + yield ("ARROW_FLIGHT", thrutifier(self.with_flight)) + + @property + def definitions(self): + return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()] + + @property + def environment(self): + env = os.environ.copy() + + if self.cc: + env["CC"] = self.cc + + if self.cxx: + env["CXX"] = self.cxx + + return env + + +class CppCMakeDefinition(CMakeDefinition): + def __init__(self, source, conf, **kwargs): + self.configuration = conf + super().__init__(source, **kwargs, + definitions=conf.definitions, env=conf.environment) + + +default_benchmark_conf = CppConfiguration( + build_type="release", with_tests=True, with_benchmarks=True) + + +class CppBenchmarkSuite(BenchmarkSuite): + def __init__(self, *args, conf=None, **kwargs): + super().__init__(*args, **kwargs) + + self.conf = conf if conf else default_benchmark_conf + assert self.conf.build_type is not "debug" + self.build_dir = os.path.join(self.root, "build") + self.cmake_def = CppCMakeDefinition( + os.path.join(self.clone_dir, "cpp"), self.conf) + self.build = self.cmake_def.build(self.build_dir) + # Build sources + self.build.all() diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py new file mode 100644 index 00000000000..920d21ee6ed --- /dev/null +++ b/dev/archery/archery/utils/cmake.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +from shutil import rmtree, which + +from .command import Command + + +class CMake(Command): + def __init__(self, cmake_bin=None): + self.bin = cmake_bin if cmake_bin else os.environ.get("CMAKE", "cmake") + + @staticmethod + def default_generator(): + """ Infer default generator. + + Gives precedence to ninja if there exists an executable named `ninja` + in the search path. + """ + found_ninja = which("ninja") + return "Ninja" if found_ninja else "Make" + + +cmake = CMake() + + +class CMakeDefinition: + """ CMakeDefinition captures the cmake invocation arguments. + + It allows creating build directories with the same definition, e.g. + ``` + build_1 = cmake_def.build("/tmp/build-1") + build_2 = cmake_def.build("/tmp/build-2") + + ... + + build1.all() + build2.all() + """ + + def __init__(self, source, build_type="release", generator=None, + definitions=None, env=None): + """ Initialize a CMakeDefinition + + Parameters + ---------- + source : str + Source directory where the top-level CMakeLists.txt is + located. This is usually the root of the project. + generator : str, optional + definitions: list(str), optional + env : dict(str,str), optional + Environment to use when invoking cmake. This can be required to + work around cmake deficiencies, e.g. CC and CXX. + """ + self.source = os.path.abspath(source) + self.generator = generator if generator else cmake.default_generator() + self.definitions = definitions if definitions else [] + self.env = env + + @property + def arguments(self): + """" Return the arguments to cmake invocation. """ + arguments = [ + f"-G{self.generator}", + ] + self.definitions + [ + self.source + ] + return arguments + + def build(self, build_dir, force=False, **kwargs): + """ Invoke cmake into a build directory. + + Parameters + ---------- + build_dir : str + Directory in which the CMake build will be instanciated. + force : bool + If the build folder exists, delete it before. Otherwise if it's + present, an error will be returned. + """ + if os.path.exists(build_dir): + # Extra safety to ensure we're deleting a build folder. + if not CMakeBuild.is_build_dir(build_dir): + raise FileExistsError(f"{build_dir} exists but is not a cmake build, delete manually.") + if not force: + raise FileExistsError(f"{build_dir} exists pass force to delete") + rmtree(build_dir) + + os.mkdir(build_dir) + + cmake(*self.arguments, cwd=build_dir, env=self.env) + return CMakeBuild(self, build_dir, **kwargs) + + def __repr__(self): + return f"CMakeDefinition[source={self.source}]" + + +class CMakeBuild(Command): + """ CMakeBuild represents a build directory initialized by cmake. + + The build instance can be used to build/test/install. It alleviates the + user to know which generator is used. + """ + + def __init__(self, definition, build_dir): + """ Initialize a CMakeBuild. + + The caller must ensure that cmake was invoked in the build directory. + + Parameters + ---------- + definition : CMakeDefinition + The definition to build from. + build_dir : str + The build directory to setup into. + """ + self.definition = definition + self.build_dir = os.path.abspath(build_dir) + + @property + def bin(self): + return self.definition.generator.lower() + + def run(self, *argv, verbose=False, **kwargs): + extra = [] + if verbose: + is_make = self.bin is "make" + extra.append("VERBOSE=1" if is_make else "-v") + # Commands must be ran under the build directory + super().run(*extra, *argv, **kwargs, cwd=self.build_dir) + return self + + def all(self): + return self.run("all") + + def clean(self): + return self.run("clean") + + def install(self): + return self.run("install") + + def test(self): + return self.run("test") + + @staticmethod + def is_build_dir(path): + cmake_cache = os.path.join(path, "CMakeCache.txt") + cmake_files = os.path.join(path, "CMakeFiles") + return os.path.exists(cmake_cache) and os.path.exists(cmake_files) + + def __repr__(self): + return f"CMakeBuild[build={self.build_dir},definition={self.definition}]" diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py new file mode 100644 index 00000000000..214de92b7ad --- /dev/null +++ b/dev/archery/archery/utils/command.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import shutil +import subprocess + +from .logger import logger + + +def find_exec(executable): + if os.path.exists(executable): + return executable + + return shutil.which(executable) + + +class Command: + def bin(self): + raise NotImplementedError("Command must implement bin() method") + + def run(self, *argv, **kwargs): + invocation = [find_exec(self.bin)] + invocation.extend(argv) + + logger.debug(f"Executing `{' '.join(invocation)}`") + return subprocess.run(invocation, **kwargs) + + def __call__(self, *argv, **kwargs): + self.run(*argv, **kwargs) diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py new file mode 100644 index 00000000000..4e0e8fd74d0 --- /dev/null +++ b/dev/archery/archery/utils/git.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from .command import Command + + +class Git(Command): + def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs): + self.bin = git_bin if git_bin else os.environ.get("GIT", "git") + self.git_dir = git_dir + self.run(*argv, **kwargs) + + @property + def global_opts(self): + if self.git_dir: + # For some reason, git does not accept `-Cdir` + yield "-C" + yield self.git_dir + + +class GitSubCommand(Git): + def run(self, *argv, **kwargs): + super().run(*self.global_opts, self.cmd, *argv, **kwargs) + + +class GitClone(GitSubCommand): + @property + def cmd(self): + return "clone" + + +class GitCheckout(GitSubCommand): + @property + def cmd(self): + return "checkout" diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py new file mode 100644 index 00000000000..ab97204decb --- /dev/null +++ b/dev/archery/archery/utils/logger.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("archery") From a5ad76d111eaec7ae6c3be7553d4bf1bffa440b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 2 Apr 2019 13:33:15 -0400 Subject: [PATCH 02/31] Fix syntax --- dev/archery/archery/cli.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 60a84de243b..90ba14478cf 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -35,7 +35,8 @@ def cli(debug): def contains_arrow_sources(path): cpp_path = os.path.join(path, "cpp") cmake_path = os.path.join(cpp_path, "CMakeLists.txt") - return os.path.exists(cmake_path): + return os.path.exists(cmake_path) + def validate_arrow_sources(ctx, param, path): """ Ensure a directory contains Arrow cpp sources. """ @@ -71,7 +72,7 @@ def resolve_arrow_sources(): # Implicit via archery ran from sources, find relative sources from this_dir = os.path.dirname(os.path.realpath(__file__)) arrow_via_archery = os.path.join(this_dir, "..", "..", "..") - if contains_arrow_sources(arrow_via_archery) + if contains_arrow_sources(arrow_via_archery): return arrow_via_archery return None @@ -113,22 +114,26 @@ def resolve_arrow_sources(): @click.option("--with-flight", default=False, type=bool, help="Build with Flight rpc support.") # misc -@click.option("-f", "--force", help="Delete existing build directory if found.") -@click.option("--build-and-test", type=bool, default=False, help="") +@click.option("-f", "--force", + help="Delete existing build directory if found.") +@click.option("--targets", type=str, multiple=True, + help="Generator targets to run") @click.argument("build_dir", type=build_dir_type) @click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources, callback=validate_arrow_sources, required=False) -def build(src_dir, build_dir, force, build_and_test, **kwargs): +def build(src_dir, build_dir, force, targets, **kwargs): + src_cpp = os.path.join(src_dir, "cpp") + # Arrow's cpp cmake configuration conf = CppConfiguration(**kwargs) # This is a closure around cmake invocation, e.g. calling `def.build()` # yields a directory ready to be run with the generator - cpp_path = os.path.join(src_dir, "cpp") - cmake_def = CppCMakeDefinition(cpp_path, conf) + cmake_def = CppCMakeDefinition(src_cpp, conf) # Create build directory build = cmake_def.build(build_dir, force=force) - if build_and_test: - build.all() + + for target in targets: + build.run(target) @cli.command() @@ -144,5 +149,6 @@ def benchmark(rev_a, rev_b, src_dir): root_b = os.path.join(tmp_dir, rev_b) bench_b = CppBenchmarkSuite(src_dir, root_b, rev_b) + if __name__ == "__main__": cli() From a38f49cd92c6d8e36b85fc6316d311b7d325496a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Fri, 5 Apr 2019 11:15:37 -0400 Subject: [PATCH 03/31] checkpoint --- dev/archery/archery/benchmark.py | 34 --- dev/archery/archery/benchmark/__init__.py | 0 dev/archery/archery/benchmark/core.py | 80 +++++++ dev/archery/archery/benchmark/google.py | 269 ++++++++++++++++++++++ dev/archery/archery/benchmark/runner.py | 80 +++++++ dev/archery/archery/cli.py | 163 +++++++------ dev/archery/archery/lang/cpp.py | 24 +- dev/archery/archery/utils/cmake.py | 28 ++- dev/archery/archery/utils/command.py | 10 +- dev/archery/archery/utils/git.py | 4 +- dev/archery/archery/utils/source.py | 128 ++++++++++ 11 files changed, 682 insertions(+), 138 deletions(-) delete mode 100644 dev/archery/archery/benchmark.py create mode 100644 dev/archery/archery/benchmark/__init__.py create mode 100644 dev/archery/archery/benchmark/core.py create mode 100644 dev/archery/archery/benchmark/google.py create mode 100644 dev/archery/archery/benchmark/runner.py create mode 100644 dev/archery/archery/utils/source.py diff --git a/dev/archery/archery/benchmark.py b/dev/archery/archery/benchmark.py deleted file mode 100644 index a901c640922..00000000000 --- a/dev/archery/archery/benchmark.py +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -from .utils.git import GitClone, GitCheckout - - -class BenchmarkSuite: - def __init__(self, git_repo, root, revision): - self.root = root - self.clone_dir = os.path.join(root, "arrow-src") - self.revision = revision - self.git_repo = git_repo - self.setup() - - def setup(self): - # Prepare local clone and checkout the proper revision. - GitClone("--local", self.git_repo, self.clone_dir) - GitCheckout("-b", self.revision, git_dir=self.clone_dir) diff --git a/dev/archery/archery/benchmark/__init__.py b/dev/archery/archery/benchmark/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py new file mode 100644 index 00000000000..ccd2630d444 --- /dev/null +++ b/dev/archery/archery/benchmark/core.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +class Statistics: + def __init__(self, min, max, mean, stddev, q1, median, q3): + self.min = min + self.max = max + self.mean = mean + self.stddev = stddev + self.q1 = q1 + self.median = median + self.q3 = q3 + + @staticmethod + def from_values(values): + return None + + +class Benchmark: + def __init__(self, name, values, stats=None): + self.name = name + self.values = values + self.statistics = stats if stats else Statistics.from_values(values) + self.less_is_better = less_is_better + + @property + def value(self): + return self.statistics.median + + +def changes(old, new): + if old == 0 and new == 0: + return 0.0 + if old == 0: + return float(new - old) / (float(old + new) / 2) + return float(new - old) / abs(old) + + +class BenchmarkSuite: + def __init__(self, name, benchmarks): + self.name = name + self.benchmarks = benchmarks + + +def default_comparator(old, new, threshold=0.05): + # negative change is better, positive is tolerable until it exceeds + # threshold + return changes(old, new) < threshold + + +class BenchmarkComparator: + def __init__(self, baseline, contender): + self.baseline = baseline + self.contender = contender + + def compare(self, comparator=None): + comparator = comparator if comparator else default_comparator + return comparator(self.baseline.value, self.contender.value) + + def confidence(self): + """ Indicate if a comparison of benchmarks should be trusted. """ + return True + + def __call__(self, **kwargs): + return self.compare(**kwargs) diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py new file mode 100644 index 00000000000..15a65f819ec --- /dev/null +++ b/dev/archery/archery/benchmark/google.py @@ -0,0 +1,269 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from functools import cmp_to_key, lru_cache +from itertools import filterfalse, groupby, tee +import json +import six +from socket import gethostname +import subprocess +import sys +from uuid import getnode + +from ..utils.command import Command + + +def memoize(fn): + return lru_cache(maxsize=1)(fn) + + +def partition(pred, iterable): + # adapted from python's examples + t1, t2 = tee(iterable) + return list(filter(pred, t1)), list(filterfalse(pred, t2)) + + +# Taken from merge_arrow_pr.py +def run_cmd(cmd): + if isinstance(cmd, six.string_types): + cmd = cmd.split(' ') + + try: + output = subprocess.check_output(cmd) + except subprocess.CalledProcessError as e: + # this avoids hiding the stdout / stderr of failed processes + print('Command failed: %s' % cmd) + print('With output:') + print('--------------') + print(e.output) + print('--------------') + raise e + + if isinstance(output, six.binary_type): + output = output.decode('utf-8') + return output.rstrip() + + +class GoogleBenchmarkCommand(Command): + def __init__(self, benchmark_bin): + self.bin = benchmark_bin + + def list_benchmarks(self, benchmark_filter=None): + argv = ["--benchmark_list_tests"] + if benchmark_filter: + argv.append(f"--benchmark_filter={benchmark_filter}") + result = self.run(*argv, stdout=subprocess.PIPE) + return [b for b in result.stdout] + + +class Context: + """ Represents the runtime environment """ + + def __init__(self, date=None, executable=None, **kwargs): + self.date = date + self.executable = executable + + @property + def host(self): + host = { + "hostname": gethostname(), + # Not sure if we should leak this. + "mac_address": getnode(), + } + return host + + @property + def git(self): + head = run_cmd("git rev-parse HEAD") + # %ai: author date, ISO 8601-like format + fmt = "%ai" + timestamp = run_cmd(f"git log -1 --pretty='{fmt}' {head}") + branch = run_cmd("git rev-parse --abbrev-ref HEAD") + git_info = { + "git_commit_timestamp": timestamp, + "git_hash": head, + "git_branch": branch, + } + return git_info + + @property + def toolchain(self): + # TODO parse local CMake generated info to extract compile flags and + # arrow features + deps = {} + toolchain = { + "language_implementation_version": "c++11", + "dependencies": deps, + } + return toolchain + + def as_arrow(self): + ctx = { + "benchmark_language": "C++", + "run_timestamp": self.date, + } + + for extra in (self.host, self.git, self.toolchain): + ctx.update(extra) + + return ctx + + @classmethod + def from_json(cls, version, payload): + return cls(**payload) + + +class GoogleBenchmarkObservation: + """ Represents one run of a single benchmark. """ + + def __init__(self, version, **kwargs): + self._name = kwargs.get("name") + self.version = version + self.real_time = kwargs.get("real_time") + self.cpu_time = kwargs.get("cpu_time") + self.time_unit = kwargs.get("time_unit") + self.size = kwargs.get("size") + self.bytes_per_second = kwargs.get("bytes_per_second") + + @property + def is_mean(self): + return self._name.endswith("_mean") + + @property + def is_median(self): + return self._name.endswith("_median") + + @property + def is_stddev(self): + return self._name.endswith("_stddev") + + @property + def is_agg(self): + return self.is_mean or self.is_median or self.is_stddev + + @property + def is_realtime(self): + return self.name.find("/realtime") != -1 + + @property + @memoize + def name(self): + name = self._name + return name.rsplit("_", maxsplit=1)[0] if self.is_agg else name + + @property + def value(self): + """ Return the benchmark value.""" + if self.size: + return self.bytes_per_second + return self.real_time if self.is_realtime else self.cpu_time + + @property + def unit(self): + if self.size: + return "bytes_per_second" + return self.time_unit + + def __str__(self): + return f"BenchmarkObservation[name={self.name}]" + + +class BenchmarkSuite: + def __init__(self, name, version, runs): + self.name = name + self.version = version + # exclude google benchmark aggregate artifacts + aggs, runs = partition(lambda b: b.is_agg, runs) + self.runs = sorted(runs, key=lambda b: b.value) + self.values = [b.value for b in self.runs] + self.aggregates = aggs + + @property + def n_runs(self): + return len(self.runs) + + @property + @memoize + def mean(self): + maybe_mean = [b for b in self.aggregates if b.is_mean] + if maybe_mean: + return maybe_mean[0].value + # fallback + return sum(self.values) / self.n_runs + + @property + @memoize + def median(self): + maybe_median = [b for b in self.aggregates if b.is_median] + if maybe_median: + return maybe_median[0].value + # fallback + return self.runs[int(self.n_runs / 2)].value + + @property + @memoize + def stddev(self): + maybe_stddev = [b for b in self.aggregates if b.is_stddev] + if maybe_stddev: + return maybe_stddev[0].value + + sum_diff = sum([(val - self.mean)**2 for val in self.values]) + return (sum_diff / (self.n_runs - 1))**0.5 if self.n_runs > 1 else 0.0 + + @property + def min(self): + return self.values[0] + + @property + def max(self): + return self.values[-1] + + def quartile(self, q): + return self.values[int(q * self.n_runs / 4)] + + @property + def q1(self): + return self.quartile(1) + + @property + def q3(self): + return self.quartile(3) + + @property + def parameters(self): + """ Extract parameters from Benchmark's name""" + def parse_param(idx, param): + k_v = param.split(":") + # nameless parameter are transformed into positional names + name = k_v[0] if len(k_v) > 1 else f"arg{idx}" + return name, k_v[-1] + + params = enumerate(self.name.split("/")[1:]) + named_params = [parse_param(idx, p) for idx, p in params if p] + return {k: v for k, v in named_params} + + def __str__(self): + return f"BenchmarkSuite[name={self.name},runs={len(self.runs)}]" + + @classmethod + def from_json(cls, version, payload): + def group_key(x): + return x.name + + benchmarks = map(lambda x: BenchmarkObservation(version, **x), payload) + groups = groupby(sorted(benchmarks, key=group_key), group_key) + return [cls(k, version, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py new file mode 100644 index 00000000000..5fb1f19086a --- /dev/null +++ b/dev/archery/archery/benchmark/runner.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import glob +import os +import re +import subprocess + +from .core import Benchmark, BenchmarkSuite +from .google import GoogleBenchmarkCommand +from ..utils.logger import logger + + +def regex_filter(re_expr): + if re_expr is None: + return lambda s: True + + re_comp = re.compile(re_expr) + return lambda s: re_comp.search(s) + + +class BenchmarkRunner: + def suites(self, suite_filter=None, benchmark_filter=None): + raise NotImplementedError("BenchmarkRunner must implement suites") + + +class CppBenchmarkRunner(BenchmarkRunner): + def __init__(self, build): + """ Initialize a CppBenchmarkRunner. """ + self.build = build + # Ensure build is ready to run benchmarks + self.build.all() + + @property + def suites_binaries(self): + """ Returns a list of benchmark binaries for this build. """ + glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark") + return {os.path.basename(b): b for b in glob.glob(glob_expr)} + + def benchmarks(self, suite_bin, benchmark_filter): + """ Returns the resulting benchmarks for a given suite. """ + suite_cmd = GoogleBenchmarkCommand(suite_bin) + benchmark_names = suite_cmd.list_benchmarks(benchmark_filter) + + if not benchmark_names: + return [] + + return benchmark_names + + def suites(self, suite_filter=None, benchmark_filter=None): + suite_matcher = regex_filter(suite_filter) + + suite_and_binaries = self.suites_binaries + for suite_name in suite_and_binaries: + if suite_matcher(suite_name): + suite_bin = suite_and_binaries[suite_name] + benchmarks = self.benchmarks(suite_bin, + benchmark_filter=benchmark_filter) + + # Filter may exclude all benchmarks + if not benchmarks: + continue + + yield BenchmarkSuite(suite_name, benchmarks) + else: + logger.debug(f"Ignoring suite {suite_name}") diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 90ba14478cf..426673fee7f 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,70 +17,37 @@ # under the License. import click +from contextlib import contextmanager import logging import os -import tempfile +import re +from tempfile import mkdtemp, TemporaryDirectory -from .lang.cpp import CppCMakeDefinition, CppConfiguration, CppBenchmarkSuite -from .utils.git import GitClone, GitCheckout +# from .benchmark import GoogleCompareBenchmark +from .benchmark.runner import CppBenchmarkRunner +from .lang.cpp import CppCMakeDefinition, CppConfiguration +from .utils.git import Git from .utils.logger import logger +from .utils.source import ArrowSources -@click.group(name="archery") +@click.group() @click.option("--debug", count=True) -def cli(debug): +def archery(debug): + """ Apache Arrow developer utilities. """ if debug: logger.setLevel(logging.DEBUG) -def contains_arrow_sources(path): - cpp_path = os.path.join(path, "cpp") - cmake_path = os.path.join(cpp_path, "CMakeLists.txt") - return os.path.exists(cmake_path) - - -def validate_arrow_sources(ctx, param, path): +def validate_arrow_sources(ctx, param, src): """ Ensure a directory contains Arrow cpp sources. """ - if not contains_arrow_sources(path): - raise click.BadParameter(f"No Arrow C++ sources found in {path}.") - return path - - -def resolve_arrow_sources(): - """ Infer Arrow sources directory from various method. + if isinstance(src, str): + if not ArrowSources.valid(src): + raise click.BadParameter(f"No Arrow C++ sources found in {src}.") + src = ArrowSources(src) + return src - The following guesses are done in order: - 1. Checks if the environment variable `ARROW_SRC` is defined and use this. - No validation is performed. - - 2. Checks if the current working directory (cwd) contains is an Arrow - source directory. If successful return this. - - 3. Checks if this file (cli.py) is still in the original source repository. - If so, returns the relative path to the source directory. - """ - # Explicit via environment variable - arrow_env = os.environ.get("ARROW_SRC") - if arrow_env: - return arrow_env - - # Implicit via cwd - arrow_cwd = os.getcwd() - if contains_arrow_sources(arrow_cwd): - return arrow_cwd - - # Implicit via archery ran from sources, find relative sources from - this_dir = os.path.dirname(os.path.realpath(__file__)) - arrow_via_archery = os.path.join(this_dir, "..", "..", "..") - if contains_arrow_sources(arrow_via_archery): - return arrow_via_archery - - return None - - -source_dir_type = click.Path(exists=True, dir_okay=True, file_okay=False, - readable=True, resolve_path=True) build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True) # Supported build types build_type = click.Choice(["debug", "relwithdebinfo", "release"], @@ -89,7 +57,10 @@ def resolve_arrow_sources(): case_sensitive=False) -@cli.command() +@archery.command(short_help="Initialize an Arrow C++ build") +@click.option("--src", default=ArrowSources.find(), + callback=validate_arrow_sources, + help="Specify Arrow source directory") # toolchain @click.option("--cc", help="C compiler.") @click.option("--cxx", help="C++ compiler.") @@ -114,21 +85,18 @@ def resolve_arrow_sources(): @click.option("--with-flight", default=False, type=bool, help="Build with Flight rpc support.") # misc -@click.option("-f", "--force", +@click.option("-f", "--force", type=bool, is_flag=True, default=False, help="Delete existing build directory if found.") @click.option("--targets", type=str, multiple=True, help="Generator targets to run") @click.argument("build_dir", type=build_dir_type) -@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources, - callback=validate_arrow_sources, required=False) -def build(src_dir, build_dir, force, targets, **kwargs): - src_cpp = os.path.join(src_dir, "cpp") - +def build(src, build_dir, force, targets, **kwargs): + """ Build. """ # Arrow's cpp cmake configuration conf = CppConfiguration(**kwargs) # This is a closure around cmake invocation, e.g. calling `def.build()` # yields a directory ready to be run with the generator - cmake_def = CppCMakeDefinition(src_cpp, conf) + cmake_def = CppCMakeDefinition(src.cpp, conf) # Create build directory build = cmake_def.build(build_dir, force=force) @@ -136,19 +104,74 @@ def build(src_dir, build_dir, force, targets, **kwargs): build.run(target) -@cli.command() -@click.argument("rev_a") -@click.argument("rev_b") -@click.argument("src_dir", type=source_dir_type, default=resolve_arrow_sources, - callback=validate_arrow_sources, required=False) -def benchmark(rev_a, rev_b, src_dir): - with tempfile.TemporaryDirectory() as tmp_dir: - root_a = os.path.join(tmp_dir, rev_a) - bench_a = CppBenchmarkSuite(src_dir, root_a, rev_a) +@contextmanager +def tmpdir(preserve, prefix="arrow-bench-"): + if preserve: + yield mkdtemp(prefix=prefix) + else: + with TemporaryDirectory(prefix=prefix) as tmp: + yield tmp + + +DEFAULT_BENCHMARK_CONF = CppConfiguration( + build_type="release", with_tests=True, with_benchmarks=True) + + +def arrow_cpp_benchmark_runner(src, root, rev): + root_rev = os.path.join(root, rev) + os.mkdir(root_rev) + + root_src = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision + src_rev = src if rev == Git.WORKSPACE else src.at_revision(rev, root_src) + + # TODO: find a way to pass custom configuration without cluttering the cli + # Ideally via a configuration file that can be shared with the + # `build` sub-command. + cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF) + build = cmake_def.build(os.path.join(root_rev, "build")) + return CppBenchmarkRunner(build) + + +DEFAULT_BENCHMARK_FILTER = "^Regression" + + +@archery.command(short_help="Run the C++ benchmark suite") +@click.option("--src", default=ArrowSources.find(), + callback=validate_arrow_sources, + help="Specify Arrow source directory") +@click.option("--preserve", type=bool, default=False, is_flag=True, + help="Preserve temporary workspace directory for investigation.") +@click.option("--suite-filter", type=str, default=None, + help="Regex filtering benchmark suites.") +@click.option("--benchmark-filter", type=str, default=DEFAULT_BENCHMARK_FILTER, + help="Regex filtering benchmark suites.") +@click.argument("contender_rev", default=Git.WORKSPACE, required=False) +@click.argument("baseline_rev", default="master", required=False) +def benchmark(src, preserve, suite_filter, benchmark_filter, + contender_rev, baseline_rev): + """ Benchmark Arrow C++ implementation. + """ + with tmpdir(preserve) as root: + runner_cont = arrow_cpp_benchmark_runner(src, root, contender_rev) + runner_base = arrow_cpp_benchmark_runner(src, root, baseline_rev) + + suites_cont = {s.name: s for s in runner_cont.suites(suite_filter, + benchmark_filter)} + suites_base = {s.name: s for s in runner_base.suites(suite_filter, + benchmark_filter)} + + for name in suites_cont.keys() & suites_base.keys(): + logger.debug(f"Comparing {name}") + contender_bin = suites_cont[name] + baseline_bin = suites_base[name] - root_b = os.path.join(tmp_dir, rev_b) - bench_b = CppBenchmarkSuite(src_dir, root_b, rev_b) + # Note that compare.py requires contender and baseline inverted. + # GoogleCompareBenchmark("benchmarks", "--display_aggregates_only", + # baseline_bin, contender_bin, + # f"--benchmark_filter={benchmark_filter}", + # "--benchmark_repetitions=20") if __name__ == "__main__": - cli() + archery() diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index a4c458911eb..136dadc6b4e 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -15,9 +15,10 @@ # specific language governing permissions and limitations # under the License. +import glob import os -from ..benchmark import BenchmarkSuite +from ..benchmark.runner import BenchmarkRunner from ..utils.cmake import CMakeDefinition @@ -90,22 +91,5 @@ class CppCMakeDefinition(CMakeDefinition): def __init__(self, source, conf, **kwargs): self.configuration = conf super().__init__(source, **kwargs, - definitions=conf.definitions, env=conf.environment) - - -default_benchmark_conf = CppConfiguration( - build_type="release", with_tests=True, with_benchmarks=True) - - -class CppBenchmarkSuite(BenchmarkSuite): - def __init__(self, *args, conf=None, **kwargs): - super().__init__(*args, **kwargs) - - self.conf = conf if conf else default_benchmark_conf - assert self.conf.build_type is not "debug" - self.build_dir = os.path.join(self.root, "build") - self.cmake_def = CppCMakeDefinition( - os.path.join(self.clone_dir, "cpp"), self.conf) - self.build = self.cmake_def.build(self.build_dir) - # Build sources - self.build.all() + definitions=conf.definitions, env=conf.environment, + build_type=conf.build_type) diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py index 920d21ee6ed..b2e7c75b511 100644 --- a/dev/archery/archery/utils/cmake.py +++ b/dev/archery/archery/utils/cmake.py @@ -69,6 +69,7 @@ def __init__(self, source, build_type="release", generator=None, work around cmake deficiencies, e.g. CC and CXX. """ self.source = os.path.abspath(source) + self.build_type = build_type self.generator = generator if generator else cmake.default_generator() self.definitions = definitions if definitions else [] self.env = env @@ -97,15 +98,16 @@ def build(self, build_dir, force=False, **kwargs): if os.path.exists(build_dir): # Extra safety to ensure we're deleting a build folder. if not CMakeBuild.is_build_dir(build_dir): - raise FileExistsError(f"{build_dir} exists but is not a cmake build, delete manually.") + raise FileExistsError(f"{build_dir} is not a cmake build") if not force: - raise FileExistsError(f"{build_dir} exists pass force to delete") + raise FileExistsError(f"{build_dir} exists use force=True") rmtree(build_dir) os.mkdir(build_dir) cmake(*self.arguments, cwd=build_dir, env=self.env) - return CMakeBuild(self, build_dir, **kwargs) + return CMakeBuild(build_dir, self.generator.lower(), self.build_type, + definition=self, **kwargs) def __repr__(self): return f"CMakeDefinition[source={self.source}]" @@ -118,7 +120,7 @@ class CMakeBuild(Command): user to know which generator is used. """ - def __init__(self, definition, build_dir): + def __init__(self, build_dir, generator, build_type, definition=None): """ Initialize a CMakeBuild. The caller must ensure that cmake was invoked in the build directory. @@ -130,18 +132,19 @@ def __init__(self, definition, build_dir): build_dir : str The build directory to setup into. """ - self.definition = definition self.build_dir = os.path.abspath(build_dir) + self.bin = generator + self.build_type = build_type + self.definition = definition @property - def bin(self): - return self.definition.generator.lower() + def binaries_dir(self): + return os.path.join(self.build_dir, self.build_type) def run(self, *argv, verbose=False, **kwargs): extra = [] if verbose: - is_make = self.bin is "make" - extra.append("VERBOSE=1" if is_make else "-v") + extra.append("-v" if self.bin.endswith("ninja") else "VERBOSE=1") # Commands must be ran under the build directory super().run(*extra, *argv, **kwargs, cwd=self.build_dir) return self @@ -165,4 +168,9 @@ def is_build_dir(path): return os.path.exists(cmake_cache) and os.path.exists(cmake_files) def __repr__(self): - return f"CMakeBuild[build={self.build_dir},definition={self.definition}]" + return ("CMakeBuild[" + "build = {}," + "build_type = {}," + "definition = {}]".format(self.build_dir, + self.build_type, + self.definition)) diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index 214de92b7ad..77b2230bd6c 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -33,12 +33,16 @@ class Command: def bin(self): raise NotImplementedError("Command must implement bin() method") - def run(self, *argv, **kwargs): + def run(self, *argv, raise_on_failure=True, **kwargs): invocation = [find_exec(self.bin)] invocation.extend(argv) - logger.debug(f"Executing `{' '.join(invocation)}`") - return subprocess.run(invocation, **kwargs) + logger.debug(f"Executing `{invocation}`") + result = subprocess.run(invocation, **kwargs) + if raise_on_failure: + result.check_returncode() + + return result def __call__(self, *argv, **kwargs): self.run(*argv, **kwargs) diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py index 4e0e8fd74d0..18c52bb07cb 100644 --- a/dev/archery/archery/utils/git.py +++ b/dev/archery/archery/utils/git.py @@ -21,6 +21,8 @@ class Git(Command): + WORKSPACE = "WORKSPACE" + def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs): self.bin = git_bin if git_bin else os.environ.get("GIT", "git") self.git_dir = git_dir @@ -29,7 +31,7 @@ def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs): @property def global_opts(self): if self.git_dir: - # For some reason, git does not accept `-Cdir` + # For some reason, git does not accept `-Cdir` or `-C=dir` yield "-C" yield self.git_dir diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py new file mode 100644 index 00000000000..24e22c552cd --- /dev/null +++ b/dev/archery/archery/utils/source.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os + +from .git import Git, GitCheckout, GitClone + + +class ArrowSources: + """ ArrowSources is a companion class representing a directory containing + Apache Arrow's sources. + """ + + def __init__(self, path): + """ Initialize an ArrowSources + + The caller must ensure that path is valid arrow source directory (can + be checked with ArrowSources.valid + + Parameters + ---------- + path : src + """ + assert isinstance(path, str) and ArrowSources.valid(path) + self.path = path + + @property + def cpp(self): + """ Returns the cpp directory of an Arrow sources. """ + return os.path.join(self.path, "cpp") + + @property + def python(self): + """ Returns the python directory of an Arrow sources. """ + return os.path.join(self.path, "python") + + @property + def git_backed(self): + """ Indicate if the sources are backed by git. """ + git_path = os.path.join(self.path, ".git") + return os.path.exists(git_path) + + def at_revision(self, revision, clone_dir): + """ Return a copy of the current sources for a specified git revision. + + This method may return the current object if no checkout is required. + The caller is responsible to remove the cloned repository directory. + + Parameters + ---------- + revision : str + Revision to checkout sources at. + clone_dir : str + Path to checkout the local clone. + """ + if not self.git_backed: + raise ValueError(f"{self} is not backed by git") + + # A local clone is required to leave the current sources intact such + # that build depending on said sources are not invalidated (or worse + # slightly affected when re-invoking the generator). + GitClone("--local", self.path, clone_dir) + GitCheckout("-b", revision, git_dir=clone_dir) + + return ArrowSources(clone_dir) + + @staticmethod + def valid(src): + """ Indicate if current sources are valid. """ + if isinstance(src, ArrowSources): + return True + if isinstance(src, str): + cpp_path = os.path.join(src, "cpp") + cmake_path = os.path.join(cpp_path, "CMakeLists.txt") + return os.path.exists(cmake_path) + return False + + @staticmethod + def find(path=None): + """ Infer Arrow sources directory from various method. + + The following guesses are done in order until a valid match is found: + + 1. Checks the given optional parameter. + + 2. Checks if the environment variable `ARROW_SRC` is defined and use + this. + + 3. Checks if the current working directory (cwd) contains is an Arrow + source directory. + + 4. Checks if this file (cli.py) is still in the original source + repository. If so, returns the relative path to the source + directory. + """ + + # Explicit via environment + env = os.environ.get("ARROW_SRC") + + # Implicit via cwd + cwd = os.getcwd() + + # Implicit via current file + this_dir = os.path.dirname(os.path.realpath(__file__)) + this = os.path.join(this_dir, "..", "..", "..", "..") + + for p in [path, env, cwd, this]: + if ArrowSources.valid(p): + return ArrowSources(p) + + return None + + def __repr__(self): + return f"ArrowSources[{self.path}]" From 2c0d512f89bccc8f6d0e9e951a834a4154ace4a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Fri, 5 Apr 2019 22:19:56 -0400 Subject: [PATCH 04/31] Checkpoint --- dev/archery/archery/benchmark/core.py | 19 ++- dev/archery/archery/benchmark/google.py | 172 +++++++----------------- dev/archery/archery/benchmark/runner.py | 44 +++--- dev/archery/archery/cli.py | 151 +++++++++++++++------ dev/archery/archery/utils/cmake.py | 26 ++++ dev/archery/archery/utils/command.py | 14 ++ dev/archery/archery/utils/git.py | 65 ++++++--- dev/archery/archery/utils/source.py | 6 +- 8 files changed, 286 insertions(+), 211 deletions(-) diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py index ccd2630d444..751e9773def 100644 --- a/dev/archery/archery/benchmark/core.py +++ b/dev/archery/archery/benchmark/core.py @@ -56,6 +56,11 @@ def __init__(self, name, benchmarks): self.name = name self.benchmarks = benchmarks + def __repr__(self): + name = self.name + benchmarks = self.benchmarks + return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]" + def default_comparator(old, new, threshold=0.05): # negative change is better, positive is tolerable until it exceeds @@ -63,14 +68,22 @@ def default_comparator(old, new, threshold=0.05): return changes(old, new) < threshold +def changes(old, new): + if old == 0 and new == 0: + return 0.0 + if old == 0: + return float(new - old) / (float(old + new) / 2) + return float(new - old) / abs(old) + + class BenchmarkComparator: - def __init__(self, baseline, contender): - self.baseline = baseline + def __init__(self, contender, baseline): self.contender = contender + self.baseline = baseline def compare(self, comparator=None): comparator = comparator if comparator else default_comparator - return comparator(self.baseline.value, self.contender.value) + return changes(self.baseline.value, self.contender.value) def confidence(self): """ Indicate if a comparison of benchmarks should be trusted. """ diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index 15a65f819ec..a330a523f1e 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -15,20 +15,13 @@ # specific language governing permissions and limitations # under the License. -from functools import cmp_to_key, lru_cache from itertools import filterfalse, groupby, tee import json -import six -from socket import gethostname import subprocess -import sys -from uuid import getnode +from .core import Benchmark from ..utils.command import Command - - -def memoize(fn): - return lru_cache(maxsize=1)(fn) +from ..utils.git import git def partition(pred, iterable): @@ -37,40 +30,28 @@ def partition(pred, iterable): return list(filter(pred, t1)), list(filterfalse(pred, t2)) -# Taken from merge_arrow_pr.py -def run_cmd(cmd): - if isinstance(cmd, six.string_types): - cmd = cmd.split(' ') - - try: - output = subprocess.check_output(cmd) - except subprocess.CalledProcessError as e: - # this avoids hiding the stdout / stderr of failed processes - print('Command failed: %s' % cmd) - print('With output:') - print('--------------') - print(e.output) - print('--------------') - raise e - - if isinstance(output, six.binary_type): - output = output.decode('utf-8') - return output.rstrip() - - class GoogleBenchmarkCommand(Command): - def __init__(self, benchmark_bin): + def __init__(self, benchmark_bin, benchmark_filter=None): self.bin = benchmark_bin + self.benchmark_filter = benchmark_filter - def list_benchmarks(self, benchmark_filter=None): + def list_benchmarks(self): argv = ["--benchmark_list_tests"] - if benchmark_filter: - argv.append(f"--benchmark_filter={benchmark_filter}") + if self.benchmark_filter: + argv.append(f"--benchmark_filter={self.benchmark_filter}") result = self.run(*argv, stdout=subprocess.PIPE) return [b for b in result.stdout] + def results(self): + argv = ["--benchmark_format=json", "--benchmark_repetitions=20"] + + if self.benchmark_filter: + argv.append(f"--benchmark_filter={self.benchmark_filter}") + + return json.loads(self.run(*argv, stdout=subprocess.PIPE).stdout) -class Context: + +class GoogleContext: """ Represents the runtime environment """ def __init__(self, date=None, executable=None, **kwargs): @@ -88,11 +69,10 @@ def host(self): @property def git(self): - head = run_cmd("git rev-parse HEAD") + head = git.head() # %ai: author date, ISO 8601-like format - fmt = "%ai" - timestamp = run_cmd(f"git log -1 --pretty='{fmt}' {head}") - branch = run_cmd("git rev-parse --abbrev-ref HEAD") + timestamp = git.log("-1", "--pretty='%ai'", head) + branch = git.current_branch(), git_info = { "git_commit_timestamp": timestamp, "git_hash": head, @@ -123,125 +103,59 @@ def as_arrow(self): return ctx @classmethod - def from_json(cls, version, payload): + def from_json(cls, payload): return cls(**payload) -class GoogleBenchmarkObservation: +class BenchmarkObservation: """ Represents one run of a single benchmark. """ - def __init__(self, version, **kwargs): + def __init__(self, **kwargs): self._name = kwargs.get("name") - self.version = version self.real_time = kwargs.get("real_time") self.cpu_time = kwargs.get("cpu_time") self.time_unit = kwargs.get("time_unit") self.size = kwargs.get("size") self.bytes_per_second = kwargs.get("bytes_per_second") - @property - def is_mean(self): - return self._name.endswith("_mean") - - @property - def is_median(self): - return self._name.endswith("_median") - - @property - def is_stddev(self): - return self._name.endswith("_stddev") - @property def is_agg(self): - return self.is_mean or self.is_median or self.is_stddev + suffixes = ["_mean", "_median", "_stddev"] + return any(map(lambda x: self._name.endswith(x), suffixes)) @property def is_realtime(self): return self.name.find("/realtime") != -1 @property - @memoize def name(self): name = self._name return name.rsplit("_", maxsplit=1)[0] if self.is_agg else name + @property + def time(self): + return self.real_time if self.is_realtime else self.cpu_time + @property def value(self): """ Return the benchmark value.""" - if self.size: - return self.bytes_per_second - return self.real_time if self.is_realtime else self.cpu_time + return self.bytes_per_second if self.size else self.time @property def unit(self): - if self.size: - return "bytes_per_second" - return self.time_unit + return "bytes_per_second" if self.size else self.time_unit - def __str__(self): - return f"BenchmarkObservation[name={self.name}]" + def __repr__(self): + return f"{self.value}" -class BenchmarkSuite: - def __init__(self, name, version, runs): +class GoogleBenchmark(Benchmark): + def __init__(self, name, runs): self.name = name - self.version = version # exclude google benchmark aggregate artifacts - aggs, runs = partition(lambda b: b.is_agg, runs) + _, runs = partition(lambda b: b.is_agg, runs) self.runs = sorted(runs, key=lambda b: b.value) - self.values = [b.value for b in self.runs] - self.aggregates = aggs - - @property - def n_runs(self): - return len(self.runs) - - @property - @memoize - def mean(self): - maybe_mean = [b for b in self.aggregates if b.is_mean] - if maybe_mean: - return maybe_mean[0].value - # fallback - return sum(self.values) / self.n_runs - - @property - @memoize - def median(self): - maybe_median = [b for b in self.aggregates if b.is_median] - if maybe_median: - return maybe_median[0].value - # fallback - return self.runs[int(self.n_runs / 2)].value - - @property - @memoize - def stddev(self): - maybe_stddev = [b for b in self.aggregates if b.is_stddev] - if maybe_stddev: - return maybe_stddev[0].value - - sum_diff = sum([(val - self.mean)**2 for val in self.values]) - return (sum_diff / (self.n_runs - 1))**0.5 if self.n_runs > 1 else 0.0 - - @property - def min(self): - return self.values[0] - - @property - def max(self): - return self.values[-1] - - def quartile(self, q): - return self.values[int(q * self.n_runs / 4)] - - @property - def q1(self): - return self.quartile(1) - - @property - def q3(self): - return self.quartile(3) + super().__init__(name, [b.value for b in self.runs]) @property def parameters(self): @@ -256,14 +170,18 @@ def parse_param(idx, param): named_params = [parse_param(idx, p) for idx, p in params if p] return {k: v for k, v in named_params} - def __str__(self): - return f"BenchmarkSuite[name={self.name},runs={len(self.runs)}]" + @property + def less_is_better(self): + return self.runs[0].size is None + + def __repr__(self): + return f"GoogleBenchmark[name={self.name},runs={self.runs}]" @classmethod - def from_json(cls, version, payload): + def from_json(cls, payload): def group_key(x): return x.name - benchmarks = map(lambda x: BenchmarkObservation(version, **x), payload) + benchmarks = map(lambda x: BenchmarkObservation(**x), payload) groups = groupby(sorted(benchmarks, key=group_key), group_key) - return [cls(k, version, list(bs)) for k, bs in groups] + return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 5fb1f19086a..13786020f4d 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -21,14 +21,13 @@ import subprocess from .core import Benchmark, BenchmarkSuite -from .google import GoogleBenchmarkCommand +from .google import GoogleBenchmarkCommand, GoogleBenchmark, GoogleContext from ..utils.logger import logger def regex_filter(re_expr): if re_expr is None: return lambda s: True - re_comp = re.compile(re_expr) return lambda s: re_comp.search(s) @@ -42,39 +41,46 @@ class CppBenchmarkRunner(BenchmarkRunner): def __init__(self, build): """ Initialize a CppBenchmarkRunner. """ self.build = build - # Ensure build is ready to run benchmarks - self.build.all() @property def suites_binaries(self): """ Returns a list of benchmark binaries for this build. """ + # Ensure build is up-to-date to run benchmarks + self.build() + # Not the best method, but works for now glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark") return {os.path.basename(b): b for b in glob.glob(glob_expr)} - def benchmarks(self, suite_bin, benchmark_filter): + def suite(self, name, suite_bin, benchmark_filter): """ Returns the resulting benchmarks for a given suite. """ - suite_cmd = GoogleBenchmarkCommand(suite_bin) - benchmark_names = suite_cmd.list_benchmarks(benchmark_filter) + suite_cmd = GoogleBenchmarkCommand(suite_bin, benchmark_filter) + # Ensure there will be data + benchmark_names = suite_cmd.list_benchmarks() if not benchmark_names: - return [] + return None - return benchmark_names + results = suite_cmd.results() + benchmarks = GoogleBenchmark.from_json(results.get("benchmarks")) + return BenchmarkSuite(name, benchmarks) def suites(self, suite_filter=None, benchmark_filter=None): + """ Returns all suite for a runner. """ suite_matcher = regex_filter(suite_filter) suite_and_binaries = self.suites_binaries for suite_name in suite_and_binaries: - if suite_matcher(suite_name): - suite_bin = suite_and_binaries[suite_name] - benchmarks = self.benchmarks(suite_bin, - benchmark_filter=benchmark_filter) + if not suite_matcher(suite_name): + logger.debug(f"Ignoring suite {suite_name}") + continue - # Filter may exclude all benchmarks - if not benchmarks: - continue + suite_bin = suite_and_binaries[suite_name] + suite = self.suite(suite_name, suite_bin, + benchmark_filter=benchmark_filter) - yield BenchmarkSuite(suite_name, benchmarks) - else: - logger.debug(f"Ignoring suite {suite_name}") + # Filter may exclude all benchmarks + if not suite: + logger.debug(f"Suite {suite_name} executed but no results") + continue + + yield suite diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 426673fee7f..b27357a582d 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -23,9 +23,10 @@ import re from tempfile import mkdtemp, TemporaryDirectory -# from .benchmark import GoogleCompareBenchmark +from .benchmark.core import BenchmarkComparator from .benchmark.runner import CppBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration +from .utils.cmake import CMakeBuild from .utils.git import Git from .utils.logger import logger from .utils.source import ArrowSources @@ -33,8 +34,12 @@ @click.group() @click.option("--debug", count=True) -def archery(debug): +@click.pass_context +def archery(ctx, debug): """ Apache Arrow developer utilities. """ + # Ensure ctx.obj exists + ctx.ensure_object(dict) + ctx.obj["DEBUG"] = debug if debug: logger.setLevel(logging.DEBUG) @@ -58,7 +63,7 @@ def validate_arrow_sources(ctx, param, src): @archery.command(short_help="Initialize an Arrow C++ build") -@click.option("--src", default=ArrowSources.find(), +@click.option("--src", metavar="", default=ArrowSources.find(), callback=validate_arrow_sources, help="Specify Arrow source directory") # toolchain @@ -90,7 +95,8 @@ def validate_arrow_sources(ctx, param, src): @click.option("--targets", type=str, multiple=True, help="Generator targets to run") @click.argument("build_dir", type=build_dir_type) -def build(src, build_dir, force, targets, **kwargs): +@click.pass_context +def build(ctx, src, build_dir, force, targets, **kwargs): """ Build. """ # Arrow's cpp cmake configuration conf = CppConfiguration(**kwargs) @@ -117,61 +123,130 @@ def tmpdir(preserve, prefix="arrow-bench-"): build_type="release", with_tests=True, with_benchmarks=True) -def arrow_cpp_benchmark_runner(src, root, rev): - root_rev = os.path.join(root, rev) - os.mkdir(root_rev) +def cpp_runner_from_rev_or_path(src, root, rev_or_path): + build = None + if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path): + build = CMakeBuild.from_path(rev_or_path) + else: + root_rev = os.path.join(root, rev_or_path) + os.mkdir(root_rev) + + # Possibly checkout the sources at given revision + src_rev = src + if rev_or_path != Git.WORKSPACE: + root_src = os.path.join(root_rev, "arrow") + src_rev = src.at_revision(rev_or_path, root_src) - root_src = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision - src_rev = src if rev == Git.WORKSPACE else src.at_revision(rev, root_src) + # TODO: find a way to pass custom configuration without cluttering + # the cli. Ideally via a configuration file that can be shared with the + # `build` sub-command. + cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF) + build = cmake_def.build(os.path.join(root_rev, "build")) - # TODO: find a way to pass custom configuration without cluttering the cli - # Ideally via a configuration file that can be shared with the - # `build` sub-command. - cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF) - build = cmake_def.build(os.path.join(root_rev, "build")) return CppBenchmarkRunner(build) DEFAULT_BENCHMARK_FILTER = "^Regression" -@archery.command(short_help="Run the C++ benchmark suite") -@click.option("--src", default=ArrowSources.find(), +@archery.group() +@click.pass_context +def benchmark(ctx): + """ Arrow benchmarking. + + Use the diff sub-command to benchmake revisions, and/or build directories. + """ + pass + + +@benchmark.command(name="diff", short_help="Run the C++ benchmark suite") +@click.option("--src", metavar="", default=ArrowSources.find(), callback=validate_arrow_sources, help="Specify Arrow source directory") -@click.option("--preserve", type=bool, default=False, is_flag=True, - help="Preserve temporary workspace directory for investigation.") -@click.option("--suite-filter", type=str, default=None, +@click.option("--suite-filter", metavar="", type=str, default=None, help="Regex filtering benchmark suites.") -@click.option("--benchmark-filter", type=str, default=DEFAULT_BENCHMARK_FILTER, +@click.option("--benchmark-filter", metavar="", type=str, + default=DEFAULT_BENCHMARK_FILTER, help="Regex filtering benchmark suites.") -@click.argument("contender_rev", default=Git.WORKSPACE, required=False) -@click.argument("baseline_rev", default="master", required=False) -def benchmark(src, preserve, suite_filter, benchmark_filter, - contender_rev, baseline_rev): - """ Benchmark Arrow C++ implementation. +@click.option("--preserve", type=bool, default=False, is_flag=True, + help="Preserve temporary workspace directory for investigation.") +@click.argument("contender", metavar="[", default=Git.WORKSPACE, + required=False) +@click.argument("baseline", metavar="[]]", default="master", + required=False) +@click.pass_context +def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, + contender, baseline): + """ Compare (diff) benchmark runs. + + This command acts like git-diff but for benchmark results. + + The caller can optionally specify both the contender and the baseline. If + unspecified, the contender will default to the current workspace (like git) + and the baseline will default to master. + + Each target (contender or baseline) can either be a git object reference + (commit, tag, special values like HEAD) or a cmake build directory. This + allow comparing git commits, and/or different compilers and/or compiler + flags. + + When a commit is referenced, a local clone of the arrow sources (specified + via --src) is performed and the proper branch is created. This is done in + a temporary directory which can be left intact with the `---preserve` flag. + + The special token "WORKSPACE" is reserved to specify the current git + workspace. This imply that no clone will be performed. + + Examples: + + \b + # Compare workspace (contender) with master (baseline) + archery benchmark diff + + \b + # Compare master (contender) with latest version (baseline) + export LAST=$(git tag -l "apache-arrow-[0-9]*" | sort -rV | head -1) + archery benchmark diff master "$LAST" + + \b + # Compare g++7 (contender) with clang++-7 (baseline) builds + archery build --with-benchmarks=true \\ + --cc=gcc7 --cxx=g++7 gcc7-build + archery build --with-benchmarks=true \\ + --cc=clang-7 --cxx=clang++-7 clang7-build + archery benchmark diff gcc7-build clang7-build + + \b + # Compare default but only scoped to the suites matching `aggregate` and + # the benchmarks matching `Kernel`. + archery benchmark diff --suite-filter=aggregate --benchmark-filter=Kernel """ with tmpdir(preserve) as root: - runner_cont = arrow_cpp_benchmark_runner(src, root, contender_rev) - runner_base = arrow_cpp_benchmark_runner(src, root, baseline_rev) + runner_cont = cpp_runner_from_rev_or_path(src, root, contender) + runner_base = cpp_runner_from_rev_or_path(src, root, baseline) suites_cont = {s.name: s for s in runner_cont.suites(suite_filter, benchmark_filter)} suites_base = {s.name: s for s in runner_base.suites(suite_filter, benchmark_filter)} - for name in suites_cont.keys() & suites_base.keys(): - logger.debug(f"Comparing {name}") - contender_bin = suites_cont[name] - baseline_bin = suites_base[name] + for suite_name in suites_cont.keys() & suites_base.keys(): + logger.debug(f"Comparing {suite_name}") + + suite_cont = { + b.name: b for b in suites_cont[suite_name].benchmarks} + suite_base = { + b.name: b for b in suites_base[suite_name].benchmarks} + + for bench_name in suite_cont.keys() & suite_base.keys(): + logger.debug(f"Comparing {bench_name}") + + bench_cont = suite_cont[bench_name] + bench_base = suite_base[bench_name] - # Note that compare.py requires contender and baseline inverted. - # GoogleCompareBenchmark("benchmarks", "--display_aggregates_only", - # baseline_bin, contender_bin, - # f"--benchmark_filter={benchmark_filter}", - # "--benchmark_repetitions=20") + comp = BenchmarkComparator(bench_cont, bench_base) + print(comp.compare()) if __name__ == "__main__": - archery() + archery(obj={}) diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py index b2e7c75b511..a549f07b8b4 100644 --- a/dev/archery/archery/utils/cmake.py +++ b/dev/archery/archery/utils/cmake.py @@ -16,6 +16,7 @@ # under the License. import os +import re from shutil import rmtree, which from .command import Command @@ -113,6 +114,9 @@ def __repr__(self): return f"CMakeDefinition[source={self.source}]" +CMAKE_BUILD_TYPE_RE = re.compile("CMAKE_BUILD_TYPE:STRING=([a-zA-Z]+)") + + class CMakeBuild(Command): """ CMakeBuild represents a build directory initialized by cmake. @@ -132,6 +136,7 @@ def __init__(self, build_dir, generator, build_type, definition=None): build_dir : str The build directory to setup into. """ + assert CMakeBuild.is_build_dir(build_dir) self.build_dir = os.path.abspath(build_dir) self.bin = generator self.build_type = build_type @@ -167,6 +172,27 @@ def is_build_dir(path): cmake_files = os.path.join(path, "CMakeFiles") return os.path.exists(cmake_cache) and os.path.exists(cmake_files) + @staticmethod + def from_path(path): + if not CMakeBuild.is_build_dir(path): + raise ValueError(f"Not a valid CMakeBuild path: {path}") + + generator = "make" + if os.path.exists(os.path.join(path, "build.ninja")): + generator = "ninja" + + build_type = None + # Infer build_type by looking at CMakeCache.txt and looking for a magic + # definition + cmake_cache_path = os.path.join(path, "CMakeCache.txt") + with open(cmake_cache_path, "r") as cmake_cache: + candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read()) + if len(candidates) != 1: + ValueError("Could not chose build_type from {candidates}") + build_type = candidates[0].lower() + + return CMakeBuild(path, generator, build_type) + def __repr__(self): return ("CMakeBuild[" "build = {}," diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index 77b2230bd6c..a9f2bb7a393 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -29,6 +29,20 @@ def find_exec(executable): return shutil.which(executable) +# Decorator running a command and returning stdout +class capture_stdout: + def __init__(self, strip=False): + self.strip = strip + + def __call__(self, f): + def strip_it(x): + return x.strip() if self.strip else x + + def wrapper(*argv, **kwargs): + return strip_it(fn(*argv, **kwargs, stdout=subprocess.PIPE).stdout) + return wrapper + + class Command: def bin(self): raise NotImplementedError("Command must implement bin() method") diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py index 18c52bb07cb..2d62eeb31bd 100644 --- a/dev/archery/archery/utils/git.py +++ b/dev/archery/archery/utils/git.py @@ -17,37 +17,60 @@ import os -from .command import Command +from .command import Command, capture_stdout + + +# Decorator prepending argv with the git sub-command found with the method +# name. +def git_cmd(fn): + # function name is the subcommand + sub_cmd = fn.__name__.replace("_", "-") + + def wrapper(self, *argv, **kwargs): + return fn(self, sub_cmd, *argv, **kwargs) + return wrapper class Git(Command): WORKSPACE = "WORKSPACE" + HEAD = "HEAD" - def __init__(self, *argv, git_bin=None, git_dir=None, **kwargs): + def __init__(self, git_bin=None): self.bin = git_bin if git_bin else os.environ.get("GIT", "git") - self.git_dir = git_dir - self.run(*argv, **kwargs) - @property - def global_opts(self): - if self.git_dir: - # For some reason, git does not accept `-Cdir` or `-C=dir` - yield "-C" - yield self.git_dir + def run_cmd(self, cmd, *argv, git_dir=None, **kwargs): + """ Inject flags before sub-command in argv. """ + opts = [] + if git_dir and isinstance(git_dir, str): + opts.extend(("-C", git_dir)) + + return self.run(*opts, cmd, *argv, **kwargs) + + @git_cmd + def clone(self, *argv, **kwargs): + return self.run_cmd(*argv, **kwargs) + + @git_cmd + def checkout(self, *argv, **kwargs): + return self.run_cmd(*argv, **kwargs) + @git_cmd + def log(self, *argv, **kwargs): + return self.run_cmd(*argv, **kwargs) -class GitSubCommand(Git): - def run(self, *argv, **kwargs): - super().run(*self.global_opts, self.cmd, *argv, **kwargs) + @git_cmd + def rev_parse(self, *argv, **kwargs): + print(self.head()) + return self.run_cmd(*argv, **kwargs) + @capture_stdout(strip=True) + def head(self, **kwargs): + """ Return commit pointed by HEAD. """ + return self.rev_parse(Git.HEAD, **kwargs) -class GitClone(GitSubCommand): - @property - def cmd(self): - return "clone" + @capture_stdout(strip=True) + def current_branch(self, **kwargs): + return self.rev_parse("--abbrev-ref", Git.HEAD, **kwargs) -class GitCheckout(GitSubCommand): - @property - def cmd(self): - return "checkout" +git = Git() diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index 24e22c552cd..8f75b8f2675 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -17,7 +17,7 @@ import os -from .git import Git, GitCheckout, GitClone +from .git import git class ArrowSources: @@ -73,8 +73,8 @@ def at_revision(self, revision, clone_dir): # A local clone is required to leave the current sources intact such # that build depending on said sources are not invalidated (or worse # slightly affected when re-invoking the generator). - GitClone("--local", self.path, clone_dir) - GitCheckout("-b", revision, git_dir=clone_dir) + git.clone("--local", self.path, clone_dir) + git.checkout("-b", revision, git_dir=clone_dir) return ArrowSources(clone_dir) From 703cf987abe44e40b6e6179b9c17d395fe2c9529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 9 Apr 2019 09:12:48 -0400 Subject: [PATCH 05/31] commit --- dev/archery/archery/benchmark/core.py | 50 ++++++++++++++++------ dev/archery/archery/benchmark/google.py | 10 ++++- dev/archery/archery/cli.py | 57 ++++++++++++++++++------- dev/archery/archery/utils/command.py | 12 +++++- dev/archery/archery/utils/logger.py | 4 ++ dev/archery/archery/utils/source.py | 2 +- 6 files changed, 103 insertions(+), 32 deletions(-) diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py index 751e9773def..3cb4d9568ba 100644 --- a/dev/archery/archery/benchmark/core.py +++ b/dev/archery/archery/benchmark/core.py @@ -28,7 +28,21 @@ def __init__(self, min, max, mean, stddev, q1, median, q3): @staticmethod def from_values(values): - return None + sorted(values) + n = len(values) + mean = sum(values) / len(values) + sum_diff = sum([(val - mean)**2 for val in values]) + stddev = (sum_diff / (n - 1))**0.5 if n > 1 else 0.0 + + return Statistics( + values[0], + values[-1], + mean, + stddev, + values[int(n/4)], + values[int(n/2)], + values[int((3*n)/4)], + ) class Benchmark: @@ -36,19 +50,18 @@ def __init__(self, name, values, stats=None): self.name = name self.values = values self.statistics = stats if stats else Statistics.from_values(values) - self.less_is_better = less_is_better @property def value(self): return self.statistics.median + @property + def unit(self): + return None -def changes(old, new): - if old == 0 and new == 0: - return 0.0 - if old == 0: - return float(new - old) / (float(old + new) / 2) - return float(new - old) / abs(old) + @property + def less_is_better(self): + return True class BenchmarkSuite: @@ -62,10 +75,10 @@ def __repr__(self): return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]" -def default_comparator(old, new, threshold=0.05): +def regress(change, threshold=0.05): # negative change is better, positive is tolerable until it exceeds # threshold - return changes(old, new) < threshold + return change > threshold def changes(old, new): @@ -82,8 +95,21 @@ def __init__(self, contender, baseline): self.baseline = baseline def compare(self, comparator=None): - comparator = comparator if comparator else default_comparator - return changes(self.baseline.value, self.contender.value) + base = self.baseline.value + cont = self.contender.value + change = changes(base, cont) + adjusted_change = change + if not self.baseline.less_is_better: + adjusted_change = change * -1.0 + return { + "benchmark": self.baseline.name, + "change": change, + "regression": regress(adjusted_change), + "baseline": base, + "contender": cont, + "unit": self.baseline.unit, + "less_is_better": self.baseline.less_is_better, + } def confidence(self): """ Indicate if a comparison of benchmarks should be trusted. """ diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index a330a523f1e..ce2d060ccb2 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -39,7 +39,8 @@ def list_benchmarks(self): argv = ["--benchmark_list_tests"] if self.benchmark_filter: argv.append(f"--benchmark_filter={self.benchmark_filter}") - result = self.run(*argv, stdout=subprocess.PIPE) + result = self.run(*argv, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) return [b for b in result.stdout] def results(self): @@ -48,7 +49,8 @@ def results(self): if self.benchmark_filter: argv.append(f"--benchmark_filter={self.benchmark_filter}") - return json.loads(self.run(*argv, stdout=subprocess.PIPE).stdout) + return json.loads(self.run(*argv, stdout=subprocess.PIPE, + stderr=subprocess.PIPE).stdout) class GoogleContext: @@ -170,6 +172,10 @@ def parse_param(idx, param): named_params = [parse_param(idx, p) for idx, p in params if p] return {k: v for k, v in named_params} + @property + def unit(self): + return self.runs[0].unit + @property def less_is_better(self): return self.runs[0].size is None diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index b27357a582d..07936e32179 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -23,23 +23,34 @@ import re from tempfile import mkdtemp, TemporaryDirectory +from .utils.logger import logger +import archery.utils.logger as log + from .benchmark.core import BenchmarkComparator from .benchmark.runner import CppBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration from .utils.cmake import CMakeBuild from .utils.git import Git -from .utils.logger import logger from .utils.source import ArrowSources @click.group() -@click.option("--debug", count=True) +@click.option("--debug", type=bool, is_flag=True, default=False, + help="Increase logging with debugging output.") +@click.option("--quiet", type=bool, is_flag=True, default=False, + help="Silence executed commands.") @click.pass_context -def archery(ctx, debug): +def archery(ctx, debug, quiet): """ Apache Arrow developer utilities. """ # Ensure ctx.obj exists ctx.ensure_object(dict) - ctx.obj["DEBUG"] = debug + + print("Init") + print(id(log.quiet)) + print(log.quiet) + log.quiet = quiet + print(log.quiet) + if debug: logger.setLevel(logging.DEBUG) @@ -160,16 +171,17 @@ def benchmark(ctx): @benchmark.command(name="diff", short_help="Run the C++ benchmark suite") -@click.option("--src", metavar="", default=ArrowSources.find(), +@click.option("--src", metavar="", show_default=True, + default=ArrowSources.find(), callback=validate_arrow_sources, help="Specify Arrow source directory") -@click.option("--suite-filter", metavar="", type=str, default=None, +@click.option("--suite-filter", metavar="", show_default=True, + type=str, default=None, help="Regex filtering benchmark suites.") +@click.option("--benchmark-filter", metavar="", show_default=True, + type=str, default=DEFAULT_BENCHMARK_FILTER, help="Regex filtering benchmark suites.") -@click.option("--benchmark-filter", metavar="", type=str, - default=DEFAULT_BENCHMARK_FILTER, - help="Regex filtering benchmark suites.") -@click.option("--preserve", type=bool, default=False, is_flag=True, - help="Preserve temporary workspace directory for investigation.") +@click.option("--preserve", type=bool, default=False, show_default=True, + is_flag=True, help="Preserve workspace for investigation.") @click.argument("contender", metavar="[", default=Git.WORKSPACE, required=False) @click.argument("baseline", metavar="[]]", default="master", @@ -201,25 +213,33 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, \b # Compare workspace (contender) with master (baseline) + \b archery benchmark diff \b # Compare master (contender) with latest version (baseline) + \b export LAST=$(git tag -l "apache-arrow-[0-9]*" | sort -rV | head -1) + \b archery benchmark diff master "$LAST" \b # Compare g++7 (contender) with clang++-7 (baseline) builds + \b archery build --with-benchmarks=true \\ --cc=gcc7 --cxx=g++7 gcc7-build + \b archery build --with-benchmarks=true \\ --cc=clang-7 --cxx=clang++-7 clang7-build + \b archery benchmark diff gcc7-build clang7-build \b - # Compare default but only scoped to the suites matching `aggregate` and - # the benchmarks matching `Kernel`. - archery benchmark diff --suite-filter=aggregate --benchmark-filter=Kernel + # Compare default targets but scoped to the suites matching + # `^arrow-compute-aggregate` and benchmarks matching `(Sum|Mean)Kernel`. + \b + archery benchmark diff --suite-filter="^arrow-compute-aggregate" \\ + --benchmark-filter="(Sum|Mean)Kernel" """ with tmpdir(preserve) as root: runner_cont = cpp_runner_from_rev_or_path(src, root, contender) @@ -244,8 +264,13 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, bench_cont = suite_cont[bench_name] bench_base = suite_base[bench_name] - comp = BenchmarkComparator(bench_cont, bench_base) - print(comp.compare()) + comparator = BenchmarkComparator(bench_cont, bench_base) + comparison = comparator() + + comparison["suite"] = suite_name + comparison["benchmark"] = bench_name + + print(comparison) if __name__ == "__main__": diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index a9f2bb7a393..6a8955b5adc 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -19,7 +19,7 @@ import shutil import subprocess -from .logger import logger +from .logger import logger, quiet def find_exec(executable): @@ -51,6 +51,16 @@ def run(self, *argv, raise_on_failure=True, **kwargs): invocation = [find_exec(self.bin)] invocation.extend(argv) + print("Within command module") + print(id(quiet)) + print(quiet) + + if "stdout" not in kwargs and quiet: + kwargs["stdout"] = subprocess.PIPE + + if "stderr" not in kwargs and quiet: + kwargs["stderr"] = subprocess.PIPE + logger.debug(f"Executing `{invocation}`") result = subprocess.run(invocation, **kwargs) if raise_on_failure: diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py index ab97204decb..951221ddfc8 100644 --- a/dev/archery/archery/utils/logger.py +++ b/dev/archery/archery/utils/logger.py @@ -17,5 +17,9 @@ import logging + +quiet = False + +""" Global logger. """ logging.basicConfig(level=logging.INFO) logger = logging.getLogger("archery") diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index 8f75b8f2675..6ff05fb5498 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -125,4 +125,4 @@ def find(path=None): return None def __repr__(self): - return f"ArrowSources[{self.path}]" + return f"{self.path}" From c85661cf3a6a773377564f2ab62682fcd53a6693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 10:42:54 -0400 Subject: [PATCH 06/31] Add documentation --- .../compute/kernels/aggregate-benchmark.cc | 4 +- dev/archery/archery/cli.py | 60 ++++++--- dev/archery/archery/utils/command.py | 35 +++-- dev/archery/archery/utils/logger.py | 11 +- dev/archery/setup.py | 40 ++++++ docs/source/developers/benchmarks.rst | 123 ++++++++++++++++++ 6 files changed, 230 insertions(+), 43 deletions(-) create mode 100644 dev/archery/setup.py create mode 100644 docs/source/developers/benchmarks.rst diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc index e81f87947b0..bea2656d73e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc @@ -309,7 +309,7 @@ BENCHMARK_TEMPLATE(BenchSum, SumBitmapNaive)->Apply(BenchmarkSetArgs); BENCHMARK_TEMPLATE(BenchSum, SumBitmapReader)->Apply(BenchmarkSetArgs); BENCHMARK_TEMPLATE(BenchSum, SumBitmapVectorizeUnroll)->Apply(BenchmarkSetArgs); -static void BenchSumKernel(benchmark::State& state) { +static void RegressionSumKernel(benchmark::State& state) { const int64_t array_size = state.range(0) / sizeof(int64_t); const double null_percent = static_cast(state.range(1)) / 100.0; auto rand = random::RandomArrayGenerator(1923); @@ -328,7 +328,7 @@ static void BenchSumKernel(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); } -BENCHMARK(BenchSumKernel)->Apply(BenchmarkSetArgs); +BENCHMARK(RegressionSumKernel)->Apply(SetArgs); } // namespace compute } // namespace arrow diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 07936e32179..9e271843de9 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -18,39 +18,38 @@ import click from contextlib import contextmanager +import json import logging import os import re from tempfile import mkdtemp, TemporaryDirectory -from .utils.logger import logger -import archery.utils.logger as log from .benchmark.core import BenchmarkComparator from .benchmark.runner import CppBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration from .utils.cmake import CMakeBuild from .utils.git import Git +from .utils.logger import logger, ctx as log_ctx from .utils.source import ArrowSources @click.group() @click.option("--debug", type=bool, is_flag=True, default=False, help="Increase logging with debugging output.") -@click.option("--quiet", type=bool, is_flag=True, default=False, +@click.option("-q", "--quiet", type=bool, is_flag=True, default=False, help="Silence executed commands.") @click.pass_context def archery(ctx, debug, quiet): - """ Apache Arrow developer utilities. """ + """ Apache Arrow developer utilities. + + See sub-commands help with `archery --help`. + + """ # Ensure ctx.obj exists ctx.ensure_object(dict) - print("Init") - print(id(log.quiet)) - print(log.quiet) - log.quiet = quiet - print(log.quiet) - + log_ctx.quiet = quiet if debug: logger.setLevel(logging.DEBUG) @@ -78,8 +77,8 @@ def validate_arrow_sources(ctx, param, src): callback=validate_arrow_sources, help="Specify Arrow source directory") # toolchain -@click.option("--cc", help="C compiler.") -@click.option("--cxx", help="C++ compiler.") +@click.option("--cc", metavar="", help="C compiler.") +@click.option("--cxx", metavar="", help="C++ compiler.") @click.option("--cxx_flags", help="C++ compiler flags.") @click.option("--build-type", default="release", type=build_type, help="CMake's CMAKE_BUILD_TYPE") @@ -108,7 +107,27 @@ def validate_arrow_sources(ctx, param, src): @click.argument("build_dir", type=build_dir_type) @click.pass_context def build(ctx, src, build_dir, force, targets, **kwargs): - """ Build. """ + """ Initialize a C++ build directory. + + The build command creates a directory initialized with Arrow's cpp source + cmake configuration. It can also optionally invoke the generator to test + the build (and used in scripts). + + Note that archery will carry the caller environment. It will also not touch + an existing directory, one must use the `--force` option to remove the + existing directory. + + Examples: + + \b + # Initialize build with clang7 and avx2 support in directory `clang7-build` + \b + archery build --cc=clang-7 --cxx=clang++-7 --cxx_flags=-mavx2 clang7-build + + \b + # Builds and run test + archery build --targets=all --targets=test build + """ # Arrow's cpp cmake configuration conf = CppConfiguration(**kwargs) # This is a closure around cmake invocation, e.g. calling `def.build()` @@ -187,7 +206,7 @@ def benchmark(ctx): @click.argument("baseline", metavar="[]]", default="master", required=False) @click.pass_context -def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, +def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, contender, baseline): """ Compare (diff) benchmark runs. @@ -227,9 +246,11 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, # Compare g++7 (contender) with clang++-7 (baseline) builds \b archery build --with-benchmarks=true \\ - --cc=gcc7 --cxx=g++7 gcc7-build + --cxx_flags=-ftree-vectorize \\ + --cc=gcc-7 --cxx=g++-7 gcc7-build \b archery build --with-benchmarks=true \\ + --cxx_flags=-flax-vector-conversions \\ --cc=clang-7 --cxx=clang++-7 clang7-build \b archery benchmark diff gcc7-build clang7-build @@ -242,6 +263,9 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, --benchmark-filter="(Sum|Mean)Kernel" """ with tmpdir(preserve) as root: + logger.debug(f"Comparing {contender} (contender) with " + f"{baseline} (baseline)") + runner_cont = cpp_runner_from_rev_or_path(src, root, contender) runner_base = cpp_runner_from_rev_or_path(src, root, baseline) @@ -250,7 +274,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, suites_base = {s.name: s for s in runner_base.suites(suite_filter, benchmark_filter)} - for suite_name in suites_cont.keys() & suites_base.keys(): + for suite_name in sorted(suites_cont.keys() & suites_base.keys()): logger.debug(f"Comparing {suite_name}") suite_cont = { @@ -258,7 +282,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, suite_base = { b.name: b for b in suites_base[suite_name].benchmarks} - for bench_name in suite_cont.keys() & suite_base.keys(): + for bench_name in sorted(suite_cont.keys() & suite_base.keys()): logger.debug(f"Comparing {bench_name}") bench_cont = suite_cont[bench_name] @@ -270,7 +294,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, comparison["suite"] = suite_name comparison["benchmark"] = bench_name - print(comparison) + print(json.dumps(comparison)) if __name__ == "__main__": diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index 6a8955b5adc..4e05487221e 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -19,14 +19,12 @@ import shutil import subprocess -from .logger import logger, quiet +from .logger import logger, ctx def find_exec(executable): - if os.path.exists(executable): - return executable - - return shutil.which(executable) + exec_exists = os.path.exists(executable) + return executable if exec_exists else shutil.which(executable) # Decorator running a command and returning stdout @@ -39,7 +37,9 @@ def strip_it(x): return x.strip() if self.strip else x def wrapper(*argv, **kwargs): - return strip_it(fn(*argv, **kwargs, stdout=subprocess.PIPE).stdout) + # Ensure stdout is captured + kwargs["stdout"] = subprocess.PIPE + return strip_it(fn(*argv, **kwargs).stdout) return wrapper @@ -47,26 +47,21 @@ class Command: def bin(self): raise NotImplementedError("Command must implement bin() method") - def run(self, *argv, raise_on_failure=True, **kwargs): + def run(self, *argv, **kwargs): invocation = [find_exec(self.bin)] invocation.extend(argv) - print("Within command module") - print(id(quiet)) - print(quiet) + for key in ["stdout", "stderr"]: + # Preserve caller intention, otherwise silence + if key not in kwargs and ctx.quiet: + kwargs["key"] = subprocess.PIPE - if "stdout" not in kwargs and quiet: - kwargs["stdout"] = subprocess.PIPE - - if "stderr" not in kwargs and quiet: - kwargs["stderr"] = subprocess.PIPE + # Prefer safe by default + if "check" not in kwargs: + kwargs["check"] = True logger.debug(f"Executing `{invocation}`") - result = subprocess.run(invocation, **kwargs) - if raise_on_failure: - result.check_returncode() - - return result + return subprocess.run(invocation, **kwargs) def __call__(self, *argv, **kwargs): self.run(*argv, **kwargs) diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py index 951221ddfc8..0a4a99edb2f 100644 --- a/dev/archery/archery/utils/logger.py +++ b/dev/archery/archery/utils/logger.py @@ -17,9 +17,14 @@ import logging - -quiet = False - """ Global logger. """ logging.basicConfig(level=logging.INFO) logger = logging.getLogger("archery") + + +class LoggingContext: + def __init__(self, quiet=False): + self.quiet = quiet + + +ctx = LoggingContext() diff --git a/dev/archery/setup.py b/dev/archery/setup.py new file mode 100644 index 00000000000..54f1c14966e --- /dev/null +++ b/dev/archery/setup.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +from setuptools import setup + + +if sys.version_info < (3, 5): + sys.exit('Python < 3.5 is not supported due to subprocess.run') + + +setup( + name='archery', + version="0.1.0", + description='Apache Arrow Developers Tools', + url='http://github.com/apache/arrow', + maintainer='Arrow Developers', + maintainer_email='dev@arrow.apache.org', + packages=['archery'], + install_requires=['click'], + entry_points=''' + [console_scripts] + archery=archery.cli:archery + ''', +) diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst new file mode 100644 index 00000000000..64428a15984 --- /dev/null +++ b/docs/source/developers/benchmarks.rst @@ -0,0 +1,123 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _benchmarks: + +********** +Benchmarks +********** + +Archery +======= + +``archery`` is a python library and command line utility made to interact with +Arrow's sources. The main feature is the benchmarking process. + +Installation +~~~~~~~~~~~~ + +The simplest way to install archery is with pip from the top-level directory. +It is recommended to use the ``-e,--editable`` flag such that pip don't copy +the module files but use the actual sources. + +.. code-block:: shell + pip install -e dev/archery + archery --help + +Comparison +========== + +One desire with benchmarks is to detect performance regressions. Thus, +``archery`` implements a benchmark comparison facility via the ``benchmark +diff`` command. + +In the default invocation, it will compare the current source (known as the +current workspace in git) with local master branch. + +For more information, invoke the ``archery benchmark diff --help`` command for +multiple examples of invocation. + +Iterating efficiently +~~~~~~~~~~~~~~~~~~~~~ + +Iterating with benchmarks development can be a tedious process due to long +build time and long run times. ``archery benchmark diff`` provides 2 methods +to reduce this overhead. + +First, the benchmark command supports comparing existing +build directories, This can be paired with the ``--preserve`` flag to +avoid rebuilding sources from zero. + +.. code-block:: shell + + # First invocation clone and checkouts in a temporary directory. The + # directory is preserved with --preserve + archery benchmark diff --preserve + + # Modify C++ sources + + # Re-run benchmark in the previously created build directory. + archery benchmark diff /tmp/arrow-bench*/{WORKSPACE,master}/build + +Second, the benchmark command supports filtering suites (``--suite-filter``) +and benchmarks (``--benchmark-filter``), both options supports regular +expressions. + +.. code-block:: shell + + # Taking over a previous run, but only filtering for benchmarks matching + # `Kernel` and suite matching `compute-aggregate`. + archery benchmark diff \ + --suite-filter=compute-aggregate --benchmark-filter=Kernel \ + /tmp/arrow-bench*/{WORKSPACE,master}/build + +Both method can be combined. + +Regression detection +==================== + +Writing a benchmark +~~~~~~~~~~~~~~~~~~~ + +1. The benchmark command will filter (by default) benchmarks with the regular + expression ``^Regression``. This way, not all benchmarks are run by default. + Thus, if you want your benchmark to be verified for regression + automatically, the name must match. + +2. The benchmark command will run with the ``--benchmark_repetitions=K`` + options for statistical stability. Thus, a benchmark should not override the + repetitions in the (C++) benchmark's arguments definition. + +3. Due to #2, a benchmark should run sufficiently fast. Often, when the input + does not fit in memory (L2/L3), the benchmark will be memory bound instead + of CPU bound. In this case, the input can be downsized. + +Scripting +========= + +``archery`` is written as a python library with a command line frontend. The +library can be imported to automate some tasks. + +Some invocation of the command line interface can be quite verbose due to build +output. This can be controlled/avoided with the ``--quiet`` option, e.g. + +.. code-block:: shell + + archery --quiet benchmark diff --benchmark-filter=Kernel + {"benchmark": "BenchSumKernel/32768/0", "change": -0.6498, "regression": true, ... + {"benchmark": "BenchSumKernel/32768/1", "change": 0.01553, "regression": false, ... + ... From 2a81744cfb4aed1c437be01ab382d7377d200976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 10:44:09 -0400 Subject: [PATCH 07/31] Ooops. --- dev/archery/archery/utils/command.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index 4e05487221e..15a4a3411ab 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -54,7 +54,7 @@ def run(self, *argv, **kwargs): for key in ["stdout", "stderr"]: # Preserve caller intention, otherwise silence if key not in kwargs and ctx.quiet: - kwargs["key"] = subprocess.PIPE + kwargs[key] = subprocess.PIPE # Prefer safe by default if "check" not in kwargs: From 21b2e14fcf9ffdaffadb286bf0ee22a44150a26c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 10:55:23 -0400 Subject: [PATCH 08/31] Add doc and fix bugs --- cpp/src/arrow/compute/kernels/aggregate-benchmark.cc | 2 +- dev/archery/archery/benchmark/core.py | 10 +++++++--- dev/archery/archery/cli.py | 7 +++++-- docs/source/developers/benchmarks.rst | 4 ++-- docs/source/developers/index.rst | 1 + 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc index bea2656d73e..6d35dd8f477 100644 --- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc @@ -328,7 +328,7 @@ static void RegressionSumKernel(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); } -BENCHMARK(RegressionSumKernel)->Apply(SetArgs); +BENCHMARK(RegressionSumKernel)->Apply(BenchmarkSetArgs); } // namespace compute } // namespace arrow diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py index 3cb4d9568ba..133945b49d0 100644 --- a/dev/archery/archery/benchmark/core.py +++ b/dev/archery/archery/benchmark/core.py @@ -75,7 +75,7 @@ def __repr__(self): return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]" -def regress(change, threshold=0.05): +def regress(change, threshold): # negative change is better, positive is tolerable until it exceeds # threshold return change > threshold @@ -89,10 +89,14 @@ def changes(old, new): return float(new - old) / abs(old) +DEFAULT_THRESHOLD = 0.05 + + class BenchmarkComparator: - def __init__(self, contender, baseline): + def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD): self.contender = contender self.baseline = baseline + self.threshold = threshold def compare(self, comparator=None): base = self.baseline.value @@ -104,7 +108,7 @@ def compare(self, comparator=None): return { "benchmark": self.baseline.name, "change": change, - "regression": regress(adjusted_change), + "regression": regress(adjusted_change, self.threshold), "baseline": base, "contender": cont, "unit": self.baseline.unit, diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 9e271843de9..ad02e30b058 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -201,13 +201,15 @@ def benchmark(ctx): help="Regex filtering benchmark suites.") @click.option("--preserve", type=bool, default=False, show_default=True, is_flag=True, help="Preserve workspace for investigation.") +@click.option("--threshold", type=float, default=0.05, show_default=True, + help="Regression failure threshold in percentage.") @click.argument("contender", metavar="[", default=Git.WORKSPACE, required=False) @click.argument("baseline", metavar="[]]", default="master", required=False) @click.pass_context def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, - contender, baseline): + threshold, contender, baseline): """ Compare (diff) benchmark runs. This command acts like git-diff but for benchmark results. @@ -288,7 +290,8 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, bench_cont = suite_cont[bench_name] bench_base = suite_base[bench_name] - comparator = BenchmarkComparator(bench_cont, bench_base) + comparator = BenchmarkComparator(bench_cont, bench_base, + threshold=threshold) comparison = comparator() comparison["suite"] = suite_name diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst index 64428a15984..bb9e32eb79a 100644 --- a/docs/source/developers/benchmarks.rst +++ b/docs/source/developers/benchmarks.rst @@ -99,8 +99,8 @@ Writing a benchmark automatically, the name must match. 2. The benchmark command will run with the ``--benchmark_repetitions=K`` - options for statistical stability. Thus, a benchmark should not override the - repetitions in the (C++) benchmark's arguments definition. + options for statistical significance. Thus, a benchmark should not override + the repetitions in the (C++) benchmark's arguments definition. 3. Due to #2, a benchmark should run sufficiently fast. Often, when the input does not fit in memory (L2/L3), the benchmark will be memory bound instead diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst index a58f96997a6..d30963893cf 100644 --- a/docs/source/developers/index.rst +++ b/docs/source/developers/index.rst @@ -22,4 +22,5 @@ cpp python integration + benchmarks documentation From d6733b6f403a0afcf5ef751a8dfee6f97fc300a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 11:53:04 -0400 Subject: [PATCH 09/31] Formatting --- docs/source/developers/benchmarks.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst index bb9e32eb79a..0d69e9d645b 100644 --- a/docs/source/developers/benchmarks.rst +++ b/docs/source/developers/benchmarks.rst @@ -35,6 +35,7 @@ It is recommended to use the ``-e,--editable`` flag such that pip don't copy the module files but use the actual sources. .. code-block:: shell + pip install -e dev/archery archery --help From bc111b2d38134fd7d26ddb23a4a399a5a56972c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 13:38:33 -0400 Subject: [PATCH 10/31] Removes copied stuff --- dev/archery/archery/benchmark/google.py | 69 ------------------------- dev/archery/archery/benchmark/runner.py | 2 +- 2 files changed, 1 insertion(+), 70 deletions(-) diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index ce2d060ccb2..9515d3fc6e3 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -53,62 +53,6 @@ def results(self): stderr=subprocess.PIPE).stdout) -class GoogleContext: - """ Represents the runtime environment """ - - def __init__(self, date=None, executable=None, **kwargs): - self.date = date - self.executable = executable - - @property - def host(self): - host = { - "hostname": gethostname(), - # Not sure if we should leak this. - "mac_address": getnode(), - } - return host - - @property - def git(self): - head = git.head() - # %ai: author date, ISO 8601-like format - timestamp = git.log("-1", "--pretty='%ai'", head) - branch = git.current_branch(), - git_info = { - "git_commit_timestamp": timestamp, - "git_hash": head, - "git_branch": branch, - } - return git_info - - @property - def toolchain(self): - # TODO parse local CMake generated info to extract compile flags and - # arrow features - deps = {} - toolchain = { - "language_implementation_version": "c++11", - "dependencies": deps, - } - return toolchain - - def as_arrow(self): - ctx = { - "benchmark_language": "C++", - "run_timestamp": self.date, - } - - for extra in (self.host, self.git, self.toolchain): - ctx.update(extra) - - return ctx - - @classmethod - def from_json(cls, payload): - return cls(**payload) - - class BenchmarkObservation: """ Represents one run of a single benchmark. """ @@ -159,19 +103,6 @@ def __init__(self, name, runs): self.runs = sorted(runs, key=lambda b: b.value) super().__init__(name, [b.value for b in self.runs]) - @property - def parameters(self): - """ Extract parameters from Benchmark's name""" - def parse_param(idx, param): - k_v = param.split(":") - # nameless parameter are transformed into positional names - name = k_v[0] if len(k_v) > 1 else f"arg{idx}" - return name, k_v[-1] - - params = enumerate(self.name.split("/")[1:]) - named_params = [parse_param(idx, p) for idx, p in params if p] - return {k: v for k, v in named_params} - @property def unit(self): return self.runs[0].unit diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 13786020f4d..f9ef63e5251 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -21,7 +21,7 @@ import subprocess from .core import Benchmark, BenchmarkSuite -from .google import GoogleBenchmarkCommand, GoogleBenchmark, GoogleContext +from .google import GoogleBenchmarkCommand, GoogleBenchmark from ..utils.logger import logger From 1b028390c6ed4fb98cbb97a064fafd1d95d132ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 16:28:41 -0400 Subject: [PATCH 11/31] Rename --cxx_flags to --cxx-flags --- dev/archery/archery/cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index ad02e30b058..987c9b27b25 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -79,7 +79,7 @@ def validate_arrow_sources(ctx, param, src): # toolchain @click.option("--cc", metavar="", help="C compiler.") @click.option("--cxx", metavar="", help="C++ compiler.") -@click.option("--cxx_flags", help="C++ compiler flags.") +@click.option("--cxx-flags", help="C++ compiler flags.") @click.option("--build-type", default="release", type=build_type, help="CMake's CMAKE_BUILD_TYPE") @click.option("--warn-level", default="production", type=warn_level_type, @@ -122,7 +122,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs): \b # Initialize build with clang7 and avx2 support in directory `clang7-build` \b - archery build --cc=clang-7 --cxx=clang++-7 --cxx_flags=-mavx2 clang7-build + archery build --cc=clang-7 --cxx=clang++-7 --cxx-flags=-mavx2 clang7-build \b # Builds and run test @@ -248,11 +248,11 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, # Compare g++7 (contender) with clang++-7 (baseline) builds \b archery build --with-benchmarks=true \\ - --cxx_flags=-ftree-vectorize \\ + --cxx-flags=-ftree-vectorize \\ --cc=gcc-7 --cxx=g++-7 gcc7-build \b archery build --with-benchmarks=true \\ - --cxx_flags=-flax-vector-conversions \\ + --cxx-flags=-flax-vector-conversions \\ --cc=clang-7 --cxx=clang++-7 clang7-build \b archery benchmark diff gcc7-build clang7-build From a281ae8e6a6d51755d8adb2d552f299cc6565b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 11 Apr 2019 16:37:38 -0400 Subject: [PATCH 12/31] Various language fixes --- dev/archery/archery/cli.py | 4 ++-- dev/archery/archery/lang/cpp.py | 16 ++++++++-------- dev/archery/archery/utils/source.py | 8 ++++---- docs/source/developers/benchmarks.rst | 10 +++++----- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 987c9b27b25..1e0344ea586 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -110,8 +110,8 @@ def build(ctx, src, build_dir, force, targets, **kwargs): """ Initialize a C++ build directory. The build command creates a directory initialized with Arrow's cpp source - cmake configuration. It can also optionally invoke the generator to test - the build (and used in scripts). + cmake and configuration. It can also optionally invoke the generator to + test the build (and used in scripts). Note that archery will carry the caller environment. It will also not touch an existing directory, one must use the `--force` option to remove the diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 136dadc6b4e..a1fded5a095 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -22,7 +22,7 @@ from ..utils.cmake import CMakeDefinition -def thrutifier(value): +def truthifier(value): return "ON" if value else "OFF" @@ -61,14 +61,14 @@ def _gen_defs(self): yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug")) yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) - yield ("ARROW_BUILD_TESTS", thrutifier(self.with_tests)) - yield ("ARROW_BUILD_BENCHMARKS", thrutifier(self.with_benchmarks)) + yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests)) + yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks)) - yield ("ARROW_PYTHON", thrutifier(self.with_python)) - yield ("ARROW_PARQUET", thrutifier(self.with_parquet)) - yield ("ARROW_GANDIVA", thrutifier(self.with_gandiva)) - yield ("ARROW_PLASMA", thrutifier(self.with_plasma)) - yield ("ARROW_FLIGHT", thrutifier(self.with_flight)) + yield ("ARROW_PYTHON", truthifier(self.with_python)) + yield ("ARROW_PARQUET", truthifier(self.with_parquet)) + yield ("ARROW_GANDIVA", truthifier(self.with_gandiva)) + yield ("ARROW_PLASMA", truthifier(self.with_plasma)) + yield ("ARROW_FLIGHT", truthifier(self.with_flight)) @property def definitions(self): diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index 6ff05fb5498..3ba4983eb98 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -29,7 +29,7 @@ def __init__(self, path): """ Initialize an ArrowSources The caller must ensure that path is valid arrow source directory (can - be checked with ArrowSources.valid + be checked with ArrowSources.valid) Parameters ---------- @@ -71,7 +71,7 @@ def at_revision(self, revision, clone_dir): raise ValueError(f"{self} is not backed by git") # A local clone is required to leave the current sources intact such - # that build depending on said sources are not invalidated (or worse + # that builds depending on said sources are not invalidated (or worse # slightly affected when re-invoking the generator). git.clone("--local", self.path, clone_dir) git.checkout("-b", revision, git_dir=clone_dir) @@ -100,8 +100,8 @@ def find(path=None): 2. Checks if the environment variable `ARROW_SRC` is defined and use this. - 3. Checks if the current working directory (cwd) contains is an Arrow - source directory. + 3. Checks if the current working directory (cwd) is an Arrow source + directory. 4. Checks if this file (cli.py) is still in the original source repository. If so, returns the relative path to the source diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst index 0d69e9d645b..b403392ffbc 100644 --- a/docs/source/developers/benchmarks.rst +++ b/docs/source/developers/benchmarks.rst @@ -31,8 +31,8 @@ Installation ~~~~~~~~~~~~ The simplest way to install archery is with pip from the top-level directory. -It is recommended to use the ``-e,--editable`` flag such that pip don't copy -the module files but use the actual sources. +It is recommended to use the ``-e,--editable`` flag so that pip don't copy +the module files but uses the actual sources. .. code-block:: shell @@ -42,7 +42,7 @@ the module files but use the actual sources. Comparison ========== -One desire with benchmarks is to detect performance regressions. Thus, +One goal with benchmarking is to detect performance regressions. To this end, ``archery`` implements a benchmark comparison facility via the ``benchmark diff`` command. @@ -55,7 +55,7 @@ multiple examples of invocation. Iterating efficiently ~~~~~~~~~~~~~~~~~~~~~ -Iterating with benchmarks development can be a tedious process due to long +Iterating with benchmark development can be a tedious process due to long build time and long run times. ``archery benchmark diff`` provides 2 methods to reduce this overhead. @@ -86,7 +86,7 @@ expressions. --suite-filter=compute-aggregate --benchmark-filter=Kernel \ /tmp/arrow-bench*/{WORKSPACE,master}/build -Both method can be combined. +Both methods can be combined. Regression detection ==================== From 7696202ba2f145acd1bf4e17871edd72728027cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Fri, 12 Apr 2019 10:14:27 -0400 Subject: [PATCH 13/31] Add doc for bin attribute. --- dev/archery/archery/utils/command.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index 15a4a3411ab..32e8d9859a0 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -44,10 +44,14 @@ def wrapper(*argv, **kwargs): class Command: - def bin(self): - raise NotImplementedError("Command must implement bin() method") + """ A runnable command. + + Class inheriting from the Command class must provide the bin + property/attribute. + """ def run(self, *argv, **kwargs): + assert(hasattr(self, "bin")) invocation = [find_exec(self.bin)] invocation.extend(argv) From 90578af61bf546c06f7f22fe96243f478430eeaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Fri, 12 Apr 2019 10:25:48 -0400 Subject: [PATCH 14/31] Add --cmake-extras to build command --- dev/archery/archery/cli.py | 5 ++++- dev/archery/archery/lang/cpp.py | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 1e0344ea586..fe11f65acb5 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -99,11 +99,14 @@ def validate_arrow_sources(ctx, param, src): help="Build with Plasma object store support.") @click.option("--with-flight", default=False, type=bool, help="Build with Flight rpc support.") +@click.option("--cmake-extras", type=str, multiple=True, + help="Extra flags/options to pass to cmake invocation. " + "Can be stacked") # misc @click.option("-f", "--force", type=bool, is_flag=True, default=False, help="Delete existing build directory if found.") @click.option("--targets", type=str, multiple=True, - help="Generator targets to run") + help="Generator targets to run. Can be stacked.") @click.argument("build_dir", type=build_dir_type) @click.pass_context def build(ctx, src, build_dir, force, targets, **kwargs): diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index a1fded5a095..240bd3e6d48 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -38,7 +38,7 @@ def __init__(self, # components with_tests=True, with_benchmarks=False, with_python=True, with_parquet=False, with_gandiva=False, with_plasma=False, - with_flight=False): + with_flight=False, cmake_extras=None): self.cc = cc self.cxx = cxx self.cxx_flags = cxx_flags @@ -53,6 +53,7 @@ def __init__(self, self.with_gandiva = with_gandiva self.with_plasma = with_plasma self.with_flight = with_flight + self.cmake_extras = cmake_extras def _gen_defs(self): if self.cxx_flags: @@ -72,7 +73,8 @@ def _gen_defs(self): @property def definitions(self): - return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()] + extras = list(self.cmake_extras) if self.cmake_extras else [] + return [f"-D{d[0]}={d[1]}" for d in self._gen_defs()] + extras @property def environment(self): From d9692bc8fd5813215defbf35be481673070dbf49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Mon, 15 Apr 2019 12:53:32 -0400 Subject: [PATCH 15/31] Fix splitlines --- dev/archery/archery/benchmark/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index 9515d3fc6e3..b3b71b417ab 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -41,7 +41,7 @@ def list_benchmarks(self): argv.append(f"--benchmark_filter={self.benchmark_filter}") result = self.run(*argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return [b for b in result.stdout] + return str.splitlines(result.stdout.decode("utf-8")) def results(self): argv = ["--benchmark_format=json", "--benchmark_repetitions=20"] From 96f999748390d1d52f0712c788bd3ef1a2e1727a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Mon, 15 Apr 2019 13:02:39 -0400 Subject: [PATCH 16/31] Add gitignore entry --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6bb237af98e..6d08d4607d5 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ perf.data.old cpp/.idea/ cpp/apidoc/xml/ +dev/archery/archery.egg-info/ docs/example.gz docs/example1.dat docs/example3.dat From 1949f749ce6f9bd43014d54ce3496b8ea9ed45b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 16 Apr 2019 10:25:00 -0400 Subject: [PATCH 17/31] Supports HEAD revisions --- dev/archery/archery/utils/source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index 3ba4983eb98..45795a5ba6e 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -74,7 +74,7 @@ def at_revision(self, revision, clone_dir): # that builds depending on said sources are not invalidated (or worse # slightly affected when re-invoking the generator). git.clone("--local", self.path, clone_dir) - git.checkout("-b", revision, git_dir=clone_dir) + git.checkout(revision, git_dir=clone_dir) return ArrowSources(clone_dir) From 8845e3e78095393e2ed9265cf1604ea4b904bb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 16 Apr 2019 14:03:34 -0400 Subject: [PATCH 18/31] Remove empty __init__.py --- dev/archery/archery/__init__.py | 0 dev/archery/archery/benchmark/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 dev/archery/archery/__init__.py delete mode 100644 dev/archery/archery/benchmark/__init__.py diff --git a/dev/archery/archery/__init__.py b/dev/archery/archery/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/dev/archery/archery/benchmark/__init__.py b/dev/archery/archery/benchmark/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 From c3719214c64a4a0e7f9129bfba6a163afb67919a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 16 Apr 2019 14:39:06 -0400 Subject: [PATCH 19/31] Fix flake8 warnings --- dev/archery/archery/benchmark/google.py | 1 - dev/archery/archery/benchmark/runner.py | 3 +-- dev/archery/archery/cli.py | 1 - dev/archery/archery/lang/cpp.py | 2 -- dev/archery/archery/utils/command.py | 2 +- 5 files changed, 2 insertions(+), 7 deletions(-) diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index b3b71b417ab..c3c836a329a 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -21,7 +21,6 @@ from .core import Benchmark from ..utils.command import Command -from ..utils.git import git def partition(pred, iterable): diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index f9ef63e5251..158f2763667 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -18,9 +18,8 @@ import glob import os import re -import subprocess -from .core import Benchmark, BenchmarkSuite +from .core import BenchmarkSuite from .google import GoogleBenchmarkCommand, GoogleBenchmark from ..utils.logger import logger diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index fe11f65acb5..6408aabbad1 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -21,7 +21,6 @@ import json import logging import os -import re from tempfile import mkdtemp, TemporaryDirectory diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 240bd3e6d48..c83acb22b9b 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -15,10 +15,8 @@ # specific language governing permissions and limitations # under the License. -import glob import os -from ..benchmark.runner import BenchmarkRunner from ..utils.cmake import CMakeDefinition diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py index 32e8d9859a0..46d0066d23b 100644 --- a/dev/archery/archery/utils/command.py +++ b/dev/archery/archery/utils/command.py @@ -39,7 +39,7 @@ def strip_it(x): def wrapper(*argv, **kwargs): # Ensure stdout is captured kwargs["stdout"] = subprocess.PIPE - return strip_it(fn(*argv, **kwargs).stdout) + return strip_it(f(*argv, **kwargs).stdout) return wrapper From 71b10e98addb6363ebc49ab2e40c9143b38be682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 18 Apr 2019 14:11:14 -0400 Subject: [PATCH 20/31] Disable python in benchmarks --- dev/archery/archery/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 6408aabbad1..135117cd5b8 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -152,7 +152,8 @@ def tmpdir(preserve, prefix="arrow-bench-"): DEFAULT_BENCHMARK_CONF = CppConfiguration( - build_type="release", with_tests=True, with_benchmarks=True) + build_type="release", with_tests=True, with_benchmarks=True, + with_python=False) def cpp_runner_from_rev_or_path(src, root, rev_or_path): From 048ba0ede54bfb19b13152381be0a944a9381639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 18 Apr 2019 14:21:57 -0400 Subject: [PATCH 21/31] Add verbose_third_party --- dev/archery/archery/lang/cpp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index c83acb22b9b..90822fe6122 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -32,7 +32,7 @@ class CppConfiguration: def __init__(self, # toolchain cc=None, cxx=None, cxx_flags=None, - build_type=None, warn_level=None, + build_type=None, warn_level=None, verbose_third_party=True # components with_tests=True, with_benchmarks=False, with_python=True, with_parquet=False, with_gandiva=False, with_plasma=False, @@ -43,6 +43,7 @@ def __init__(self, self.build_type = build_type self.warn_level = warn_level + self.verbose_third_party = verbose_third_party self.with_tests = with_tests self.with_benchmarks = with_benchmarks @@ -59,6 +60,8 @@ def _gen_defs(self): yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug")) yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) + yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", + truthifier(self.verbose_third_party)) yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests)) yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks)) From e6762899cbf48795e9211d32d70b368673b50321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 18 Apr 2019 14:28:35 -0400 Subject: [PATCH 22/31] Typo --- dev/archery/archery/lang/cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 90822fe6122..9af0ad8d446 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -32,7 +32,7 @@ class CppConfiguration: def __init__(self, # toolchain cc=None, cxx=None, cxx_flags=None, - build_type=None, warn_level=None, verbose_third_party=True + build_type=None, warn_level=None, verbose_third_party=True, # components with_tests=True, with_benchmarks=False, with_python=True, with_parquet=False, with_gandiva=False, with_plasma=False, From 28254676c27f255303966c5aaf5adb4aff74570c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Mon, 22 Apr 2019 08:53:25 -0400 Subject: [PATCH 23/31] Add --cmake-extras to benchmark-diff command --- dev/archery/archery/cli.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 135117cd5b8..65a335aec94 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -151,12 +151,7 @@ def tmpdir(preserve, prefix="arrow-bench-"): yield tmp -DEFAULT_BENCHMARK_CONF = CppConfiguration( - build_type="release", with_tests=True, with_benchmarks=True, - with_python=False) - - -def cpp_runner_from_rev_or_path(src, root, rev_or_path): +def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf): build = None if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path): build = CMakeBuild.from_path(rev_or_path) @@ -173,7 +168,7 @@ def cpp_runner_from_rev_or_path(src, root, rev_or_path): # TODO: find a way to pass custom configuration without cluttering # the cli. Ideally via a configuration file that can be shared with the # `build` sub-command. - cmake_def = CppCMakeDefinition(src_rev.cpp, DEFAULT_BENCHMARK_CONF) + cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) build = cmake_def.build(os.path.join(root_rev, "build")) return CppBenchmarkRunner(build) @@ -206,13 +201,16 @@ def benchmark(ctx): is_flag=True, help="Preserve workspace for investigation.") @click.option("--threshold", type=float, default=0.05, show_default=True, help="Regression failure threshold in percentage.") +@click.option("--cmake-extras", type=str, multiple=True, + help="Extra flags/options to pass to cmake invocation. " + "Can be stacked") @click.argument("contender", metavar="[", default=Git.WORKSPACE, required=False) @click.argument("baseline", metavar="[]]", default="master", required=False) @click.pass_context def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, - threshold, contender, baseline): + threshold, cmake_extras, contender, baseline): """ Compare (diff) benchmark runs. This command acts like git-diff but for benchmark results. @@ -271,8 +269,12 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, logger.debug(f"Comparing {contender} (contender) with " f"{baseline} (baseline)") - runner_cont = cpp_runner_from_rev_or_path(src, root, contender) - runner_base = cpp_runner_from_rev_or_path(src, root, baseline) + conf = CppConfiguration( + build_type="release", with_tests=True, with_benchmarks=True, + with_python=False, cmake_extras=cmake_extras) + + runner_cont = cpp_runner_from_rev_or_path(src, root, contender, conf) + runner_base = cpp_runner_from_rev_or_path(src, root, baseline, conf) suites_cont = {s.name: s for s in runner_cont.suites(suite_filter, benchmark_filter)} From dc031bde7fa9080a68694ad6f08f05252b46c8e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Tue, 23 Apr 2019 09:48:44 -0400 Subject: [PATCH 24/31] Support conda toolchain --- dev/archery/archery/lang/cpp.py | 41 +++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 9af0ad8d446..669c6f59601 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -18,6 +18,7 @@ import os from ..utils.cmake import CMakeDefinition +from ..utils.logger import ctx def truthifier(value): @@ -32,7 +33,8 @@ class CppConfiguration: def __init__(self, # toolchain cc=None, cxx=None, cxx_flags=None, - build_type=None, warn_level=None, verbose_third_party=True, + build_type=None, warn_level=None, + install_prefix=None, use_conda=None, # components with_tests=True, with_benchmarks=False, with_python=True, with_parquet=False, with_gandiva=False, with_plasma=False, @@ -43,7 +45,8 @@ def __init__(self, self.build_type = build_type self.warn_level = warn_level - self.verbose_third_party = verbose_third_party + self._install_prefix = install_prefix + self._use_conda = use_conda self.with_tests = with_tests self.with_benchmarks = with_benchmarks @@ -60,8 +63,13 @@ def _gen_defs(self): yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug")) yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) - yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", - truthifier(self.verbose_third_party)) + + if not ctx.quiet: + yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") + + maybe_prefix = self.install_prefix + if maybe_prefix: + yield ("CMAKE_INSTALL_PREFIX", maybe_prefix) yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests)) yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks)) @@ -72,6 +80,31 @@ def _gen_defs(self): yield ("ARROW_PLASMA", truthifier(self.with_plasma)) yield ("ARROW_FLIGHT", truthifier(self.with_flight)) + # Detect custom conda toolchain + if self.use_conda: + for d, v in [('CMAKE_AR', 'AR'), ('CMAKE_RANLIB', 'RANLIB')]: + v = os.environ.get(v) + if v: + yield (d, v) + + @property + def install_prefix(self): + if self._install_prefix: + return self._install_prefix + + if self.use_conda: + return os.environ.get("CONDA_PREFIX") + + return None + + @property + def use_conda(self): + # If the user didn't specify a preference, guess via environment + if self._use_conda is None: + return os.environ.get("CONDA_PREFIX") is not None + + return self._use_conda + @property def definitions(self): extras = list(self.cmake_extras) if self.cmake_extras else [] From 280c93be4ea6954a06dd426975e95fcfcda6ab90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 24 Apr 2019 11:09:59 -0400 Subject: [PATCH 25/31] Update gitignore --- .gitignore | 3 ++- python/.gitignore | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6d08d4607d5..4a03020f950 100644 --- a/.gitignore +++ b/.gitignore @@ -45,12 +45,13 @@ perf.data.old cpp/.idea/ cpp/apidoc/xml/ -dev/archery/archery.egg-info/ docs/example.gz docs/example1.dat docs/example3.dat python/.eggs/ python/doc/ +# Egg metadata +*.egg-info .vscode .idea/ diff --git a/python/.gitignore b/python/.gitignore index 3346aa62df4..8f08f9377bf 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -25,8 +25,6 @@ pyarrow/include build # setup.py dist directory dist -# Egg metadata -*.egg-info # Coverage .coverage coverage.xml From 514e8e42819eb994c2446e65658ce295a618a29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 24 Apr 2019 13:57:06 -0400 Subject: [PATCH 26/31] Introduce RegressionSetArgs --- cpp/src/arrow/compute/benchmark-util.h | 13 +++++++++++++ .../arrow/compute/kernels/aggregate-benchmark.cc | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/benchmark-util.h b/cpp/src/arrow/compute/benchmark-util.h index 1678f8d51e2..865da6671e3 100644 --- a/cpp/src/arrow/compute/benchmark-util.h +++ b/cpp/src/arrow/compute/benchmark-util.h @@ -55,5 +55,18 @@ void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) { bench->Args({static_cast(size), nulls}); } +void RegressionSetArgs(benchmark::internal::Benchmark* bench) { + // Benchmark changed its parameter type between releases from + // int to int64_t. As it doesn't have version macros, we need + // to apply C++ template magic. + using ArgsType = + typename BenchmarkArgsType::type; + bench->Unit(benchmark::kMicrosecond); + + // Regressions should only bench L1 data for better stability + for (auto nulls : std::vector({0, 1, 10, 50})) + bench->Args({static_cast(kL1Size), nulls}); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc index 6d35dd8f477..bbc923f6ebd 100644 --- a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc @@ -328,7 +328,7 @@ static void RegressionSumKernel(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); } -BENCHMARK(RegressionSumKernel)->Apply(BenchmarkSetArgs); +BENCHMARK(RegressionSumKernel)->Apply(RegressionSetArgs); } // namespace compute } // namespace arrow From d8e3c1c85f288f3d3a8c13266c450ef6fba6263f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 24 Apr 2019 13:59:37 -0400 Subject: [PATCH 27/31] Review --- dev/archery/archery/benchmark/core.py | 97 +++---------------------- dev/archery/archery/benchmark/google.py | 77 +++++++++++++++----- dev/archery/archery/cli.py | 71 ++++++++---------- dev/archery/archery/lang/cpp.py | 5 +- dev/archery/archery/utils/cmake.py | 13 ++++ dev/archery/archery/utils/git.py | 7 +- dev/archery/archery/utils/source.py | 15 +++- dev/archery/setup.py | 4 +- docs/source/developers/benchmarks.rst | 3 + 9 files changed, 133 insertions(+), 159 deletions(-) diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py index 133945b49d0..83bc2735dd9 100644 --- a/dev/archery/archery/benchmark/core.py +++ b/dev/archery/archery/benchmark/core.py @@ -15,53 +15,24 @@ # specific language governing permissions and limitations # under the License. - -class Statistics: - def __init__(self, min, max, mean, stddev, q1, median, q3): - self.min = min - self.max = max - self.mean = mean - self.stddev = stddev - self.q1 = q1 - self.median = median - self.q3 = q3 - - @staticmethod - def from_values(values): - sorted(values) - n = len(values) - mean = sum(values) / len(values) - sum_diff = sum([(val - mean)**2 for val in values]) - stddev = (sum_diff / (n - 1))**0.5 if n > 1 else 0.0 - - return Statistics( - values[0], - values[-1], - mean, - stddev, - values[int(n/4)], - values[int(n/2)], - values[int((3*n)/4)], - ) +import pandas as pa class Benchmark: - def __init__(self, name, values, stats=None): + def __init__(self, name, unit, less_is_better, values, stats=None): self.name = name - self.values = values - self.statistics = stats if stats else Statistics.from_values(values) + self.unit = unit + self.less_is_better = less_is_better + self.values = pa.Series(values) + self.statistics = self.values.describe() @property def value(self): - return self.statistics.median - - @property - def unit(self): - return None + median = "50%" + return float(self.statistics[median]) - @property - def less_is_better(self): - return True + def __repr__(self): + return f"Benchmark[name={self.name},value={self.value}]" class BenchmarkSuite: @@ -73,51 +44,3 @@ def __repr__(self): name = self.name benchmarks = self.benchmarks return f"BenchmarkSuite[name={name}, benchmarks={benchmarks}]" - - -def regress(change, threshold): - # negative change is better, positive is tolerable until it exceeds - # threshold - return change > threshold - - -def changes(old, new): - if old == 0 and new == 0: - return 0.0 - if old == 0: - return float(new - old) / (float(old + new) / 2) - return float(new - old) / abs(old) - - -DEFAULT_THRESHOLD = 0.05 - - -class BenchmarkComparator: - def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD): - self.contender = contender - self.baseline = baseline - self.threshold = threshold - - def compare(self, comparator=None): - base = self.baseline.value - cont = self.contender.value - change = changes(base, cont) - adjusted_change = change - if not self.baseline.less_is_better: - adjusted_change = change * -1.0 - return { - "benchmark": self.baseline.name, - "change": change, - "regression": regress(adjusted_change, self.threshold), - "baseline": base, - "contender": cont, - "unit": self.baseline.unit, - "less_is_better": self.baseline.less_is_better, - } - - def confidence(self): - """ Indicate if a comparison of benchmarks should be trusted. """ - return True - - def __call__(self, **kwargs): - return self.compare(**kwargs) diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py index c3c836a329a..d6efb77dc22 100644 --- a/dev/archery/archery/benchmark/google.py +++ b/dev/archery/archery/benchmark/google.py @@ -30,6 +30,12 @@ def partition(pred, iterable): class GoogleBenchmarkCommand(Command): + """ Run a google benchmark binary. + + This assumes the binary supports the standard command line options, + notably `--benchmark_filter`, `--benchmark_format`, etc... + """ + def __init__(self, benchmark_bin, benchmark_filter=None): self.bin = benchmark_bin self.benchmark_filter = benchmark_filter @@ -52,24 +58,48 @@ def results(self): stderr=subprocess.PIPE).stdout) -class BenchmarkObservation: - """ Represents one run of a single benchmark. """ +class GoogleBenchmarkObservation: + """ Represents one run of a single (google c++) benchmark. + + Observations are found when running with `--benchmark_repetitions`. Sadly, + the format mixes values and aggregates, e.g. + + RegressionSumKernel/32768/0 1 us 1 us 25.8077GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.7066GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.1481GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.846GB/s + RegressionSumKernel/32768/0 1 us 1 us 25.6453GB/s + RegressionSumKernel/32768/0_mean 1 us 1 us 25.6307GB/s + RegressionSumKernel/32768/0_median 1 us 1 us 25.7066GB/s + RegressionSumKernel/32768/0_stddev 0 us 0 us 288.046MB/s - def __init__(self, **kwargs): - self._name = kwargs.get("name") - self.real_time = kwargs.get("real_time") - self.cpu_time = kwargs.get("cpu_time") - self.time_unit = kwargs.get("time_unit") - self.size = kwargs.get("size") - self.bytes_per_second = kwargs.get("bytes_per_second") + As from benchmark v1.4.1 (2019-04-24), the only way to differentiate an + actual run from the aggregates, is to match on the benchmark name. The + aggregates will be appended with `_$agg_name`. + + This class encapsulate the logic to separate runs from aggregate . This is + hopefully avoided in benchmark's master version with a separate json + attribute. + """ + + def __init__(self, name, real_time, cpu_time, time_unit, size=None, + bytes_per_second=None, **kwargs): + self._name = name + self.real_time = real_time + self.cpu_time = cpu_time + self.time_unit = time_unit + self.size = size + self.bytes_per_second = bytes_per_second @property def is_agg(self): + """ Indicate if the observation is a run or an aggregate. """ suffixes = ["_mean", "_median", "_stddev"] return any(map(lambda x: self._name.endswith(x), suffixes)) @property def is_realtime(self): + """ Indicate if the preferred value is realtime instead of cputime. """ return self.name.find("/realtime") != -1 @property @@ -95,20 +125,29 @@ def __repr__(self): class GoogleBenchmark(Benchmark): + """ A set of GoogleBenchmarkObservations. """ + def __init__(self, name, runs): + """ Initialize a GoogleBenchmark. + + Parameters + ---------- + name: str + Name of the benchmark + runs: list(GoogleBenchmarkObservation) + Repetitions of GoogleBenchmarkObservation run. + + """ self.name = name # exclude google benchmark aggregate artifacts _, runs = partition(lambda b: b.is_agg, runs) self.runs = sorted(runs, key=lambda b: b.value) - super().__init__(name, [b.value for b in self.runs]) - - @property - def unit(self): - return self.runs[0].unit - - @property - def less_is_better(self): - return self.runs[0].size is None + unit = self.runs[0].unit + # If `size` is found in the json dict, then the benchmark is reported + # in bytes per second + less_is_better = self.runs[0].size is None + values = [b.value for b in self.runs] + super().__init__(name, unit, less_is_better, values) def __repr__(self): return f"GoogleBenchmark[name={self.name},runs={self.runs}]" @@ -118,6 +157,6 @@ def from_json(cls, payload): def group_key(x): return x.name - benchmarks = map(lambda x: BenchmarkObservation(**x), payload) + benchmarks = map(lambda x: GoogleBenchmarkObservation(**x), payload) groups = groupby(sorted(benchmarks, key=group_key), group_key) return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 65a335aec94..c8929e7dd1b 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -21,14 +21,14 @@ import json import logging import os +import sys from tempfile import mkdtemp, TemporaryDirectory - -from .benchmark.core import BenchmarkComparator +from .benchmark.compare import RunnerComparator from .benchmark.runner import CppBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration from .utils.cmake import CMakeBuild -from .utils.git import Git +from .utils.codec import JsonEncoder from .utils.logger import logger, ctx as log_ctx from .utils.source import ArrowSources @@ -152,6 +152,14 @@ def tmpdir(preserve, prefix="arrow-bench-"): def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf): + """ Returns a CppBenchmarkRunner from a path or a git revision. + + First, it checks if `rev_or_path` points to a valid CMake build directory. + If so, it creates a CppBenchmarkRunner with this existing CMakeBuild. + + Otherwise, it assumes `rev_or_path` is a revision and clone/checkout the + given revision and create a fresh CMakeBuild. + """ build = None if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path): build = CMakeBuild.from_path(rev_or_path) @@ -159,21 +167,18 @@ def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf): root_rev = os.path.join(root, rev_or_path) os.mkdir(root_rev) - # Possibly checkout the sources at given revision - src_rev = src - if rev_or_path != Git.WORKSPACE: - root_src = os.path.join(root_rev, "arrow") - src_rev = src.at_revision(rev_or_path, root_src) - - # TODO: find a way to pass custom configuration without cluttering - # the cli. Ideally via a configuration file that can be shared with the - # `build` sub-command. + clone_dir = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision, no need to perform + # cleanup on cloned repository as root_rev is reclaimed. + src_rev, _ = src.at_revision(rev_or_path, clone_dir) cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) build = cmake_def.build(os.path.join(root_rev, "build")) return CppBenchmarkRunner(build) +# Running all benchmarks would be prohibitive. Benchmark who needs to be +# monitored for regression should be named with this prefix. DEFAULT_BENCHMARK_FILTER = "^Regression" @@ -204,8 +209,8 @@ def benchmark(ctx): @click.option("--cmake-extras", type=str, multiple=True, help="Extra flags/options to pass to cmake invocation. " "Can be stacked") -@click.argument("contender", metavar="[", default=Git.WORKSPACE, - required=False) +@click.argument("contender", metavar="[", + default=ArrowSources.WORKSPACE, required=False) @click.argument("baseline", metavar="[]]", default="master", required=False) @click.pass_context @@ -219,7 +224,7 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, unspecified, the contender will default to the current workspace (like git) and the baseline will default to master. - Each target (contender or baseline) can either be a git object reference + Each target (contender or baseline) can either be a git revision (commit, tag, special values like HEAD) or a cmake build directory. This allow comparing git commits, and/or different compilers and/or compiler flags. @@ -270,39 +275,21 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, f"{baseline} (baseline)") conf = CppConfiguration( - build_type="release", with_tests=True, with_benchmarks=True, - with_python=False, cmake_extras=cmake_extras) + build_type="release", with_tests=True, with_benchmarks=True, + with_python=False, cmake_extras=cmake_extras) runner_cont = cpp_runner_from_rev_or_path(src, root, contender, conf) runner_base = cpp_runner_from_rev_or_path(src, root, baseline, conf) - suites_cont = {s.name: s for s in runner_cont.suites(suite_filter, - benchmark_filter)} - suites_base = {s.name: s for s in runner_base.suites(suite_filter, - benchmark_filter)} - - for suite_name in sorted(suites_cont.keys() & suites_base.keys()): - logger.debug(f"Comparing {suite_name}") - - suite_cont = { - b.name: b for b in suites_cont[suite_name].benchmarks} - suite_base = { - b.name: b for b in suites_base[suite_name].benchmarks} - - for bench_name in sorted(suite_cont.keys() & suite_base.keys()): - logger.debug(f"Comparing {bench_name}") - - bench_cont = suite_cont[bench_name] - bench_base = suite_base[bench_name] - - comparator = BenchmarkComparator(bench_cont, bench_base, - threshold=threshold) - comparison = comparator() + runner_comp = RunnerComparator(runner_cont, runner_base, threshold) + comparisons = runner_comp.comparisons(suite_filter, benchmark_filter) - comparison["suite"] = suite_name - comparison["benchmark"] = bench_name + regressions = 0 + for comparator in comparisons: + regressions += comparator.regression + print(json.dumps(comparator, cls=JsonEncoder)) - print(json.dumps(comparison)) + sys.exit(regressions) if __name__ == "__main__": diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 669c6f59601..84b6346e14e 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -18,7 +18,6 @@ import os from ..utils.cmake import CMakeDefinition -from ..utils.logger import ctx def truthifier(value): @@ -64,8 +63,8 @@ def _gen_defs(self): yield ("CMAKE_BUILD_TYPE", or_else(self.build_type, "debug")) yield ("BUILD_WARNING_LEVEL", or_else(self.warn_level, "production")) - if not ctx.quiet: - yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") + # if not ctx.quiet: + # yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") maybe_prefix = self.install_prefix if maybe_prefix: diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py index a549f07b8b4..9fa5ca66f52 100644 --- a/dev/archery/archery/utils/cmake.py +++ b/dev/archery/archery/utils/cmake.py @@ -168,12 +168,25 @@ def test(self): @staticmethod def is_build_dir(path): + """ Indicate if a path is CMake build directory. + + This method only checks for the existence of paths and does not do any + validation whatsoever. + """ cmake_cache = os.path.join(path, "CMakeCache.txt") cmake_files = os.path.join(path, "CMakeFiles") return os.path.exists(cmake_cache) and os.path.exists(cmake_files) @staticmethod def from_path(path): + """ Instantiate a CMakeBuild from a path. + + This is used to recover from an existing physical directory (created + with or without CMakeBuild). + + Note that this method is not idempotent as the original definition will + be lost. Only some parameters are recovered (generator and build_type). + """ if not CMakeBuild.is_build_dir(path): raise ValueError(f"Not a valid CMakeBuild path: {path}") diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py index 2d62eeb31bd..c6113524f1b 100644 --- a/dev/archery/archery/utils/git.py +++ b/dev/archery/archery/utils/git.py @@ -32,9 +32,6 @@ def wrapper(self, *argv, **kwargs): class Git(Command): - WORKSPACE = "WORKSPACE" - HEAD = "HEAD" - def __init__(self, git_bin=None): self.bin = git_bin if git_bin else os.environ.get("GIT", "git") @@ -66,11 +63,11 @@ def rev_parse(self, *argv, **kwargs): @capture_stdout(strip=True) def head(self, **kwargs): """ Return commit pointed by HEAD. """ - return self.rev_parse(Git.HEAD, **kwargs) + return self.rev_parse("HEAD", **kwargs) @capture_stdout(strip=True) def current_branch(self, **kwargs): - return self.rev_parse("--abbrev-ref", Git.HEAD, **kwargs) + return self.rev_parse("--abbrev-ref", "HEAD", **kwargs) git = Git() diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index 45795a5ba6e..12dc735ea0b 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -24,6 +24,10 @@ class ArrowSources: """ ArrowSources is a companion class representing a directory containing Apache Arrow's sources. """ + # Note that WORKSPACE is a reserved git revision name by this module to + # reference the current git workspace. In other words, this indicates to + # ArrowSources.at_revision that no cloning/checkout is required. + WORKSPACE = "WORKSPACE" def __init__(self, path): """ Initialize an ArrowSources @@ -60,6 +64,12 @@ def at_revision(self, revision, clone_dir): This method may return the current object if no checkout is required. The caller is responsible to remove the cloned repository directory. + The user can use the special WORKSPACE token to mean the current git + workspace (no checkout performed). + + The second value of the returned tuple indicates if a clone was + performed. + Parameters ---------- revision : str @@ -70,13 +80,16 @@ def at_revision(self, revision, clone_dir): if not self.git_backed: raise ValueError(f"{self} is not backed by git") + if revision == ArrowSources.WORKSPACE: + return self, False + # A local clone is required to leave the current sources intact such # that builds depending on said sources are not invalidated (or worse # slightly affected when re-invoking the generator). git.clone("--local", self.path, clone_dir) git.checkout(revision, git_dir=clone_dir) - return ArrowSources(clone_dir) + return ArrowSources(clone_dir), True @staticmethod def valid(src): diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 54f1c14966e..2cf692c005a 100644 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -21,7 +21,7 @@ if sys.version_info < (3, 5): - sys.exit('Python < 3.5 is not supported due to subprocess.run') + sys.exit('Python < 3.5 is not supported') setup( @@ -32,7 +32,7 @@ maintainer='Arrow Developers', maintainer_email='dev@arrow.apache.org', packages=['archery'], - install_requires=['click'], + install_requires=['click', 'pandas'], entry_points=''' [console_scripts] archery=archery.cli:archery diff --git a/docs/source/developers/benchmarks.rst b/docs/source/developers/benchmarks.rst index b403392ffbc..d0e6f1b8aa4 100644 --- a/docs/source/developers/benchmarks.rst +++ b/docs/source/developers/benchmarks.rst @@ -39,6 +39,9 @@ the module files but uses the actual sources. pip install -e dev/archery archery --help + # optional: enable bash/zsh autocompletion + eval "$(_ARCHERY_COMPLETE=source archery)" + Comparison ========== From 2a953f1808566da01bbb90faeabe8131ff55f902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 24 Apr 2019 14:05:38 -0400 Subject: [PATCH 28/31] Missing files --- dev/archery/archery/benchmark/compare.py | 120 +++++++++++++++++++++++ dev/archery/archery/utils/codec.py | 43 ++++++++ dev/archery/tests/test_benchmarks.py | 39 ++++++++ 3 files changed, 202 insertions(+) create mode 100644 dev/archery/archery/benchmark/compare.py create mode 100644 dev/archery/archery/utils/codec.py create mode 100644 dev/archery/tests/test_benchmarks.py diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py new file mode 100644 index 00000000000..74a2538157b --- /dev/null +++ b/dev/archery/archery/benchmark/compare.py @@ -0,0 +1,120 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +DEFAULT_THRESHOLD = 0.05 + + +class BenchmarkComparator: + """ Compares two benchmarks. + + Encodes the logic of comparing two benchmarks and taking a decision on + if it induce a regression. + """ + + def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD, + suite_name=None): + self.contender = contender + self.baseline = baseline + self.threshold = threshold + self.suite_name = suite_name + + @property + def name(self): + return self.baseline.name + + @property + def less_is_better(self): + return self.baseline.less_is_better + + @property + def unit(self): + return self.baseline.unit + + @property + def change(self): + new = self.contender.value + old = self.baseline.value + + if old == 0 and new == 0: + return 0.0 + if old == 0: + return 0.0 + + return float(new - old) / abs(old) + + @property + def confidence(self): + """ Indicate if a comparison of benchmarks should be trusted. """ + return True + + @property + def regression(self): + change = self.change + adjusted_change = change if self.less_is_better else -change + return (self.confidence and adjusted_change > self.threshold) + + def compare(self, comparator=None): + return { + "benchmark": self.name, + "change": self.change, + "regression": self.regression, + "baseline": self.baseline.value, + "contender": self.contender.value, + "unit": self.unit, + "less_is_better": self.less_is_better, + } + + def __call__(self, **kwargs): + return self.compare(**kwargs) + + +def pairwise_compare(contender, baseline): + dict_contender = {e.name: e for e in contender} + dict_baseline = {e.name: e for e in baseline} + + for name in (dict_contender.keys() & dict_baseline.keys()): + yield name, (dict_contender[name], dict_baseline[name]) + + +class RunnerComparator: + """ Compares suites/benchmarks from runners. + + It is up to the caller that ensure that runners are compatible (both from + the same language implementation). + """ + + def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD): + self.contender = contender + self.baseline = baseline + self.threshold = threshold + + def comparisons(self, suite_filter=None, benchmark_filter=None): + """ + """ + contender = self.contender.suites(suite_filter, benchmark_filter) + baseline = self.baseline.suites(suite_filter, benchmark_filter) + suites = pairwise_compare(contender, baseline) + + for suite_name, (suite_cont, suite_base) in suites: + benchmarks = pairwise_compare( + suite_cont.benchmarks, suite_base.benchmarks) + + for bench_name, (bench_cont, bench_base) in benchmarks: + yield BenchmarkComparator(bench_cont, bench_base, + threshold=self.threshold, + suite_name=suite_name) diff --git a/dev/archery/archery/utils/codec.py b/dev/archery/archery/utils/codec.py new file mode 100644 index 00000000000..612f2df5605 --- /dev/null +++ b/dev/archery/archery/utils/codec.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +import json + +from ..benchmark.compare import BenchmarkComparator + + +class JsonEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, BenchmarkComparator): + comparator = { + "benchmark": o.name, + "change": o.change, + "regression": o.regression, + "baseline": o.baseline.value, + "contender": o.contender.value, + "unit": o.unit, + "less_is_better": o.less_is_better, + } + + suite_name = o.suite_name + if suite_name: + comparator["suite"] = suite_name + + return comparator + + return json.JSONEncoder.default(self, o) diff --git a/dev/archery/tests/test_benchmarks.py b/dev/archery/tests/test_benchmarks.py new file mode 100644 index 00000000000..d199a40a195 --- /dev/null +++ b/dev/archery/tests/test_benchmarks.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from archery.benchmark.core import Benchmark +from archery.benchmark.compare import BenchmarkComparator + + +def test_benchmark_comparator(): + unit = "micros" + + assert not BenchmarkComparator( + Benchmark("contender", unit, True, [10]), + Benchmark("baseline", unit, True, [20])).regression + + assert BenchmarkComparator( + Benchmark("contender", unit, False, [10]), + Benchmark("baseline", unit, False, [20])).regression + + assert BenchmarkComparator( + Benchmark("contender", unit, True, [20]), + Benchmark("baseline", unit, True, [10])).regression + + assert not BenchmarkComparator( + Benchmark("contender", unit, False, [20]), + Benchmark("baseline", unit, False, [10])).regression From ee39a1feb742dc745fe9af7d9cfc3446c71e74a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 24 Apr 2019 15:40:58 -0400 Subject: [PATCH 29/31] Move cpp_runner_from_rev_or_path in CppRunner --- dev/archery/archery/benchmark/runner.py | 29 ++++++++++++++++++++++ dev/archery/archery/cli.py | 33 +++---------------------- dev/archery/archery/utils/cmake.py | 4 +-- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 158f2763667..22d1621ab7d 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -21,6 +21,8 @@ from .core import BenchmarkSuite from .google import GoogleBenchmarkCommand, GoogleBenchmark +from ..lang.cpp import CppCMakeDefinition +from ..utils.cmake import CMakeBuild from ..utils.logger import logger @@ -83,3 +85,30 @@ def suites(self, suite_filter=None, benchmark_filter=None): continue yield suite + + @staticmethod + def from_rev_or_path(src, root, rev_or_path, cmake_conf): + """ Returns a CppBenchmarkRunner from a path or a git revision. + + First, it checks if `rev_or_path` points to a valid CMake build + directory. If so, it creates a CppBenchmarkRunner with this existing + CMakeBuild. + + Otherwise, it assumes `rev_or_path` is a revision and clone/checkout + the given revision and create a fresh CMakeBuild. + """ + build = None + if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path): + build = CMakeBuild.from_path(rev_or_path) + else: + root_rev = os.path.join(root, rev_or_path) + os.mkdir(root_rev) + + clone_dir = os.path.join(root_rev, "arrow") + # Possibly checkout the sources at given revision, no need to + # perform cleanup on cloned repository as root_rev is reclaimed. + src_rev, _ = src.at_revision(rev_or_path, clone_dir) + cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) + build = cmake_def.build(os.path.join(root_rev, "build")) + + return CppBenchmarkRunner(build) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index c8929e7dd1b..2f203123ec5 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -27,7 +27,6 @@ from .benchmark.compare import RunnerComparator from .benchmark.runner import CppBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration -from .utils.cmake import CMakeBuild from .utils.codec import JsonEncoder from .utils.logger import logger, ctx as log_ctx from .utils.source import ArrowSources @@ -151,32 +150,6 @@ def tmpdir(preserve, prefix="arrow-bench-"): yield tmp -def cpp_runner_from_rev_or_path(src, root, rev_or_path, cmake_conf): - """ Returns a CppBenchmarkRunner from a path or a git revision. - - First, it checks if `rev_or_path` points to a valid CMake build directory. - If so, it creates a CppBenchmarkRunner with this existing CMakeBuild. - - Otherwise, it assumes `rev_or_path` is a revision and clone/checkout the - given revision and create a fresh CMakeBuild. - """ - build = None - if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path): - build = CMakeBuild.from_path(rev_or_path) - else: - root_rev = os.path.join(root, rev_or_path) - os.mkdir(root_rev) - - clone_dir = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision, no need to perform - # cleanup on cloned repository as root_rev is reclaimed. - src_rev, _ = src.at_revision(rev_or_path, clone_dir) - cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) - build = cmake_def.build(os.path.join(root_rev, "build")) - - return CppBenchmarkRunner(build) - - # Running all benchmarks would be prohibitive. Benchmark who needs to be # monitored for regression should be named with this prefix. DEFAULT_BENCHMARK_FILTER = "^Regression" @@ -278,8 +251,10 @@ def benchmark_diff(ctx, src, preserve, suite_filter, benchmark_filter, build_type="release", with_tests=True, with_benchmarks=True, with_python=False, cmake_extras=cmake_extras) - runner_cont = cpp_runner_from_rev_or_path(src, root, contender, conf) - runner_base = cpp_runner_from_rev_or_path(src, root, baseline, conf) + runner_cont = CppBenchmarkRunner.from_rev_or_path( + src, root, contender, conf) + runner_base = CppBenchmarkRunner.from_rev_or_path( + src, root, baseline, conf) runner_comp = RunnerComparator(runner_cont, runner_base, threshold) comparisons = runner_comp.comparisons(suite_filter, benchmark_filter) diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py index 9fa5ca66f52..38aedab2d16 100644 --- a/dev/archery/archery/utils/cmake.py +++ b/dev/archery/archery/utils/cmake.py @@ -200,9 +200,7 @@ def from_path(path): cmake_cache_path = os.path.join(path, "CMakeCache.txt") with open(cmake_cache_path, "r") as cmake_cache: candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read()) - if len(candidates) != 1: - ValueError("Could not chose build_type from {candidates}") - build_type = candidates[0].lower() + build_type = candidates[0].lower() if candidates else "release" return CMakeBuild(path, generator, build_type) From e95baf3178eb847d934766bb4e746fd1459600ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 24 Apr 2019 20:28:48 -0400 Subject: [PATCH 30/31] Add comments and move stuff --- dev/archery/archery/benchmark/compare.py | 2 ++ dev/archery/archery/cli.py | 8 ++++++-- dev/archery/archery/utils/logger.py | 1 - 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py index 74a2538157b..bf9811f2c8d 100644 --- a/dev/archery/archery/benchmark/compare.py +++ b/dev/archery/archery/benchmark/compare.py @@ -16,6 +16,8 @@ # under the License. +# Define a global regression threshold as 5%. This is purely subjective and +# flawed. This does not track cumulative regression. DEFAULT_THRESHOLD = 0.05 diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 2f203123ec5..8b4780af987 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -24,13 +24,16 @@ import sys from tempfile import mkdtemp, TemporaryDirectory -from .benchmark.compare import RunnerComparator +from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD from .benchmark.runner import CppBenchmarkRunner from .lang.cpp import CppCMakeDefinition, CppConfiguration from .utils.codec import JsonEncoder from .utils.logger import logger, ctx as log_ctx from .utils.source import ArrowSources +# Set default logging to INFO in command line. +logging.basicConfig(level=logging.INFO) + @click.group() @click.option("--debug", type=bool, is_flag=True, default=False, @@ -177,7 +180,8 @@ def benchmark(ctx): help="Regex filtering benchmark suites.") @click.option("--preserve", type=bool, default=False, show_default=True, is_flag=True, help="Preserve workspace for investigation.") -@click.option("--threshold", type=float, default=0.05, show_default=True, +@click.option("--threshold", type=float, default=DEFAULT_THRESHOLD, + show_default=True, help="Regression failure threshold in percentage.") @click.option("--cmake-extras", type=str, multiple=True, help="Extra flags/options to pass to cmake invocation. " diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py index 0a4a99edb2f..9d0feda88e6 100644 --- a/dev/archery/archery/utils/logger.py +++ b/dev/archery/archery/utils/logger.py @@ -18,7 +18,6 @@ import logging """ Global logger. """ -logging.basicConfig(level=logging.INFO) logger = logging.getLogger("archery") From a047ae4ed4ae884b8c8e9721ba4ab4d389130166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Thu, 25 Apr 2019 10:06:05 -0400 Subject: [PATCH 31/31] Satisfy flake8 --- dev/archery/archery/benchmark/runner.py | 2 +- dev/archery/archery/cli.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py index 22d1621ab7d..7dc56bdf096 100644 --- a/dev/archery/archery/benchmark/runner.py +++ b/dev/archery/archery/benchmark/runner.py @@ -98,7 +98,7 @@ def from_rev_or_path(src, root, rev_or_path, cmake_conf): the given revision and create a fresh CMakeBuild. """ build = None - if os.path.exists(rev_or_path) and CMakeBuild.is_build_dir(rev_or_path): + if CMakeBuild.is_build_dir(rev_or_path): build = CMakeBuild.from_path(rev_or_path) else: root_rev = os.path.join(root, rev_or_path) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 8b4780af987..4fa88966080 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -20,7 +20,6 @@ from contextlib import contextmanager import json import logging -import os import sys from tempfile import mkdtemp, TemporaryDirectory