From 8dd562d684b24eafc25219883b72f819bc37561f Mon Sep 17 00:00:00 2001 From: Diana Clarke Date: Tue, 8 Feb 2022 18:48:49 -0700 Subject: [PATCH 1/2] DataFusion + Conbench Integration --- conbench/.flake8 | 2 + conbench/.gitignore | 130 +++++++++++++++ conbench/.isort.cfg | 2 + conbench/README.md | 252 ++++++++++++++++++++++++++++++ conbench/_criterion.py | 99 ++++++++++++ conbench/benchmarks.json | 8 + conbench/benchmarks.py | 41 +++++ conbench/requirements-test.txt | 3 + conbench/requirements.txt | 1 + dev/release/rat_exclude_files.txt | 5 + 10 files changed, 543 insertions(+) create mode 100644 conbench/.flake8 create mode 100755 conbench/.gitignore create mode 100644 conbench/.isort.cfg create mode 100644 conbench/README.md create mode 100644 conbench/_criterion.py create mode 100644 conbench/benchmarks.json create mode 100644 conbench/benchmarks.py create mode 100644 conbench/requirements-test.txt create mode 100644 conbench/requirements.txt diff --git a/conbench/.flake8 b/conbench/.flake8 new file mode 100644 index 0000000000000..e44b81084185c --- /dev/null +++ b/conbench/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501 diff --git a/conbench/.gitignore b/conbench/.gitignore new file mode 100755 index 0000000000000..aa44ee2adbd4c --- /dev/null +++ b/conbench/.gitignore @@ -0,0 +1,130 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + diff --git a/conbench/.isort.cfg b/conbench/.isort.cfg new file mode 100644 index 0000000000000..f238bf7ea137e --- /dev/null +++ b/conbench/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black diff --git a/conbench/README.md b/conbench/README.md new file mode 100644 index 0000000000000..ec4a300e4b7b8 --- /dev/null +++ b/conbench/README.md @@ -0,0 +1,252 @@ + + +# DataFusion + Conbench Integration + + +## Quick start + +``` +$ cd ~/arrow-datafusion/conbench/ +$ conda create -y -n conbench python=3.9 +$ conda activate conbench +(conbench) $ pip install -r requirements.txt +(conbench) $ conbench datafusion --src-dir=/Users/diana/workspace/arrow-datafusion +``` + +## Example output + +``` +{ + "batch_id": "3c82f9d23fce49328b78ba9fd963b254", + "context": { + "benchmark_language": "Rust" + }, + "github": { + "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d", + "repository": "https://github.com/dianaclarke/arrow-datafusion" + }, + "info": {}, + "machine_info": { + "architecture_name": "x86_64", + "cpu_core_count": "8", + "cpu_frequency_max_hz": "2400000000", + "cpu_l1d_cache_bytes": "65536", + "cpu_l1i_cache_bytes": "131072", + "cpu_l2_cache_bytes": "4194304", + "cpu_l3_cache_bytes": "0", + "cpu_model_name": "Apple M1", + "cpu_thread_count": "8", + "gpu_count": "0", + "gpu_product_names": [], + "kernel_name": "20.6.0", + "memory_bytes": "17179869184", + "name": "diana", + "os_name": "macOS", + "os_version": "10.16" + }, + "run_id": "ec2a50b9380c470b96d7eb7d63ab5b77", + "stats": { + "data": [ + "0.001532", + "0.001394", + "0.001333", + "0.001356", + "0.001379", + "0.001361", + "0.001307", + "0.001348", + "0.001436", + "0.001397", + "0.001339", + "0.001523", + "0.001593", + "0.001415", + "0.001344", + "0.001312", + "0.001402", + "0.001362", + "0.001329", + "0.001330", + "0.001447", + "0.001413", + "0.001536", + "0.001330", + "0.001333", + "0.001338", + "0.001333", + "0.001331", + "0.001426", + "0.001575", + "0.001362", + "0.001343", + "0.001334", + "0.001383", + "0.001476", + "0.001356", + "0.001362", + "0.001334", + "0.001390", + "0.001497", + "0.001330", + "0.001347", + "0.001331", + "0.001468", + "0.001377", + "0.001351", + "0.001328", + "0.001509", + "0.001338", + "0.001355", + "0.001332", + "0.001485", + "0.001370", + "0.001366", + "0.001507", + "0.001358", + "0.001331", + "0.001463", + "0.001362", + "0.001336", + "0.001428", + "0.001343", + "0.001359", + "0.001905", + "0.001726", + "0.001411", + "0.001433", + "0.001391", + "0.001453", + "0.001346", + "0.001339", + "0.001420", + "0.001330", + "0.001422", + "0.001683", + "0.001426", + "0.001349", + "0.001342", + "0.001430", + "0.001330", + "0.001436", + "0.001331", + "0.001415", + "0.001332", + "0.001408", + "0.001343", + "0.001392", + "0.001371", + "0.001655", + "0.001354", + "0.001438", + "0.001347", + "0.001341", + "0.001374", + "0.001453", + "0.001352", + "0.001358", + "0.001398", + "0.001362", + "0.001454" + ], + "iqr": "0.000088", + "iterations": 100, + "max": "0.001905", + "mean": "0.001401", + "median": "0.001362", + "min": "0.001307", + "q1": "0.001340", + "q3": "0.001428", + "stdev": "0.000095", + "time_unit": "s", + "times": [], + "unit": "s" + }, + "tags": { + "name": "aggregate_query_group_by", + "suite": "aggregate_query_group_by" + }, + "timestamp": "2022-02-09T01:32:55.769468+00:00" +} +``` + +## Debug with test benchmark + +``` +(conbench) $ cd ~/arrow-datafusion/conbench/ +(conbench) $ conbench test --iterations=3 + +Benchmark result: +{ + "batch_id": "41a144761bc24d82b94efa70d6e460b3", + "context": { + "benchmark_language": "Python" + }, + "github": { + "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d", + "repository": "https://github.com/dianaclarke/arrow-datafusion" + }, + "info": { + "benchmark_language_version": "Python 3.9.7" + }, + "machine_info": { + "architecture_name": "x86_64", + "cpu_core_count": "8", + "cpu_frequency_max_hz": "2400000000", + "cpu_l1d_cache_bytes": "65536", + "cpu_l1i_cache_bytes": "131072", + "cpu_l2_cache_bytes": "4194304", + "cpu_l3_cache_bytes": "0", + "cpu_model_name": "Apple M1", + "cpu_thread_count": "8", + "gpu_count": "0", + "gpu_product_names": [], + "kernel_name": "20.6.0", + "memory_bytes": "17179869184", + "name": "diana", + "os_name": "macOS", + "os_version": "10.16" + }, + "run_id": "71f46362db8844afacea82cba119cefc", + "stats": { + "data": [ + "0.000001", + "0.000001", + "0.000000" + ], + "iqr": "0.000000", + "iterations": 3, + "max": "0.000001", + "mean": "0.000001", + "median": "0.000001", + "min": "0.000000", + "q1": "0.000000", + "q3": "0.000001", + "stdev": "0.000001", + "time_unit": "s", + "times": [], + "unit": "s" + }, + "tags": { + "name": "test" + }, + "timestamp": "2022-02-09T01:36:45.823615+00:00" +} +``` + diff --git a/conbench/_criterion.py b/conbench/_criterion.py new file mode 100644 index 0000000000000..7e40ef83ef1e4 --- /dev/null +++ b/conbench/_criterion.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import collections +import csv +import os +import pathlib +import subprocess + +import conbench.runner +from conbench.machine_info import github_info + + +def _result_in_seconds(row): + # sample_measured_value - The value of the measurement for this sample. + # Note that this is the measured value for the whole sample, not the + # time-per-iteration To calculate the time-per-iteration, use + # sample_measured_value/iteration_count + # -- https://bheisler.github.io/criterion.rs/book/user_guide/csv_output.html + count = int(row["iteration_count"]) + sample = float(row["sample_measured_value"]) + return sample / count / 10**9 + + +def _parse_benchmark_group(row): + parts = row["group"].split(",") + if len(parts) > 1: + suite, name = parts[0], ",".join(parts[1:]) + else: + suite, name = row["group"], row["group"] + return suite.strip(), name.strip() + + +def _read_results(src_dir): + results = collections.defaultdict(lambda: collections.defaultdict(list)) + path = pathlib.Path(os.path.join(src_dir, "target", "criterion")) + for path in list(path.glob("**/new/raw.csv")): + with open(path) as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + suite, name = _parse_benchmark_group(row) + results[suite][name].append(_result_in_seconds(row)) + return results + + +def _execute_command(command): + try: + print(command) + result = subprocess.run(command, capture_output=True, check=True) + except subprocess.CalledProcessError as e: + print(e.stderr.decode("utf-8")) + raise e + return result.stdout.decode("utf-8"), result.stderr.decode("utf-8") + + +class CriterionBenchmark(conbench.runner.Benchmark): + external, iterations = True, None + options = {"src_dir": {"type": str}} + + def run(self, **kwargs): + src_dir = kwargs["src_dir"] + self._cargo_bench(src_dir) + results = _read_results(src_dir) + for suite in results: + self.conbench.mark_new_batch() + for name, data in results[suite].items(): + yield self._record_result(suite, name, data, kwargs) + + def _cargo_bench(self, src_dir): + os.chdir(src_dir) + _execute_command(["cargo", "bench"]) + + def _record_result(self, suite, name, data, options): + tags = {"suite": suite} + result = {"data": data, "unit": "s"} + context = {"benchmark_language": "Rust"} + github = github_info() + return self.conbench.record( + result, + name, + tags=tags, + context=context, + github=github, + options=options, + ) diff --git a/conbench/benchmarks.json b/conbench/benchmarks.json new file mode 100644 index 0000000000000..bb7033547722b --- /dev/null +++ b/conbench/benchmarks.json @@ -0,0 +1,8 @@ +[ + { + "command": "datafusion", + "flags": { + "language": "Rust" + } + } +] diff --git a/conbench/benchmarks.py b/conbench/benchmarks.py new file mode 100644 index 0000000000000..9ad3e314ee4e7 --- /dev/null +++ b/conbench/benchmarks.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import conbench.runner + +import _criterion + + +@conbench.runner.register_benchmark +class TestBenchmark(conbench.runner.Benchmark): + name = "test" + + def run(self, **kwargs): + yield self.conbench.benchmark( + self._f(), + self.name, + options=kwargs, + ) + + def _f(self): + return lambda: 1 + 1 + + +@conbench.runner.register_benchmark +class CargoBenchmarks(_criterion.CriterionBenchmark): + name = "datafusion" + description = "Run Arrow Datafusion micro benchmarks." diff --git a/conbench/requirements-test.txt b/conbench/requirements-test.txt new file mode 100644 index 0000000000000..5e5647acd2d64 --- /dev/null +++ b/conbench/requirements-test.txt @@ -0,0 +1,3 @@ +black +flake8 +isort diff --git a/conbench/requirements.txt b/conbench/requirements.txt new file mode 100644 index 0000000000000..a877c7b44e9be --- /dev/null +++ b/conbench/requirements.txt @@ -0,0 +1 @@ +conbench diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 304c08e808991..d35b5a81b660d 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -17,6 +17,11 @@ CHANGELOG.md datafusion/CHANGELOG.md ballista/CHANGELOG.md python/CHANGELOG.md +conbench/benchmarks.json +conbench/requirements.txt +conbench/requirements-test.txt +conbench/.flake8 +conbench/.isort.cfg dev/requirements*.txt dev/archery/MANIFEST.in dev/archery/requirements*.txt From e52330b2e958a99a5085070127eeeefb9a27892f Mon Sep 17 00:00:00 2001 From: Diana Clarke Date: Wed, 9 Feb 2022 08:50:51 -0700 Subject: [PATCH 2/2] remove --src-dir --- conbench/README.md | 2 +- conbench/_criterion.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/conbench/README.md b/conbench/README.md index ec4a300e4b7b8..f655ac8bd2972 100644 --- a/conbench/README.md +++ b/conbench/README.md @@ -27,7 +27,7 @@ $ cd ~/arrow-datafusion/conbench/ $ conda create -y -n conbench python=3.9 $ conda activate conbench (conbench) $ pip install -r requirements.txt -(conbench) $ conbench datafusion --src-dir=/Users/diana/workspace/arrow-datafusion +(conbench) $ conbench datafusion ``` ## Example output diff --git a/conbench/_criterion.py b/conbench/_criterion.py index 7e40ef83ef1e4..168a1b9b6cb10 100644 --- a/conbench/_criterion.py +++ b/conbench/_criterion.py @@ -68,11 +68,10 @@ def _execute_command(command): class CriterionBenchmark(conbench.runner.Benchmark): - external, iterations = True, None - options = {"src_dir": {"type": str}} + external = True def run(self, **kwargs): - src_dir = kwargs["src_dir"] + src_dir = os.path.join(os.getcwd(), "..") self._cargo_bench(src_dir) results = _read_results(src_dir) for suite in results: