From 8dd562d684b24eafc25219883b72f819bc37561f Mon Sep 17 00:00:00 2001
From: Diana Clarke <diana.joan.clarke@gmail.com>
Date: Tue, 8 Feb 2022 18:48:49 -0700
Subject: [PATCH 1/2] DataFusion + Conbench Integration

---
 conbench/.flake8                  |   2 +
 conbench/.gitignore               | 130 +++++++++++++++
 conbench/.isort.cfg               |   2 +
 conbench/README.md                | 252 ++++++++++++++++++++++++++++++
 conbench/_criterion.py            |  99 ++++++++++++
 conbench/benchmarks.json          |   8 +
 conbench/benchmarks.py            |  41 +++++
 conbench/requirements-test.txt    |   3 +
 conbench/requirements.txt         |   1 +
 dev/release/rat_exclude_files.txt |   5 +
 10 files changed, 543 insertions(+)
 create mode 100644 conbench/.flake8
 create mode 100755 conbench/.gitignore
 create mode 100644 conbench/.isort.cfg
 create mode 100644 conbench/README.md
 create mode 100644 conbench/_criterion.py
 create mode 100644 conbench/benchmarks.json
 create mode 100644 conbench/benchmarks.py
 create mode 100644 conbench/requirements-test.txt
 create mode 100644 conbench/requirements.txt

diff --git a/conbench/.flake8 b/conbench/.flake8
new file mode 100644
index 0000000000000..e44b81084185c
--- /dev/null
+++ b/conbench/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E501
diff --git a/conbench/.gitignore b/conbench/.gitignore
new file mode 100755
index 0000000000000..aa44ee2adbd4c
--- /dev/null
+++ b/conbench/.gitignore
@@ -0,0 +1,130 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
diff --git a/conbench/.isort.cfg b/conbench/.isort.cfg
new file mode 100644
index 0000000000000..f238bf7ea137e
--- /dev/null
+++ b/conbench/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+profile = black
diff --git a/conbench/README.md b/conbench/README.md
new file mode 100644
index 0000000000000..ec4a300e4b7b8
--- /dev/null
+++ b/conbench/README.md
@@ -0,0 +1,252 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# DataFusion + Conbench Integration
+
+
+## Quick start
+
+```
+$ cd ~/arrow-datafusion/conbench/
+$ conda create -y -n conbench python=3.9
+$ conda activate conbench
+(conbench) $ pip install -r requirements.txt
+(conbench) $ conbench datafusion --src-dir=/Users/diana/workspace/arrow-datafusion
+```
+
+## Example output
+
+```
+{
+    "batch_id": "3c82f9d23fce49328b78ba9fd963b254",
+    "context": {
+        "benchmark_language": "Rust"
+    },
+    "github": {
+        "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d",
+        "repository": "https://github.com/dianaclarke/arrow-datafusion"
+    },
+    "info": {},
+    "machine_info": {
+        "architecture_name": "x86_64",
+        "cpu_core_count": "8",
+        "cpu_frequency_max_hz": "2400000000",
+        "cpu_l1d_cache_bytes": "65536",
+        "cpu_l1i_cache_bytes": "131072",
+        "cpu_l2_cache_bytes": "4194304",
+        "cpu_l3_cache_bytes": "0",
+        "cpu_model_name": "Apple M1",
+        "cpu_thread_count": "8",
+        "gpu_count": "0",
+        "gpu_product_names": [],
+        "kernel_name": "20.6.0",
+        "memory_bytes": "17179869184",
+        "name": "diana",
+        "os_name": "macOS",
+        "os_version": "10.16"
+    },
+    "run_id": "ec2a50b9380c470b96d7eb7d63ab5b77",
+    "stats": {
+        "data": [
+            "0.001532",
+            "0.001394",
+            "0.001333",
+            "0.001356",
+            "0.001379",
+            "0.001361",
+            "0.001307",
+            "0.001348",
+            "0.001436",
+            "0.001397",
+            "0.001339",
+            "0.001523",
+            "0.001593",
+            "0.001415",
+            "0.001344",
+            "0.001312",
+            "0.001402",
+            "0.001362",
+            "0.001329",
+            "0.001330",
+            "0.001447",
+            "0.001413",
+            "0.001536",
+            "0.001330",
+            "0.001333",
+            "0.001338",
+            "0.001333",
+            "0.001331",
+            "0.001426",
+            "0.001575",
+            "0.001362",
+            "0.001343",
+            "0.001334",
+            "0.001383",
+            "0.001476",
+            "0.001356",
+            "0.001362",
+            "0.001334",
+            "0.001390",
+            "0.001497",
+            "0.001330",
+            "0.001347",
+            "0.001331",
+            "0.001468",
+            "0.001377",
+            "0.001351",
+            "0.001328",
+            "0.001509",
+            "0.001338",
+            "0.001355",
+            "0.001332",
+            "0.001485",
+            "0.001370",
+            "0.001366",
+            "0.001507",
+            "0.001358",
+            "0.001331",
+            "0.001463",
+            "0.001362",
+            "0.001336",
+            "0.001428",
+            "0.001343",
+            "0.001359",
+            "0.001905",
+            "0.001726",
+            "0.001411",
+            "0.001433",
+            "0.001391",
+            "0.001453",
+            "0.001346",
+            "0.001339",
+            "0.001420",
+            "0.001330",
+            "0.001422",
+            "0.001683",
+            "0.001426",
+            "0.001349",
+            "0.001342",
+            "0.001430",
+            "0.001330",
+            "0.001436",
+            "0.001331",
+            "0.001415",
+            "0.001332",
+            "0.001408",
+            "0.001343",
+            "0.001392",
+            "0.001371",
+            "0.001655",
+            "0.001354",
+            "0.001438",
+            "0.001347",
+            "0.001341",
+            "0.001374",
+            "0.001453",
+            "0.001352",
+            "0.001358",
+            "0.001398",
+            "0.001362",
+            "0.001454"
+        ],
+        "iqr": "0.000088",
+        "iterations": 100,
+        "max": "0.001905",
+        "mean": "0.001401",
+        "median": "0.001362",
+        "min": "0.001307",
+        "q1": "0.001340",
+        "q3": "0.001428",
+        "stdev": "0.000095",
+        "time_unit": "s",
+        "times": [],
+        "unit": "s"
+    },
+    "tags": {
+        "name": "aggregate_query_group_by",
+        "suite": "aggregate_query_group_by"
+    },
+    "timestamp": "2022-02-09T01:32:55.769468+00:00"
+}
+```
+
+## Debug with test benchmark
+
+```
+(conbench) $ cd ~/arrow-datafusion/conbench/
+(conbench) $ conbench test --iterations=3
+
+Benchmark result:
+{
+    "batch_id": "41a144761bc24d82b94efa70d6e460b3",
+    "context": {
+        "benchmark_language": "Python"
+    },
+    "github": {
+        "commit": "e8c198b9fac6cd8822b950b9f71898e47965488d",
+        "repository": "https://github.com/dianaclarke/arrow-datafusion"
+    },
+    "info": {
+        "benchmark_language_version": "Python 3.9.7"
+    },
+    "machine_info": {
+        "architecture_name": "x86_64",
+        "cpu_core_count": "8",
+        "cpu_frequency_max_hz": "2400000000",
+        "cpu_l1d_cache_bytes": "65536",
+        "cpu_l1i_cache_bytes": "131072",
+        "cpu_l2_cache_bytes": "4194304",
+        "cpu_l3_cache_bytes": "0",
+        "cpu_model_name": "Apple M1",
+        "cpu_thread_count": "8",
+        "gpu_count": "0",
+        "gpu_product_names": [],
+        "kernel_name": "20.6.0",
+        "memory_bytes": "17179869184",
+        "name": "diana",
+        "os_name": "macOS",
+        "os_version": "10.16"
+    },
+    "run_id": "71f46362db8844afacea82cba119cefc",
+    "stats": {
+        "data": [
+            "0.000001",
+            "0.000001",
+            "0.000000"
+        ],
+        "iqr": "0.000000",
+        "iterations": 3,
+        "max": "0.000001",
+        "mean": "0.000001",
+        "median": "0.000001",
+        "min": "0.000000",
+        "q1": "0.000000",
+        "q3": "0.000001",
+        "stdev": "0.000001",
+        "time_unit": "s",
+        "times": [],
+        "unit": "s"
+    },
+    "tags": {
+        "name": "test"
+    },
+    "timestamp": "2022-02-09T01:36:45.823615+00:00"
+}
+```
+
diff --git a/conbench/_criterion.py b/conbench/_criterion.py
new file mode 100644
index 0000000000000..7e40ef83ef1e4
--- /dev/null
+++ b/conbench/_criterion.py
@@ -0,0 +1,99 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import collections
+import csv
+import os
+import pathlib
+import subprocess
+
+import conbench.runner
+from conbench.machine_info import github_info
+
+
+def _result_in_seconds(row):
+    """Return the per-iteration time, in seconds, for one Criterion raw.csv row.
+
+    ``sample_measured_value`` is the total for the whole sample, so it is divided
+    by ``iteration_count``; the further 10**9 divisor presumably converts ns to s.
+    -- https://bheisler.github.io/criterion.rs/book/user_guide/csv_output.html
+    """
+    iterations, measured = int(row["iteration_count"]), float(row["sample_measured_value"])
+    return measured / iterations / 10**9
+
+
+def _parse_benchmark_group(row):
+    """Split a row's "group" into (suite, name) at the first comma, both stripped.
+
+    A group without a comma is used for both the suite and the name.
+    """
+    suite, comma, name = row["group"].partition(",")
+    return suite.strip(), (name if comma else suite).strip()
+
+
+def _read_results(src_dir):
+    """Gather seconds-per-iteration samples from target/criterion/**/new/raw.csv as {suite: {name: [..]}}."""
+    results = collections.defaultdict(lambda: collections.defaultdict(list))
+    criterion_dir = pathlib.Path(src_dir, "target", "criterion")
+    for csv_path in sorted(criterion_dir.glob("**/new/raw.csv")):  # sorted: stable order across runs
+        with open(csv_path, newline="") as csv_file:  # newline="" as the csv module requires
+            for row in csv.DictReader(csv_file):
+                suite, name = _parse_benchmark_group(row)
+                results[suite][name].append(_result_in_seconds(row))
+    return results
+
+
+def _execute_command(command):  # echo + run; returns decoded (stdout, stderr)
+    print(command)
+    try:
+        completed = subprocess.run(command, capture_output=True, check=True)
+    except subprocess.CalledProcessError as error:
+        print(error.stderr.decode("utf-8"))  # surface the captured stderr before failing
+        raise
+    return completed.stdout.decode("utf-8"), completed.stderr.decode("utf-8")
+
+
+class CriterionBenchmark(conbench.runner.Benchmark):
+    external, iterations = True, None
+    options = {"src_dir": {"type": str}}
+
+    def run(self, **kwargs):
+        src_dir = kwargs["src_dir"]
+        self._cargo_bench(src_dir)
+        results = _read_results(src_dir)
+        for suite in results:
+            # Start a new Conbench batch for every suite before recording its results.
+            self.conbench.mark_new_batch()
+            for name, data in results[suite].items():
+                yield self._record_result(suite, name, data, kwargs)
+
+    def _cargo_bench(self, src_dir):
+        """Run ``cargo bench`` with *src_dir* as the working directory."""
+        # NOTE: os.chdir changes the cwd of this whole process, not just of cargo.
+        os.chdir(src_dir)
+        _execute_command(["cargo", "bench"])
+
+    def _record_result(self, suite, name, data, options):
+        """Record one benchmark's samples (unit "s") with Conbench, tagged by suite."""
+        return self.conbench.record(
+            {"data": data, "unit": "s"},
+            name,
+            tags={"suite": suite},
+            context={"benchmark_language": "Rust"},
+            github=github_info(),
+            options=options,
+        )
diff --git a/conbench/benchmarks.json b/conbench/benchmarks.json
new file mode 100644
index 0000000000000..bb7033547722b
--- /dev/null
+++ b/conbench/benchmarks.json
@@ -0,0 +1,8 @@
+[
+  {
+    "command": "datafusion",
+    "flags": {
+      "language": "Rust"
+    }
+  }
+]
diff --git a/conbench/benchmarks.py b/conbench/benchmarks.py
new file mode 100644
index 0000000000000..9ad3e314ee4e7
--- /dev/null
+++ b/conbench/benchmarks.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import conbench.runner
+
+import _criterion
+
+
+@conbench.runner.register_benchmark
+class TestBenchmark(conbench.runner.Benchmark):
+    # Minimal benchmark for debugging the Conbench setup (run via `conbench test`).
+    name = "test"
+
+    def run(self, **kwargs):
+        """Hand the workload to ``self.conbench.benchmark`` and yield what it returns."""
+        workload = self._f()
+        yield self.conbench.benchmark(workload, self.name, options=kwargs)
+
+    def _f(self):
+        """Return the callable to benchmark: a trivial constant addition."""
+        return lambda: 1 + 1
+
+
+@conbench.runner.register_benchmark
+class CargoBenchmarks(_criterion.CriterionBenchmark):
+    name = "datafusion"  # same value as the "command" entry in benchmarks.json (`conbench datafusion`)
+    description = "Run Arrow Datafusion micro benchmarks."
diff --git a/conbench/requirements-test.txt b/conbench/requirements-test.txt
new file mode 100644
index 0000000000000..5e5647acd2d64
--- /dev/null
+++ b/conbench/requirements-test.txt
@@ -0,0 +1,3 @@
+black
+flake8
+isort
diff --git a/conbench/requirements.txt b/conbench/requirements.txt
new file mode 100644
index 0000000000000..a877c7b44e9be
--- /dev/null
+++ b/conbench/requirements.txt
@@ -0,0 +1 @@
+conbench
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 304c08e808991..d35b5a81b660d 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -17,6 +17,11 @@ CHANGELOG.md
 datafusion/CHANGELOG.md
 ballista/CHANGELOG.md
 python/CHANGELOG.md
+conbench/benchmarks.json
+conbench/requirements.txt
+conbench/requirements-test.txt
+conbench/.flake8
+conbench/.isort.cfg
 dev/requirements*.txt
 dev/archery/MANIFEST.in
 dev/archery/requirements*.txt

From e52330b2e958a99a5085070127eeeefb9a27892f Mon Sep 17 00:00:00 2001
From: Diana Clarke <diana.joan.clarke@gmail.com>
Date: Wed, 9 Feb 2022 08:50:51 -0700
Subject: [PATCH 2/2] remove --src-dir

---
 conbench/README.md     | 2 +-
 conbench/_criterion.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/conbench/README.md b/conbench/README.md
index ec4a300e4b7b8..f655ac8bd2972 100644
--- a/conbench/README.md
+++ b/conbench/README.md
@@ -27,7 +27,7 @@ $ cd ~/arrow-datafusion/conbench/
 $ conda create -y -n conbench python=3.9
 $ conda activate conbench
 (conbench) $ pip install -r requirements.txt
-(conbench) $ conbench datafusion --src-dir=/Users/diana/workspace/arrow-datafusion
+(conbench) $ conbench datafusion
 ```
 
 ## Example output
diff --git a/conbench/_criterion.py b/conbench/_criterion.py
index 7e40ef83ef1e4..168a1b9b6cb10 100644
--- a/conbench/_criterion.py
+++ b/conbench/_criterion.py
@@ -68,11 +68,10 @@ def _execute_command(command):
 
 
 class CriterionBenchmark(conbench.runner.Benchmark):
-    external, iterations = True, None
-    options = {"src_dir": {"type": str}}
+    external = True
 
     def run(self, **kwargs):
-        src_dir = kwargs["src_dir"]
+        src_dir = os.path.join(os.getcwd(), "..")
         self._cargo_bench(src_dir)
         results = _read_results(src_dir)
         for suite in results: