From 9d79c8ac4024c6c6b0e499ae3afc49ec0564b34d Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Fri, 14 Feb 2025 13:42:13 +0530
Subject: [PATCH 1/4] initial version

---
 benchmarks/lineformat.py | 135 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 benchmarks/lineformat.py

diff --git a/benchmarks/lineformat.py b/benchmarks/lineformat.py
new file mode 100644
index 0000000000000..0ddb170d935b4
--- /dev/null
+++ b/benchmarks/lineformat.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from typing import Dict, List, Any
+from pathlib import Path
+from argparse import ArgumentParser
+
+try:
+    from rich.console import Console
+    from rich.table import Table
+except ImportError:
+    print("Couldn't import modules -- run `./bench.sh venv` first")
+    raise
+
+
+@dataclass
+class QueryResult:
+    elapsed: float
+    row_count: int
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> QueryResult:
+        return cls(elapsed=data["elapsed"], row_count=data["row_count"])
+
+
+@dataclass
+class QueryRun:
+    query: int
+    iterations: List[QueryResult]
+    start_time: int
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> QueryRun:
+        return cls(
+            query=data["query"],
+            iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
+            start_time=data["start_time"],
+        )
+
+    @property
+    def execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        # Use minimum execution time to account for variations / other
+        # things the system was doing
+        return min(iteration.elapsed for iteration in self.iterations)
+
+
+@dataclass
+class Context:
+    benchmark_version: str
+    datafusion_version: str
+    num_cpus: int
+    start_time: int
+    arguments: List[str]
+    name: str
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> Context:
+        return cls(
+            benchmark_version=data["benchmark_version"],
+            datafusion_version=data["datafusion_version"],
+            num_cpus=data["num_cpus"],
+            start_time=data["start_time"],
+            arguments=data["arguments"],
+            name=data["arguments"][0]
+        )
+
+
+@dataclass
+class BenchmarkRun:
+    context: Context
+    queries: List[QueryRun]
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
+        return cls(
+            context=Context.load_from(data["context"]),
+            queries=[QueryRun.load_from(result) for result in data["queries"]],
+        )
+
+    @classmethod
+    def load_from_file(cls, path: Path) -> BenchmarkRun:
+        with open(path, "r") as f:
+            return cls.load_from(json.load(f))
+
+
+def compare(
+    baseline: Path,
+) -> None:
+    baseline = BenchmarkRun.load_from_file(baseline)
+    context = baseline.context
+    benchamrk_str = f"benchmark,name={context.name},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
+    for query in baseline.queries:
+        query_str = f"query=\"{query.query}\""
+        timestamp = f"{query.start_time}"
+        for iter_num, result in enumerate(query.iterations):
+            print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}")
+
+def main() -> None:
+    parser = ArgumentParser()
+    compare_parser = parser
+    compare_parser.add_argument(
+        "baseline_path",
+        type=Path,
+        help="Path to the baseline summary file.",
+    )
+
+    options = parser.parse_args()
+
+    compare(options.baseline_path)
+
+
+
+if __name__ == "__main__":
+    main()

From 4f760a12a981704e45130e107e0623baca9b2ae6 Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Fri, 14 Feb 2025 13:51:58 +0530
Subject: [PATCH 2/4] small changes

---
 benchmarks/lineformat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/lineformat.py b/benchmarks/lineformat.py
index 0ddb170d935b4..e827215fd32d3 100644
--- a/benchmarks/lineformat.py
+++ b/benchmarks/lineformat.py
@@ -109,10 +109,10 @@ def compare(
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline)
     context = baseline.context
-    benchamrk_str = f"benchmark,name={context.name},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
+    benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
     for query in baseline.queries:
         query_str = f"query=\"{query.query}\""
-        timestamp = f"{query.start_time}"
+        timestamp = f"{query.start_time*10**9}"
         for iter_num, result in enumerate(query.iterations):
             print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}")
 

From 5ed26d3ed6914f369bc07a979628a165000c0b47 Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Fri, 14 Feb 2025 14:06:14 +0530
Subject: [PATCH 3/4] getting rid of rich + using stdout instead of python print

---
 benchmarks/lineformat.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/benchmarks/lineformat.py b/benchmarks/lineformat.py
index e827215fd32d3..4673260474e4f 100644
--- a/benchmarks/lineformat.py
+++ b/benchmarks/lineformat.py
@@ -23,13 +23,8 @@
 from typing import Dict, List, Any
 from pathlib import Path
 from argparse import ArgumentParser
-
-try:
-    from rich.console import Console
-    from rich.table import Table
-except ImportError:
-    print("Couldn't import modules -- run `./bench.sh venv` first")
-    raise
+import sys
+print = sys.stdout.write
 
 
 @dataclass
@@ -104,7 +99,7 @@ def load_from_file(cls, path: Path) -> BenchmarkRun:
             return cls.load_from(json.load(f))
 
 
-def compare(
+def lineformat(
     baseline: Path,
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline)
     context = baseline.context
@@ -114,7 +109,7 @@ def compare(
         query_str = f"query=\"{query.query}\""
         timestamp = f"{query.start_time*10**9}"
         for iter_num, result in enumerate(query.iterations):
-            print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}")
+            print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n")
 
 def main() -> None:
     parser = ArgumentParser()
     compare_parser = parser
@@ -127,7 +122,7 @@ def main() -> None:
 
     options = parser.parse_args()
 
-    compare(options.baseline_path)
+    lineformat(options.baseline_path)

From 5ab53df7c0c7db51c8b7a0f7587b971f08b8b33a Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Sat, 15 Feb 2025 18:50:41 +0530
Subject: [PATCH 4/4] tweaks

---
 benchmarks/{lineformat.py => lineprotocol.py} | 68 +++++++++++++++++--
 1 file changed, 63 insertions(+), 5 deletions(-)
 rename benchmarks/{lineformat.py => lineprotocol.py} (68%)

diff --git a/benchmarks/lineformat.py b/benchmarks/lineprotocol.py
similarity index 68%
rename from benchmarks/lineformat.py
rename to benchmarks/lineprotocol.py
index 4673260474e4f..75e09b662e3e1 100644
--- a/benchmarks/lineformat.py
+++ b/benchmarks/lineprotocol.py
@@ -16,6 +16,66 @@
 # specific language governing permissions and limitations
 # under the License.
 
+
+"""
+Converts a given json to LineProtocol format that can be
+visualised by grafana/other systems that support LineProtocol.
+
+Usage example:
+$ python3 lineprotocol.py sort.json
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=0,row_count=10838832,elapsed_ms=85626006 1691105678000000000
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000
+"""
+
+# sort.json
+"""
+{
+    "queries": [
+        {
+            "iterations": [
+                {
+                    "elapsed": 85626.006132,
+                    "row_count": 10838832
+                },
+                {
+                    "elapsed": 68694.467851,
+                    "row_count": 10838832
+                },
+                {
+                    "elapsed": 63392.883406,
+                    "row_count": 10838832
+                },
+                {
+                    "elapsed": 66388.367387,
+                    "row_count": 10838832
+                },
+            ],
+            "query": "sort utf8",
+            "start_time": 1691105678
+        },
+    ],
+    "context": {
+        "arguments": [
+            "sort",
+            "--path",
+            "benchmarks/data",
+            "--scale-factor",
+            "1.0",
+            "--iterations",
+            "4",
+            "-o",
+            "sort.json"
+        ],
+        "benchmark_version": "28.0.0",
+        "datafusion_version": "28.0.0",
+        "num_cpus": 8,
+        "start_time": 1691105678
+    }
+}
+"""
+
 from __future__ import annotations
 
 import json
@@ -113,13 +173,11 @@ def lineformat(
 
 def main() -> None:
     parser = ArgumentParser()
-    compare_parser = parser
-    compare_parser.add_argument(
-        "baseline_path",
+    parser.add_argument(
+        "path",
         type=Path,
-        help="Path to the baseline summary file.",
+        help="Path to the benchmark file.",
     )
-
     options = parser.parse_args()
 
     lineformat(options.baseline_path)