From 9d79c8ac4024c6c6b0e499ae3afc49ec0564b34d Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Fri, 14 Feb 2025 13:42:13 +0530
Subject: [PATCH 1/4] initial version

---
 benchmarks/lineformat.py | 135 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 benchmarks/lineformat.py

diff --git a/benchmarks/lineformat.py b/benchmarks/lineformat.py
new file mode 100644
index 0000000000000..0ddb170d935b4
--- /dev/null
+++ b/benchmarks/lineformat.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from typing import Dict, List, Any
+from pathlib import Path
+from argparse import ArgumentParser
+
+try:
+    from rich.console import Console
+    from rich.table import Table
+except ImportError:
+    print("Couldn't import modules -- run `./bench.sh venv` first")
+    raise
+
+
+@dataclass
+class QueryResult:
+    elapsed: float
+    row_count: int
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> QueryResult:
+        return cls(elapsed=data["elapsed"], row_count=data["row_count"])
+
+
+@dataclass
+class QueryRun:
+    query: int
+    iterations: List[QueryResult]
+    start_time: int
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> QueryRun:
+        return cls(
+            query=data["query"],
+            iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
+            start_time=data["start_time"],
+        )
+
+    @property
+    def execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        # Use minimum execution time to account for variations / other
+        # things the system was doing
+        return min(iteration.elapsed for iteration in self.iterations)
+
+
+@dataclass
+class Context:
+    benchmark_version: str
+    datafusion_version: str
+    num_cpus: int
+    start_time: int
+    arguments: List[str]
+    name: str
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> Context:
+        return cls(
+            benchmark_version=data["benchmark_version"],
+            datafusion_version=data["datafusion_version"],
+            num_cpus=data["num_cpus"],
+            start_time=data["start_time"],
+            arguments=data["arguments"],
+            name=data["arguments"][0]
+        )
+
+
+@dataclass
+class BenchmarkRun:
+    context: Context
+    queries: List[QueryRun]
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
+        return cls(
+            context=Context.load_from(data["context"]),
+            queries=[QueryRun.load_from(result) for result in data["queries"]],
+        )
+
+    @classmethod
+    def load_from_file(cls, path: Path) -> BenchmarkRun:
+        with open(path, "r") as f:
+            return cls.load_from(json.load(f))
+
+
+def compare(
+    baseline: Path,
+) -> None:
+    baseline = BenchmarkRun.load_from_file(baseline)
+    context = baseline.context
+    benchamrk_str = f"benchmark,name={context.name},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
+    for query in baseline.queries:
+        query_str = f"query=\"{query.query}\""
+        timestamp = f"{query.start_time}"
+        for iter_num, result in enumerate(query.iterations):
+            print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}")
+
+def main() -> None:
+    parser = ArgumentParser()
+    compare_parser = parser
+    compare_parser.add_argument(
+        "baseline_path",
+        type=Path,
+        help="Path to the baseline summary file.",
+    )
+
+    options = parser.parse_args()
+
+    compare(options.baseline_path)
+
+
+
+if __name__ == "__main__":
+    main()

From 4f760a12a981704e45130e107e0623baca9b2ae6 Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Fri, 14 Feb 2025 13:51:58 +0530
Subject: [PATCH 2/4] small changes

---
 benchmarks/lineformat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/lineformat.py b/benchmarks/lineformat.py
index 0ddb170d935b4..e827215fd32d3 100644
--- a/benchmarks/lineformat.py
+++ b/benchmarks/lineformat.py
@@ -109,10 +109,10 @@ def compare(
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline)
     context = baseline.context
-    benchamrk_str = f"benchmark,name={context.name},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
+    benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
     for query in baseline.queries:
         query_str = f"query=\"{query.query}\""
-        timestamp = f"{query.start_time}"
+        timestamp = f"{query.start_time*10**9}"
         for iter_num, result in enumerate(query.iterations):
             print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}")
 

From 5ed26d3ed6914f369bc07a979628a165000c0b47 Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Fri, 14 Feb 2025 14:06:14 +0530
Subject: [PATCH 3/4] getting rid of rich + using stdout instead of python print

---
 benchmarks/lineformat.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/benchmarks/lineformat.py b/benchmarks/lineformat.py
index e827215fd32d3..4673260474e4f 100644
--- a/benchmarks/lineformat.py
+++ b/benchmarks/lineformat.py
@@ -23,13 +23,8 @@
 from typing import Dict, List, Any
 from pathlib import Path
 from argparse import ArgumentParser
-
-try:
-    from rich.console import Console
-    from rich.table import Table
-except ImportError:
-    print("Couldn't import modules -- run `./bench.sh venv` first")
-    raise
+import sys
+print = sys.stdout.write
 
 
 @dataclass
@@ -104,7 +99,7 @@ def load_from_file(cls, path: Path) -> BenchmarkRun:
             return cls.load_from(json.load(f))
 
 
-def compare(
+def lineformat(
     baseline: Path,
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline)
     context = baseline.context
@@ -114,7 +109,7 @@ def compare(
         query_str = f"query=\"{query.query}\""
         timestamp = f"{query.start_time*10**9}"
         for iter_num, result in enumerate(query.iterations):
-            print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}")
+            print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n")
 
 def main() -> None:
     parser = ArgumentParser()
     compare_parser = parser
@@ -127,7 +122,7 @@ def main() -> None:
 
     options = parser.parse_args()
 
-    compare(options.baseline_path)
+    lineformat(options.baseline_path)

From 5ab53df7c0c7db51c8b7a0f7587b971f08b8b33a Mon Sep 17 00:00:00 2001
From: logan-keede
Date: Sat, 15 Feb 2025 18:50:41 +0530
Subject: [PATCH 4/4] tweaks

---
 benchmarks/{lineformat.py => lineprotocol.py} | 68 +++++++++++++++++--
 1 file changed, 63 insertions(+), 5 deletions(-)
 rename benchmarks/{lineformat.py => lineprotocol.py} (68%)

diff --git a/benchmarks/lineformat.py b/benchmarks/lineprotocol.py
similarity index 68%
rename from benchmarks/lineformat.py
rename to benchmarks/lineprotocol.py
index 4673260474e4f..75e09b662e3e1 100644
--- a/benchmarks/lineformat.py
+++ b/benchmarks/lineprotocol.py
@@ -16,6 +16,66 @@
 # specific language governing permissions and limitations
 # under the License.
 
+
+"""
+Converts a given json to LineProtocol format that can be
+visualised by grafana/other systems that support LineProtocol.
+
+Usage example:
+$ python3 lineprotocol.py sort.json
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=0,row_count=10838832,elapsed_ms=85626006 1691105678000000000
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000
+benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000
+"""
+
+# sort.json
+"""
+{
+    "queries": [
+        {
+            "iterations": [
+                {
+                    "elapsed": 85626.006132,
+                    "row_count": 10838832
+                },
+                {
+                    "elapsed": 68694.467851,
+                    "row_count": 10838832
+                },
+                {
+                    "elapsed": 63392.883406,
+                    "row_count": 10838832
+                },
+                {
+                    "elapsed": 66388.367387,
+                    "row_count": 10838832
+                },
+            ],
+            "query": "sort utf8",
+            "start_time": 1691105678
+        },
+    ],
+    "context": {
+        "arguments": [
+            "sort",
+            "--path",
+            "benchmarks/data",
+            "--scale-factor",
+            "1.0",
+            "--iterations",
+            "4",
+            "-o",
+            "sort.json"
+        ],
+        "benchmark_version": "28.0.0",
+        "datafusion_version": "28.0.0",
+        "num_cpus": 8,
+        "start_time": 1691105678
+    }
+}
+"""
+
 from __future__ import annotations
 
 import json
@@ -113,13 +173,11 @@ def lineformat(
 
 def main() -> None:
     parser = ArgumentParser()
-    compare_parser = parser
-    compare_parser.add_argument(
-        "baseline_path",
+    parser.add_argument(
+        "path",
         type=Path,
-        help="Path to the baseline summary file.",
+        help="Path to the benchmark file.",
     )
-
     options = parser.parse_args()
 
     lineformat(options.baseline_path)