Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions benchmarks/pandas/bench_concat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Benchmark: concat — concatenate two 50k-row DataFrames"""
import json, time
import numpy as np
import pandas as pd

ROWS = 50_000
WARMUP = 5
ITERATIONS = 20


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``pd.concat`` on two single-column float64 DataFrames.

    Args:
        rows: Number of rows in each of the two input frames.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    vals1 = np.arange(rows, dtype=np.float64)
    vals2 = np.arange(rows, dtype=np.float64) * 2.0
    df1 = pd.DataFrame({"value": vals1})
    df2 = pd.DataFrame({"value": vals2})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        pd.concat([df1, df2], ignore_index=True)

    start = time.perf_counter()
    for _ in range(iterations):
        pd.concat([df1, df2], ignore_index=True)
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "concat",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
27 changes: 27 additions & 0 deletions benchmarks/pandas/bench_dataframe_apply.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Benchmark: dataframe_apply — apply a function across rows of a 10k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 10_000
WARMUP = 3
ITERATIONS = 10

a = np.arange(ROWS, dtype=np.float64)
b = np.arange(ROWS, dtype=np.float64) * 2.0
df = pd.DataFrame({"a": a, "b": b})

for _ in range(WARMUP):
df.apply(lambda row: row["a"] + row["b"], axis=1)

start = time.perf_counter()
for _ in range(ITERATIONS):
df.apply(lambda row: row["a"] + row["b"], axis=1)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "dataframe_apply",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
27 changes: 27 additions & 0 deletions benchmarks/pandas/bench_dataframe_creation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Benchmark: DataFrame creation from arrays (pandas equivalent)"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time building a DataFrame from two float64 arrays and a list of strings.

    Args:
        rows: Length of each input column.
        warmup: Untimed constructions executed before measurement starts.
        iterations: Timed constructions; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Inputs are built once; only the DataFrame constructor is timed.
    nums1 = np.arange(rows, dtype=np.float64) * 1.1
    nums2 = np.arange(rows, dtype=np.float64) * 2.2
    strs = [f"label_{i % 100}" for i in range(rows)]

    for _ in range(warmup):
        pd.DataFrame({"a": nums1, "b": nums2, "c": strs})

    start = time.perf_counter()
    for _ in range(iterations):
        pd.DataFrame({"a": nums1, "b": nums2, "c": strs})
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_creation",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
27 changes: 27 additions & 0 deletions benchmarks/pandas/bench_dataframe_dropna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Benchmark: dataframe_dropna — drop rows with NaN values from 100k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 5
ITERATIONS = 20


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``DataFrame.dropna`` on a two-column frame containing NaNs.

    Column ``a`` is NaN at every index divisible by 10 and column ``b`` at
    every index divisible by 7.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    # Build the index array once instead of once per use.
    idx = np.arange(rows)
    a = np.where(idx % 10 == 0, np.nan, idx * 1.1)
    b = np.where(idx % 7 == 0, np.nan, idx * 2.2)
    df = pd.DataFrame({"a": a, "b": b})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df.dropna()

    start = time.perf_counter()
    for _ in range(iterations):
        df.dropna()
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_dropna",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
26 changes: 26 additions & 0 deletions benchmarks/pandas/bench_dataframe_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Benchmark: DataFrame filter (boolean mask on 100k-row DataFrame)"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 5
ITERATIONS = 20


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time boolean-mask filtering (``value > 5000``) on a one-column frame.

    The ``value`` column holds ``i * 0.1`` for ``i`` in ``range(rows)``; both
    building the mask and indexing with it are inside the timed region.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    vals = np.arange(rows, dtype=np.float64) * 0.1
    df = pd.DataFrame({"value": vals})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df[df["value"] > 5000]

    start = time.perf_counter()
    for _ in range(iterations):
        df[df["value"] > 5000]
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_filter",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
27 changes: 27 additions & 0 deletions benchmarks/pandas/bench_dataframe_rename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Benchmark: dataframe_rename — rename columns in a 100k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 5
ITERATIONS = 20


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``DataFrame.rename`` of both columns of a two-column frame.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    a = np.arange(rows, dtype=np.float64) * 1.1
    b = np.arange(rows, dtype=np.float64) * 2.2
    df = pd.DataFrame({"old_a": a, "old_b": b})
    # The mapping is loop-invariant, so build it once outside the timing.
    mapping = {"old_a": "new_a", "old_b": "new_b"}

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df.rename(columns=mapping)

    start = time.perf_counter()
    for _ in range(iterations):
        df.rename(columns=mapping)
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_rename",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
28 changes: 28 additions & 0 deletions benchmarks/pandas/bench_dataframe_sort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Benchmark: dataframe_sort — sort a 100k-row DataFrame by two columns"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``DataFrame.sort_values`` by a string column then a float column.

    Column ``a`` cycles through 100 ``group_<n>`` labels; column ``b`` is drawn
    from a generator seeded with 42, so the input is the same on every run.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    rng = np.random.default_rng(42)
    a = [f"group_{i % 100}" for i in range(rows)]
    b = rng.random(rows) * 1000
    df = pd.DataFrame({"a": a, "b": b})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df.sort_values(["a", "b"])

    start = time.perf_counter()
    for _ in range(iterations):
        df.sort_values(["a", "b"])
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "dataframe_sort",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
27 changes: 27 additions & 0 deletions benchmarks/pandas/bench_describe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Benchmark: describe — summary statistics on a 100k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``DataFrame.describe`` on a two-column float64 frame.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    a = np.arange(rows, dtype=np.float64) * 1.1
    # Starts at 1 so the square-root column never contains 0.
    b = np.sqrt(np.arange(1, rows + 1, dtype=np.float64))
    df = pd.DataFrame({"a": a, "b": b})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df.describe()

    start = time.perf_counter()
    for _ in range(iterations):
        df.describe()
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "describe",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
26 changes: 26 additions & 0 deletions benchmarks/pandas/bench_ewm_mean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Benchmark: ewm_mean — exponentially weighted mean on 100k-element Series"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``Series.ewm(span=20).mean()`` on a sine-wave Series.

    Args:
        rows: Number of elements in the benchmarked Series.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    data = np.sin(np.arange(rows) * 0.05)
    s = pd.Series(data)

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        s.ewm(span=20).mean()

    start = time.perf_counter()
    for _ in range(iterations):
        s.ewm(span=20).mean()
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "ewm_mean",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
27 changes: 27 additions & 0 deletions benchmarks/pandas/bench_groupby_mean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Benchmark: GroupBy mean on 100k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``groupby("key")["value"].mean()`` on a keyed DataFrame.

    The ``key`` column cycles through 100 ``group_<n>`` labels.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    keys = [f"group_{i % 100}" for i in range(rows)]
    vals = np.arange(rows, dtype=np.float64) * 0.1
    df = pd.DataFrame({"key": keys, "value": vals})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df.groupby("key")["value"].mean()

    start = time.perf_counter()
    for _ in range(iterations):
        df.groupby("key")["value"].mean()
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "groupby_mean",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
29 changes: 29 additions & 0 deletions benchmarks/pandas/bench_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Benchmark: merge — inner join two 50k-row DataFrames on a key column"""
import json, time
import numpy as np
import pandas as pd

ROWS = 50_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time an inner ``pd.merge`` of two DataFrames on a shared ``key`` column.

    Both frames use ``key = i % 1000``, so the join is many-to-many: with the
    default 50,000 rows each key occurs 50 times per frame and the result has
    1000 * 50 * 50 = 2,500,000 rows.

    Args:
        rows: Number of rows in each input frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    keys = np.arange(rows) % 1000
    vals1 = np.arange(rows, dtype=np.float64)
    vals2 = np.arange(rows, dtype=np.float64) * 2.0
    df1 = pd.DataFrame({"key": keys, "val1": vals1})
    df2 = pd.DataFrame({"key": keys, "val2": vals2})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        pd.merge(df1, df2, on="key", how="inner")

    start = time.perf_counter()
    for _ in range(iterations):
        pd.merge(df1, df2, on="key", how="inner")
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "merge",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
28 changes: 28 additions & 0 deletions benchmarks/pandas/bench_pivot_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Benchmark: pivot_table — pivot aggregation on 100k-row DataFrame"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``DataFrame.pivot_table`` with a ``mean`` aggregation.

    The frame has a ``row`` column cycling through 100 labels, a ``col`` column
    cycling through 50 labels, and a float64 ``value`` column.

    Args:
        rows: Number of rows in the benchmarked frame.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    row_labels = [f"row_{i % 100}" for i in range(rows)]
    col_labels = [f"col_{i % 50}" for i in range(rows)]
    vals = np.arange(rows, dtype=np.float64) * 0.1
    df = pd.DataFrame({"row": row_labels, "col": col_labels, "value": vals})

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        df.pivot_table(values="value", index="row", columns="col", aggfunc="mean")

    start = time.perf_counter()
    for _ in range(iterations):
        df.pivot_table(values="value", index="row", columns="col", aggfunc="mean")
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "pivot_table",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_read_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark: read_csv — parse a 100k-row CSV file"""
import json, time, os, tempfile
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 2
ITERATIONS = 5

# Build CSV file
tmp_path = "/tmp/gh-aw/agent/bench_read_csv.csv"
with open(tmp_path, "w") as f:
f.write("id,value,label\n")
for i in range(ROWS):
f.write(f"{i},{i * 1.1:.4f},cat_{i % 50}\n")

for _ in range(WARMUP):
pd.read_csv(tmp_path)

start = time.perf_counter()
for _ in range(ITERATIONS):
pd.read_csv(tmp_path)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "read_csv",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
26 changes: 26 additions & 0 deletions benchmarks/pandas/bench_rolling_mean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Benchmark: rolling mean with window=100 on 100k-element Series"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10


def run_benchmark(rows=ROWS, warmup=WARMUP, iterations=ITERATIONS):
    """Time ``Series.rolling(100).mean()`` on a sine-wave Series.

    Args:
        rows: Number of elements in the benchmarked Series.
        warmup: Untimed calls executed before measurement starts.
        iterations: Timed calls; must be at least 1.

    Returns:
        A dict with keys ``function``, ``mean_ms``, ``iterations`` and
        ``total_ms`` (times in milliseconds).

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    data = np.sin(np.arange(rows) * 0.01)
    s = pd.Series(data)

    # Warm-up calls are deliberately excluded from the measurement.
    for _ in range(warmup):
        s.rolling(100).mean()

    start = time.perf_counter()
    for _ in range(iterations):
        s.rolling(100).mean()
    total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

    return {
        "function": "rolling_mean",
        "mean_ms": total / iterations,
        "iterations": iterations,
        "total_ms": total,
    }


# Only run the benchmark when executed as a script, so that importing this
# module does not trigger the timed loop.
if __name__ == "__main__":
    print(json.dumps(run_benchmark()))
Loading