Skip to content
Merged
2 changes: 1 addition & 1 deletion .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
Expand Down
6 changes: 3 additions & 3 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1548,11 +1548,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-fa", "--flash-attn"}, "FA",
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
[](common_params & params, const std::string & value) {
if (value == "on" || value == "enabled") {
if (value == "on" || value == "enabled" || value == "1") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
} else if (value == "off" || value == "disabled") {
} else if (value == "off" || value == "disabled" || value == "0") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
} else if (value == "auto") {
} else if (value == "auto" || value == "-1") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
Expand Down
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ else()
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(sweep-bench)
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
Expand Down
5 changes: 5 additions & 0 deletions examples/sweep-bench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
set(TARGET llama-sweep-bench)
add_executable(${TARGET} sweep-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
65 changes: 65 additions & 0 deletions examples/sweep-bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# ik_llama.cpp/example/sweep-bench

Benchmark the prompt processing and token generation performance of `ik_llama.cpp`
by doing a sweep over a whole context size and gathering performance metrics
in each ubatch-sized window. Only a single token sequence is used.

The benchmark steps are:

for each ubatch-sized window in context:

1. generate ubatch/4 tokens (not the whole window to save some time)
2. measure generation performance
3. remove generated tokens from KV cache
4. prepare a ubatch-sized batch of random tokens
4. process prepated batch
5. measure prompt processing performance

The purpose of the benchmark is to visualize how the performance changes with
the context size without averaging the metrics values over the whole context.

## Usage

./llama-sweep-bench -c 8704 -ub 512 -m models/Meta-Llama-3.2-3B-Instruct-Q8_0.gguf

## Sample results

- `PP` - prompt tokens per ubatch
- `TG` - generated tokens per ubatch
- `N_KV` - current KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)

| PP | TG | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s |
|-------|--------|--------|----------|----------|----------|----------|
| 512 | 128 | 0 | 1.100 | 465.51 | 2.311 | 55.38 |
| 512 | 128 | 512 | 1.183 | 432.97 | 1.895 | 67.55 |
| 512 | 128 | 1024 | 1.305 | 392.38 | 2.071 | 61.81 |
| 512 | 128 | 1536 | 1.279 | 400.42 | 2.164 | 59.14 |
| 512 | 128 | 2048 | 1.571 | 325.96 | 2.280 | 56.14 |
| 512 | 128 | 2560 | 1.431 | 357.87 | 2.418 | 52.94 |
| 512 | 128 | 3072 | 1.515 | 337.93 | 2.566 | 49.88 |
| 512 | 128 | 3584 | 1.588 | 322.34 | 2.722 | 47.03 |
| 512 | 128 | 4096 | 1.675 | 305.70 | 2.864 | 44.69 |
| 512 | 128 | 4608 | 1.769 | 289.50 | 2.999 | 42.68 |
| 512 | 128 | 5120 | 1.845 | 277.48 | 3.102 | 41.26 |
| 512 | 128 | 5632 | 1.893 | 270.46 | 3.219 | 39.76 |
| 512 | 128 | 6144 | 1.953 | 262.20 | 3.348 | 38.23 |
| 512 | 128 | 6656 | 2.018 | 253.71 | 3.474 | 36.84 |
| 512 | 128 | 7168 | 2.078 | 246.34 | 3.589 | 35.66 |
| 512 | 128 | 7680 | 2.140 | 239.22 | 3.717 | 34.43 |
| 512 | 128 | 8192 | 2.196 | 233.15 | 3.854 | 33.21 |

### JSONL output

Pass `--output-format jsonl` to output JSONL instead of Markdown, á la

```json lines
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 0, "t_pp": 1.093814, "speed_pp": 468.086884, "t_tg": 1.780312, "speed_tg": 71.897514 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 512, "t_pp": 1.169302, "speed_pp": 437.868073, "t_tg": 1.897474, "speed_tg": 67.458099 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1024, "t_pp": 1.183700, "speed_pp": 432.542053, "t_tg": 2.059179, "speed_tg": 62.160694 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 1536, "t_pp": 1.428625, "speed_pp": 358.386566, "t_tg": 2.160639, "speed_tg": 59.241734 }
{"n_kv_max": 8704, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "n_gpu_layers": -1, "n_threads": 32, "n_threads_batch": 32, "pp": 512, "tg": 128, "n_kv": 2048, "t_pp": 1.360647, "speed_pp": 376.291595, "t_tg": 2.274003, "speed_tg": 56.288403 }
```
118 changes: 118 additions & 0 deletions examples/sweep-bench/sweep-bench-plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('file', nargs='+')
args = parser.parse_args()

df = None

#for jsonl_file in args.file:
# # Read JSONL file into DataFrame
# df_part = pd.read_json(jsonl_file, lines=True)
# df_part['label'] = jsonl_file
# if df is None:
# df = df_part
# else:
# df = pd.concat([df, df_part])
#



for md_file in args.file:
# Read markdown table file into DataFrame
df_part = pd.read_csv(md_file, sep=r'\s*\|\s*', engine='python',
header=0, skiprows=[1])

# Clean up columns (remove empty columns from markdown formatting)
df_part = df_part.iloc[:, 1:-1]
df_part.columns = [col.strip() for col in df_part.columns]

# Rename columns to match expected names
df_part = df_part.rename(columns={
'N_KV': 'n_kv',
'S_PP t/s': 'speed_pp',
'S_TG t/s': 'speed_tg'
})

# Convert to numeric types
df_part['n_kv'] = pd.to_numeric(df_part['n_kv'])
df_part['speed_pp'] = pd.to_numeric(df_part['speed_pp'])
df_part['speed_tg'] = pd.to_numeric(df_part['speed_tg'])

# Add label and append to main DataFrame
df_part['label'] = md_file
df = pd.concat([df, df_part]) if df is not None else df_part

# Group by label and n_kv, calculate mean and std for both speed metrics
df_grouped = df.groupby(['label', 'n_kv']).agg({
'speed_pp': ['mean', 'std'],
'speed_tg': ['mean', 'std']
}).reset_index()

# Flatten multi-index columns
df_grouped.columns = ['label', 'n_kv', 'speed_pp_mean', 'speed_pp_std',
'speed_tg_mean', 'speed_tg_std']

# Replace NaN with 0 (std for a single sample is NaN)
df_grouped['speed_pp_std'] = df_grouped['speed_pp_std'].fillna(0)
df_grouped['speed_tg_std'] = df_grouped['speed_tg_std'].fillna(0)

# Prepare ticks values for X axis (prune for readability)
x_ticks = df['n_kv'].unique()
while len(x_ticks) > 16:
x_ticks = x_ticks[::2]

# Get unique labels and color map
labels = df_grouped['label'].unique()
colors = plt.cm.rainbow(np.linspace(0, 1, len(labels)))

# Create prompt processing plot
plt.figure(figsize=(10, 6))
ax1 = plt.gca()
plt.grid()
ax1.set_xticks(x_ticks)

# Plot each label's data
for label, color in zip(labels, colors):
label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
pp = ax1.errorbar(label_data['n_kv'], label_data['speed_pp_mean'],
yerr=label_data['speed_pp_std'], color=color,
marker='o', linestyle='-', label=label)

# Add labels and title
ax1.set_xlabel('Context Length (tokens)')
ax1.set_ylabel('Prompt Processing Rate (t/s)')
plt.title('Prompt Processing Performance Comparison')
ax1.legend(loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('performance_comparison_pp.png', bbox_inches='tight')
plt.close()

# Create token generation plot
plt.figure(figsize=(10, 6))
ax1 = plt.gca()
plt.grid()
ax1.set_xticks(x_ticks)

# Plot each model's data
for label, color in zip(labels, colors):
label_data = df_grouped[df_grouped['label'] == label].sort_values('n_kv')
tg = ax1.errorbar(label_data['n_kv'], label_data['speed_tg_mean'],
yerr=label_data['speed_tg_std'], color=color,
marker='s', linestyle='-', label=label)

# Add labels and title
ax1.set_xlabel('Context Length (n_kv)')
ax1.set_ylabel('Token Generation Rate (t/s)')
plt.title('Token Generation Performance Comparison')
ax1.legend(loc='upper right')

# Adjust layout and save
plt.tight_layout()
plt.savefig('performance_comparison_tg.png', bbox_inches='tight')
plt.close()
Loading