Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
151 commits
Select commit Hold shift + click to select a range
1ada0dc
initial commit based on kimbos edits
cquil11 Oct 24, 2025
a1b7476
adding config and python script:
cquil11 Oct 24, 2025
d474718
adding runner field
cquil11 Oct 24, 2025
346b10d
finishing up script, ready for testing
cquil11 Oct 24, 2025
0dc246c
testing purposes
cquil11 Oct 24, 2025
02f5792
testing purposes
cquil11 Oct 24, 2025
e93d20b
refactoring more
cquil11 Oct 24, 2025
88239ac
refactoring more
cquil11 Oct 24, 2025
f00e47d
refactoring more
cquil11 Oct 24, 2025
8cc9eeb
refactoring more
cquil11 Oct 24, 2025
7be2673
refactoring more
cquil11 Oct 25, 2025
f9c5e27
refactoring more
cquil11 Oct 25, 2025
bb460c7
refactoring more
cquil11 Oct 25, 2025
2a5658a
refactoring more
cquil11 Oct 25, 2025
15da179
refactoring more
cquil11 Oct 25, 2025
9bf6b1f
refactoring more
cquil11 Oct 25, 2025
8d330cd
refactoring more
cquil11 Oct 25, 2025
8f665dd
updating the benchmark files with logic
cquil11 Oct 26, 2025
0987482
updating the benchmark files with logic
cquil11 Oct 26, 2025
d9fd191
updating the benchmark files with logic
cquil11 Oct 26, 2025
78f6b8d
updating the benchmark files with logic
cquil11 Oct 26, 2025
d808413
updating the benchmark files with logic
cquil11 Oct 26, 2025
bc24be4
updating the benchmark files with logic
cquil11 Oct 26, 2025
7479f74
testing concurrency
cquil11 Oct 26, 2025
93fba3b
updating the benchmark files with logic
cquil11 Oct 26, 2025
d021eb3
updating the benchmark files with logic
cquil11 Oct 27, 2025
09ebb8a
updating the benchmark files with logic
cquil11 Oct 27, 2025
6c61ba9
updating the benchmark files with logic
cquil11 Oct 27, 2025
f7d8340
updating the benchmark files with logic
cquil11 Oct 27, 2025
869572a
adding pytests
cquil11 Oct 27, 2025
fdb94fa
adding other isl osl
cquil11 Oct 27, 2025
d339b8f
adding more workflows
cquil11 Oct 27, 2025
2b284f9
adding more workflows
cquil11 Oct 27, 2025
09e9c49
adding more workflows
cquil11 Oct 27, 2025
15553b8
adding more workflows
cquil11 Oct 27, 2025
471b7c2
adding more workflows
cquil11 Oct 27, 2025
fca9c16
adding more workflows
cquil11 Oct 27, 2025
8ba4de9
adding more workflows
cquil11 Oct 28, 2025
8df3aa3
adding more workflows
cquil11 Oct 28, 2025
903f2f6
adding more workflows
cquil11 Oct 28, 2025
60465c8
adding more workflows
cquil11 Oct 28, 2025
ae2505e
adding more workflows
cquil11 Oct 28, 2025
6fec99e
adding more workflows
cquil11 Oct 28, 2025
0226fc5
adding more workflows
cquil11 Oct 28, 2025
3f7609d
adding more workflows
cquil11 Oct 28, 2025
395bbb0
adding more workflows
cquil11 Oct 28, 2025
8510c0a
adding more workflows
cquil11 Oct 28, 2025
f439163
adding more workflows
cquil11 Oct 28, 2025
28665f2
adding more workflows
cquil11 Oct 28, 2025
99aec70
adding more workflows
cquil11 Oct 28, 2025
3ea4aa2
adding more workflows
cquil11 Oct 28, 2025
6090656
adding more workflows
cquil11 Oct 29, 2025
9b570de
adding script
cquil11 Oct 29, 2025
2bb9dfa
removing extraneous files
cquil11 Oct 29, 2025
34ba318
removing extraneous files
cquil11 Oct 29, 2025
8c615ef
removing plottingh
cquil11 Oct 29, 2025
7b2acaa
removing plottingh
cquil11 Oct 29, 2025
ad18b51
removing plottingh
cquil11 Oct 29, 2025
165bde3
removing plottingh
cquil11 Oct 29, 2025
52153c7
removing plottingh
cquil11 Oct 29, 2025
db05e34
removing plotting python script
cquil11 Oct 29, 2025
efdfcaf
bmk-space -> search-space
cquil11 Oct 29, 2025
15eead5
updating exp name for full sweep
cquil11 Oct 29, 2025
6bbc028
pip install pydantic
cquil11 Oct 29, 2025
b84fffe
add filtered sweep
cquil11 Oct 29, 2025
df8877d
allow multiple filter values
cquil11 Oct 29, 2025
b0aaf6a
reverse seq len mapping
cquil11 Oct 29, 2025
de9e367
less verbose
cquil11 Oct 29, 2025
6df2657
deleting files
cquil11 Oct 29, 2025
5729c67
list tp ep dpa then conc
cquil11 Oct 30, 2025
6edcc3a
removing 70b stuff
cquil11 Oct 30, 2025
69844b2
temp fix (#148)
cquil11 Oct 29, 2025
1105aea
remove: llama 70b
functionstackx Oct 30, 2025
24ea7de
revert remove: llama 70b
Oct 30, 2025
b89047d
remove llama 70b (#149)
functionstackx Oct 30, 2025
be3b40f
testing concurrency
cquil11 Oct 26, 2025
13803ac
adding more workflows
cquil11 Oct 27, 2025
422e5b8
deleting files
cquil11 Oct 29, 2025
2d1e457
cleaning up after rebase
cquil11 Oct 30, 2025
534d98c
adding docs for configs; adding field to configs
cquil11 Oct 30, 2025
e216920
hash on dpa too
cquil11 Oct 30, 2025
751d092
debug
cquil11 Oct 30, 2025
d4f57a7
debug
cquil11 Oct 30, 2025
825aa7e
debug
cquil11 Oct 30, 2025
232b33b
debug
cquil11 Oct 30, 2025
d2d025e
update hashing
cquil11 Oct 30, 2025
e95af11
deleting extraneous file
cquil11 Oct 30, 2025
bed5406
adding gb200
cquil11 Oct 30, 2025
475559a
adding gb200 pt 2
cquil11 Oct 30, 2025
f24799b
adding gb200 pt 3
cquil11 Oct 30, 2025
5f61cd3
adding gb200 to other isl osl sweeps
cquil11 Oct 30, 2025
89ebc6e
adding gb200 to other isl osl sweeps
cquil11 Oct 30, 2025
04b614a
adding gb200 test
cquil11 Oct 30, 2025
ab052fd
adding gb200 test
cquil11 Oct 30, 2025
6495caa
adding gb200 test
cquil11 Oct 30, 2025
589382d
adding full sweep test
cquil11 Oct 30, 2025
b920ec4
adding full sweep test pt 2
cquil11 Oct 30, 2025
d4c5dbc
adding full sweep test pt 2
cquil11 Oct 31, 2025
02deb3d
adding full sweep test pt 2
cquil11 Oct 31, 2025
18c26b3
adding full sweep test pt 2
cquil11 Oct 31, 2025
f1477e5
adding full sweep test pt 2
cquil11 Oct 31, 2025
d64d907
adding full sweep test pt 2
cquil11 Oct 31, 2025
d6bf37e
adding full sweep test pt 2
cquil11 Oct 31, 2025
dba3b4c
adding full sweep test pt 2
cquil11 Oct 31, 2025
a45e4bf
adding full sweep test pt 2
cquil11 Oct 31, 2025
60233aa
adding full sweep test pt 2
cquil11 Oct 31, 2025
c1b5ddd
adding full sweep test pt 2
cquil11 Oct 31, 2025
2cd0295
adding full sweep test pt 2
cquil11 Oct 31, 2025
3065c13
reverting title
cquil11 Oct 31, 2025
89d6dc3
adding full sweep test pt 2
cquil11 Oct 31, 2025
68e2462
adding full sweep test pt 2
cquil11 Oct 31, 2025
04992c4
reverting title
cquil11 Oct 31, 2025
f2f1a5e
fixing test files
cquil11 Oct 31, 2025
9d2cbbb
fixing gha syntax error
cquil11 Oct 31, 2025
7164cde
fixing gha syntax error
cquil11 Oct 31, 2025
5eb1f90
fixing error in multinode script
cquil11 Oct 31, 2025
9318ba7
bug fxes
cquil11 Oct 31, 2025
5a56794
debug
cquil11 Oct 31, 2025
912d70d
debug
cquil11 Oct 31, 2025
10a9dc0
Merge remote-tracking branch 'origin/main' into initial-refactor
cquil11 Oct 31, 2025
98362f1
celaning up the full sweep sched
cquil11 Oct 31, 2025
1eb74b9
celaning up other workflows
cquil11 Oct 31, 2025
f78de57
docs
cquil11 Oct 31, 2025
d233ea2
remove concurrency locks
cquil11 Oct 31, 2025
4e1228b
add dpa to results filename
cquil11 Oct 31, 2025
d816ef4
add back plotting
cquil11 Oct 31, 2025
249a94c
testing concurrency
cquil11 Oct 26, 2025
6589e53
adding more workflows
cquil11 Oct 27, 2025
3695ed5
deleting files
cquil11 Oct 29, 2025
b328c7f
temp fix (#148)
cquil11 Oct 29, 2025
264186f
testing concurrency
cquil11 Oct 26, 2025
e9e0e70
update random range ratio default
cquil11 Oct 31, 2025
bbc2220
get process results vals from env vars instead of argv
cquil11 Oct 31, 2025
d5ec7de
get process results vals from env vars instead of argv pt 2
cquil11 Oct 31, 2025
6af36ef
editing runners yaml
cquil11 Oct 31, 2025
cefcf15
testing concurrency
cquil11 Oct 26, 2025
46545a9
adding more workflows
cquil11 Oct 27, 2025
e59f2d7
deleting files
cquil11 Oct 29, 2025
fe445a1
testing concurrency
cquil11 Oct 26, 2025
d154049
testing concurrency
cquil11 Oct 26, 2025
880d3c8
testing concurrency
cquil11 Oct 26, 2025
026d16b
remove 70b
cquil11 Oct 31, 2025
4a81cd4
cleaning up after rebase
cquil11 Oct 31, 2025
cac35bc
changing name of files from XkYk to shceduler
cquil11 Oct 31, 2025
b60289e
double check and update master configs
cquil11 Nov 3, 2025
9fba14a
double check and update master configs pt 2
cquil11 Nov 3, 2025
c331874
add pydantic pip install
cquil11 Nov 3, 2025
582e1b1
bug fix
cquil11 Nov 3, 2025
4b78c4a
update cron trigger to 9:00 PM CDT
cquil11 Nov 3, 2025
7c4c931
runner name bug in process result python script
cquil11 Nov 3, 2025
c1b3530
Merge branch 'main' into initial-refactor
cquil11 Nov 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions .github/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# How to Test Workflows

In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix-logic/generate_sweep_configs.py` script. The usage for this script is shown below:

```
usage: generate_sweep_configs.py [-h] {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} ...

Generate benchmark configurations from YAML config files

positional arguments:
{full-sweep,test-config,runner-model-sweep,runner-sweep,custom}
Available commands
full-sweep Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths
test-config Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.
runner-model-sweep Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate
that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner
nodes.
runner-sweep Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is
meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runner nodes that should run gptoss-120b
actually do so successfully.
custom Enter custom values

options:
-h, --help show this help message and exit
```

Instead of explaining each command at a high level, let's just walk through some common testing scenarios and describe how to run them.

**Scenario 1**: I want to increase the concurrency from 128 to 256 in the 1k1k scenario for the `dsr1-fp4-b200-sglang` config (from `.github/configs/nvidia-master.yaml`) and then test it.

Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input:
```
test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986046399

If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively.

Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error:

```
test-config --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --key dsr1-fp4-b200-sglang --seq-len 1k1k --runner-node mi300x-amd_0

ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'.
```

Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986053019/job/54229839736

**Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners.

Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input:
```
runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986283169

This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made.

**Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes.

Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input:
```
runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986292917

This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`.

For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs).

This is particularly useful when:
- You've made infrastructure changes to a specific runner type (driver updates, system configuration, Docker setup)
- You've added new runner nodes and want to validate they work with all existing model configurations
- You want to verify that all models remain compatible with a specific GPU type after system updates

**Key difference from Scenario 2**:
- `runner-sweep`: Fix a **model**, sweep across runners → "Does this model work on all its runners?"
- `runner-model-sweep`: Fix a **runner type**, sweep across models → "Do all models work on this runner type?"

## Additional Use Cases with `full-sweep`

The `full-sweep` command supports multiple filters that can be combined for targeted testing:

**Test all gptoss configurations on B200 with 1k1k sequence lengths:**
```
full-sweep --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

**Test all fp8 precision configs across all runners for 1k8k workloads:**
```
full-sweep --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml
```

**Test all TRT configs on H200 runners:**
```
full-sweep --framework trt --runner-type h200 h200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

**Quick smoke test of all configs (highest TP, lowest concurrency only):**
```
full-sweep --test-mode --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml
```

**Test specific model on specific hardware with specific sequence lengths:**
```
full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

## Custom One-off Tests

**Scenario 4**: I want to run a quick test with a custom image, model, or configuration that isn't in the config files yet.

Use the `custom` command to specify all parameters manually:
```
custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml
```

This runs a single 1k1k test job with your custom parameters on the specified runner node. Useful for:
- Testing new images before adding them to config files
- Quick validation of new models
- Experimenting with different frameworks or precisions
52 changes: 52 additions & 0 deletions .github/configs/CONFIGS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Configs

The config files in this directory are meant to be a "source of truth" for what benchmark configurations can/should be run. As such, they must follow a precise format which is described below.

## Master Configs (AMD, NVIDIA, etc.)

```yaml
entry-name:
image: string
model: string
model-prefix: string
runner: string
precision: string
framework: string
seq-len-configs:
- isl: int
osl: int
search-space:
- { tp: int, conc-start: int, conc-end: int }
# Optionally, specify 'ep' (expert-parallelism) and 'dp-attn' (data parallel attention)
- { tp: int, ep: int, dp-attn: bool, conc-start: int, conc-end: int }
- ...
- ...
```
Note: while not required, `entry-name` typically takes the format `<INFMAX_MODEL_PREFIX>-<PRECISION>-<GPU>-<FRAMEWORK>`.

The below list describes what each field is:

- `image`: The image used to serve the benchmark, e.g., `vllm/vllm-openai:v0.10.2`
- `model`: The model to serve, e.g., `openai/gpt-oss-120b`
- `model-prefix`: The canonical InferenceMAX model prefix reference, i.e., `dsr1` for Deepseek, `gptoss` for gptoss-120b, etc. This value is used to decipher which script in `benchmarks/` should be used in order to launch the benchmark.
- `runner`: This is the runner on which to run the benchmark. This must be a valid runner (key or value) from `runners.yaml`.
- `precision`: The precision to run the benchmark. Again, this is used to find which script to run in `benchmarks/`.
- `framework`: The framework (serving runtime) to serve the benchmark, e.g., `vllm`, `sglang`, `trt`.
- `seq-len-configs`: A list of possible sequence lengths to benchmark. Each entry must have the following fields:
- `isl`: An integer representing the input sequence length, e.g., `1024`
- `osl`: An integer representing the output sequence length, e.g., `8192`
- `search-space`: A list of configurations to run with respective `isl` and `osl`, each entry must be a dict with the following fields:
- `tp`: An integer representing the tensor parallelism level that the configuration will be served at.
- `conc-start`: An integer representing the starting level of concurrency e.g., `4`
- `conc-end`: An integer representing the ending level of concurrency (inclusive) e.g., `128`
- Note: the step factor between `conc-start` and `conc-end` is 2, so if `conc-start` is 4 and `conc-end` is 128, all concurrencies `4, 8, 16, 32, ..., 128` will be run.
- (Optional) `ep`: An integer representing the expert parallelism level that the configuration will be served at. Default is 1 (no expert parallelism) when not specified.
- (Optional) `dp-attn`: A boolean representing whether or not to activate data parallel attention for the configuration. Default is false when not specified.

Notes:
- No extra fields besides the ones listed may be specified, or else the benchmarks will fail to run.
- Setting the fields above, particularly `ep` and `dp-attn`, only guarantees that the respective values will be passed as environment variables to the benchmark scripts! Actually using those environment variables is an implementation detail at the level of the benchmark Bash script.

## Runners

The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `h200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs.
171 changes: 171 additions & 0 deletions .github/configs/amd-master.yaml
Comment thread
cquil11 marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# DeepSeek-R1-0528 (MXFP4 preview weights) served with SGLang on MI355X.
dsr1-fp4-mi355x-sglang:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915
  model: amd/DeepSeek-R1-0528-MXFP4-Preview
  model-prefix: dsr1
  runner: mi355x
  precision: fp4
  framework: sglang
  seq-len-configs:
    # 1k in / 1k out: sweep TP 4 and TP 8, concurrency 4..64 (x2 steps)
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 4, conc-start: 4, conc-end: 64 }
        - { tp: 8, conc-start: 4, conc-end: 64 }
    # 1k in / 8k out: TP 8 only
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    # 8k in / 1k out: TP 8 only
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-R1-0528 (FP8) served with SGLang on MI300X.
dsr1-fp8-mi300x-sglang:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi300x
  precision: fp8
  framework: sglang
  seq-len-configs:
    # All three ISL/OSL scenarios sweep TP 8, concurrency 4..64 (x2 steps).
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-R1-0528 (FP8) served with SGLang on MI325X.
# Same mi30x image as the MI300X entry above.
dsr1-fp8-mi325x-sglang:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi325x
  precision: fp8
  framework: sglang
  seq-len-configs:
    # All three ISL/OSL scenarios sweep TP 8, concurrency 4..64 (x2 steps).
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-R1-0528 (FP8) served with SGLang on MI355X (mi35x image).
dsr1-fp8-mi355x-sglang:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: mi355x
  precision: fp8
  framework: sglang
  seq-len-configs:
    # All three ISL/OSL scenarios sweep TP 8, concurrency 4..64 (x2 steps).
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, conc-start: 4, conc-end: 64 }

# gpt-oss-120b (FP4) served with vLLM on MI300X.
gptoss-fp4-mi300x-vllm:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1
  model: openai/gpt-oss-120b
  model-prefix: gptoss
  runner: mi300x
  precision: fp4
  framework: vllm
  seq-len-configs:
    # 1k in / 1k out
    - isl: 1024
      osl: 1024
      search-space:
        # TP 1 runs only the single conc=64 point
        - { tp: 1, conc-start: 64, conc-end: 64 }
        - { tp: 2, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 4, conc-end: 64 }
        - { tp: 8, conc-start: 4, conc-end: 16 }
    # 1k in / 8k out
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 1, conc-start: 64, conc-end: 64 }
        - { tp: 2, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 4, conc-end: 64 }
        - { tp: 8, conc-start: 4, conc-end: 16 }
    # 8k in / 1k out
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 64 }
        - { tp: 2, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 4, conc-end: 64 }
        - { tp: 8, conc-start: 4, conc-end: 16 }

# gpt-oss-120b (FP4) served with vLLM on MI325X.
gptoss-fp4-mi325x-vllm:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1
  model: openai/gpt-oss-120b
  model-prefix: gptoss
  runner: mi325x
  precision: fp4
  framework: vllm
  seq-len-configs:
    # 1k in / 1k out
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 64 }
        - { tp: 2, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 4, conc-end: 64 }
        - { tp: 8, conc-start: 4, conc-end: 64 }
    # 1k in / 8k out
    - isl: 1024
      osl: 8192
      search-space:
        # TP 1 and TP 4 run only the single conc=64 point
        - { tp: 1, conc-start: 64, conc-end: 64 }
        - { tp: 2, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 64, conc-end: 64 }
        - { tp: 8, conc-start: 4, conc-end: 64 }
    # 8k in / 1k out
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 64 }
        - { tp: 2, conc-start: 4, conc-end: 8 }
        - { tp: 4, conc-start: 4, conc-end: 8 }
        - { tp: 8, conc-start: 4, conc-end: 16 }

# gpt-oss-120b (FP4) served with vLLM on MI355X.
# Note: no TP 2 entry for this runner (unlike MI300X/MI325X above).
gptoss-fp4-mi355x-vllm:
  image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1
  model: openai/gpt-oss-120b
  model-prefix: gptoss
  runner: mi355x
  precision: fp4
  framework: vllm
  seq-len-configs:
    # 1k in / 1k out
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 4, conc-end: 8 }
        - { tp: 8, conc-start: 4, conc-end: 16 }
    # 1k in / 8k out
    - isl: 1024
      osl: 8192
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 64 }
        - { tp: 4, conc-start: 4, conc-end: 8 }
        - { tp: 8, conc-start: 4, conc-end: 16 }
    # 8k in / 1k out
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 1, conc-start: 4, conc-end: 64 }
        # TP 4 runs only the single conc=4 point
        - { tp: 4, conc-start: 4, conc-end: 4 }
        - { tp: 8, conc-start: 4, conc-end: 8 }
Loading