UnieAI · ZoneTwelve · Mar 17, 2026 · Mar 30, 2026
diff --git a/.gitignore b/.gitignore
@@ -241,3 +241,5 @@ vllm/grpc/vllm_engine_pb2.pyi
 
 # Ignore generated cpu headers 
 csrc/cpu/cpu_attn_dispatch_generated.h
+
+logs/
diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
@@ -132,13 +132,66 @@ The algorithm for exploring different workload levels can be summarized as follo
 
 You can override the number of iterations in the algorithm by setting `--workload-iters`.
 
-!!! tip
-    This is our equivalent of [GuideLLM's `--profile sweep`](https://github.com/vllm-project/guidellm/blob/v0.5.3/src/guidellm/benchmark/profiles.py#L575).
+!!! important
+    SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
+
+    For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
+
+### Optuna auto-tuner
+
+`vllm bench sweep serve_optuna` uses Optuna to tune serve arguments and scores each trial across multiple benchmark concurrencies.
+
+The score formula is:
+
+`sum(mean(score_metric) / concurrency)`
+
+where the sum runs over every value in `--score-concurrencies`, and `mean(score_metric)` is computed across the `--num-runs` runs for that concurrency.
+
+1. (Optional) Create a JSON search space file:
+
+```json
+{
+  "gpu_memory_utilization": { "type": "float", "low": 0.7, "high": 0.98, "step": 0.02 },
+  "max_num_batched_tokens": { "type": "categorical", "choices": [null, 512, 1024, 2048, 4096, 8192] },
+  "max_num_seqs": { "type": "categorical", "choices": [null, 8, 16, 32, 64, 128, 256] },
+  "enable_chunked_prefill": { "type": "bool" },
+  "enable_prefix_caching": { "type": "bool" }
+}
+```
+
+If `--search-space` is omitted, `serve_optuna` uses built-in defaults:
+
+- `gpu_memory_utilization` in `[0.7, 0.98]` (step `0.02`)
+- `max_num_batched_tokens` in `[null, 512, 1024, 2048, 4096, 8192]`
+- `max_num_seqs` in `[null, 8, 16, 32, 64, 128, 256]`
+- `enable_chunked_prefill` in `[true, false]`
+- `enable_prefix_caching` in `[true, false]`
+
+2. Run the optimizer:
+
+```bash
+vllm bench sweep serve_optuna \
+    --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
+    --search-space benchmarks/search_space.json \
+    --score-metric total_token_throughput \
+    --score-concurrencies 1,8,64,256 \
+    --n-trials 20 \
+    -o benchmarks/results
+```
+
+`--bench-cmd` is optional. When omitted, `serve_optuna` uses `vllm bench serve`
+and auto-fills model/base-url/tokenizer from `--serve-cmd`.
+
+By default, `serve_optuna` also launches the best server configuration at the end.
+Use `--no-start-best-server` to disable this behavior.
+
+3. Inspect outputs under the timestamped run directory:
 
-    In general, `--workload-var max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
-    Nevertheless, we default to `--workload-var request_rate` to maintain similar behavior as GuideLLM.
+- `baseline.json`: baseline run score and full per-concurrency benchmark payload.
+- `trials.json`: all trial records (`complete`, `pruned`, and the baseline run).
+- `best_params.json`: best Optuna parameters.
+- `best.json`: best trial score and benchmark payload.
 
-## Startup Benchmark
+### Startup
 
 `vllm bench sweep startup` runs `vllm bench startup` across parameter combinations to compare cold/warm startup time for different engine settings.
 

diff --git a/docs/cli/README.md b/docs/cli/README.md
@@ -9,7 +9,7 @@ vllm --help
 Available Commands:
 
 ```bash
-vllm {chat,complete,serve,bench,collect-env,run-batch}
+vllm {chat,complete,serve,serve-optuna,bench,collect-env,run-batch}
 ```
 
 ## serve
@@ -147,6 +147,24 @@ vllm bench throughput \
 
 See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments.
 
+## serve-optuna
+
+Tune `vllm serve` parameters with Optuna, scoring each trial by benchmarking it across multiple concurrencies.
+
+```bash
+vllm serve-optuna \
+    --serve-cmd 'vllm serve Qwen/Qwen3-0.6B' \
+    --score-concurrencies 1,8,64,256 \
+    --n-trials 20 \
+    -o benchmarks/results
+```
+
+`--search-space` is optional. If omitted, vLLM uses built-in default serve tuning ranges.
+`--bench-cmd` is optional. If omitted, vLLM auto-fills model/base-url/tokenizer from `--serve-cmd`.
+By default, the best server config is started after optimization. Use `--no-start-best-server` to skip.
+
+See [vllm bench sweep serve_optuna](./bench/sweep/serve_optuna.md) for the full reference of all available arguments.
+
 ## collect-env
 
 Start collecting environment information.

diff --git a/docs/cli/bench/sweep/serve_optuna.md b/docs/cli/bench/sweep/serve_optuna.md
@@ -0,0 +1,9 @@
+# vllm bench sweep serve_optuna
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/generated/argparse/bench_sweep_serve_optuna.inc.md"
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
@@ -103,6 +103,12 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
 bench_sweep_serve_workload = auto_mock(
     "vllm.benchmarks.sweep.serve_workload", "SweepServeWorkloadArgs"
 )
+bench_sweep_serve_optuna = auto_mock(
+    "vllm.benchmarks.sweep.serve_optuna", "SweepServeOptunaArgs"
+)
+bench_sweep_serve_sla = auto_mock(
+    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+)
 bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
 EngineArgs = auto_mock("vllm.engine.arg_utils", "EngineArgs")
@@ -232,6 +238,10 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         "bench_sweep_serve_workload": create_parser(
             bench_sweep_serve_workload.add_cli_args
         ),
+        "bench_sweep_serve_optuna": create_parser(
+            bench_sweep_serve_optuna.add_cli_args
+        ),
+        "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
         "bench_throughput": create_parser(bench_throughput.add_cli_args),
     }
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -40,6 +40,7 @@ Slack="https://slack.vllm.ai/"
 
 [project.scripts]
 vllm = "vllm.entrypoints.cli.main:main"
+unieinfra = "vllm.entrypoints.unieai:main"
 
 [project.entry-points."vllm.general_plugins"]
 lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"

diff --git a/requirements/common.txt b/requirements/common.txt
@@ -55,3 +55,5 @@ opentelemetry-sdk >= 1.27.0
 opentelemetry-api >= 1.27.0
 opentelemetry-exporter-otlp >= 1.27.0
 opentelemetry-semantic-conventions-ai >= 0.4.1
+cryptography  # Required for UnieAI license verification in unieai.py
+uvloop  # Optional event loop for performance in unieai.py and other entrypoints
Original file line number	Diff line number	Diff line change
Expand Up		@@ -241,3 +241,5 @@ vllm/grpc/vllm_engine_pb2.pyi

		# Ignore generated cpu headers
		csrc/cpu/cpu_attn_dispatch_generated.h

		logs/