Merged
Changes from all commits
76 commits
d8e7c45
new serve file
SunMarc Mar 16, 2026
f238867
app
SunMarc Mar 16, 2026
be0291d
model_manager done
SunMarc Mar 16, 2026
e84d82e
update serve
SunMarc Mar 17, 2026
fb77305
style
SunMarc Mar 17, 2026
d869d62
poc done
SunMarc Mar 17, 2026
5aadd1a
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 17, 2026
bd734e8
renaming
SunMarc Mar 17, 2026
69d3264
fix
SunMarc Mar 17, 2026
f5afd6c
new tests
SunMarc Mar 17, 2026
fedad8e
update metrics and processor
SunMarc Mar 18, 2026
9b904b1
hardcode n_batch for now
SunMarc Mar 18, 2026
0084b91
add response api + compile
SunMarc Mar 19, 2026
1d5d1cb
more tests
SunMarc Mar 19, 2026
3d64a8c
add it for now but we will move it
SunMarc Mar 19, 2026
74b3593
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 19, 2026
552603c
remove cache impl
SunMarc Mar 19, 2026
3643ece
add back load_model
SunMarc Mar 20, 2026
12c0f55
fix naming
SunMarc Mar 20, 2026
d4ffdf4
add transcription
SunMarc Mar 20, 2026
68cd5bc
tool calls better !
SunMarc Mar 23, 2026
6da3f3c
vlm support for both response and chat endpoints
SunMarc Mar 24, 2026
a92ebe2
update bench
SunMarc Mar 24, 2026
76a5c83
fix vl test
SunMarc Mar 24, 2026
31e59c3
first iteration of cb
SunMarc Mar 26, 2026
962d039
cb tests
SunMarc Mar 26, 2026
13945c1
typing + review
SunMarc Mar 26, 2026
4abb194
update test
SunMarc Mar 26, 2026
1658981
better benchmark
SunMarc Mar 26, 2026
720ecdb
better stream
SunMarc Mar 27, 2026
4424635
update bench
SunMarc Mar 27, 2026
7d0cd77
fix
SunMarc Mar 27, 2026
533233c
serve refactored
SunMarc Mar 27, 2026
880e6e0
merge
SunMarc Mar 27, 2026
4aa7fec
update
SunMarc Mar 27, 2026
3ab4e09
fix
SunMarc Mar 27, 2026
06bacbb
style
SunMarc Mar 27, 2026
ef10618
simpler
SunMarc Mar 27, 2026
09d5fe1
style
SunMarc Mar 27, 2026
96b6b8b
update warmup
SunMarc Mar 27, 2026
07ecd2a
remove llamacpp integration for now
SunMarc Mar 27, 2026
fad7c25
style
SunMarc Mar 27, 2026
feed4cb
style
SunMarc Mar 27, 2026
abd4087
style again
SunMarc Mar 27, 2026
120e37b
Merge branch 'main' into output-callback-cb
SunMarc Mar 27, 2026
d550b9b
remove annotation
SunMarc Mar 27, 2026
ca06e2b
Merge branch 'main' into refactor-serving
SunMarc Mar 27, 2026
ac0d6a1
review !
SunMarc Mar 30, 2026
66314b5
Merge remote-tracking branch 'origin/main' into output-callback-cb
SunMarc Mar 30, 2026
9d52002
style
SunMarc Mar 30, 2026
c48aec3
much cleaner
SunMarc Mar 30, 2026
b13dacc
renamed
SunMarc Mar 30, 2026
7855606
remove bench for now
SunMarc Mar 30, 2026
ef1c710
batch output
SunMarc Mar 30, 2026
caaab6e
style
SunMarc Mar 30, 2026
4c1cd01
type
SunMarc Mar 30, 2026
702ff74
better tests
SunMarc Mar 30, 2026
80b5c78
update test
SunMarc Mar 30, 2026
a8461fc
queue draining
SunMarc Mar 30, 2026
480828d
Merge branch 'main' into output-callback-cb
SunMarc Mar 30, 2026
cb83702
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 30, 2026
9db52a0
add seed
SunMarc Mar 30, 2026
9485f68
Merge branch 'main' into refactor-serving
SunMarc Mar 30, 2026
160b2f6
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 31, 2026
40417ee
some logs
SunMarc Mar 31, 2026
3bd6a09
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 31, 2026
ced96c2
readd nathan feature + some minor fixes
SunMarc Mar 31, 2026
ff02cd7
fix
SunMarc Mar 31, 2026
307498e
guard transcription
SunMarc Mar 31, 2026
ffe4c64
better now
SunMarc Mar 31, 2026
06a7881
fix
SunMarc Mar 31, 2026
052cbc7
adding lock to see if this helps
SunMarc Apr 1, 2026
6799727
remove locks
SunMarc Apr 1, 2026
3a07c86
lock again
SunMarc Apr 1, 2026
7a7abf2
update bench and remove lock for now
SunMarc Apr 1, 2026
bbd5cb0
Merge branch 'main' into refactor-serving
SunMarc Apr 1, 2026
3 changes: 3 additions & 0 deletions docker/transformers-all-latest-gpu/Dockerfile
@@ -105,6 +105,9 @@ RUN python3 -m pip install --no-cache-dir python-Levenshtein
# For `FastSpeech2ConformerTokenizer` tokenizer
RUN python3 -m pip install --no-cache-dir g2p-en

# For serving tests (audio pipelines)
RUN python3 -m pip install --no-cache-dir librosa python-multipart

# For Some bitsandbytes tests
RUN python3 -m pip install --no-cache-dir einops

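Note: librosa is used to decode audio in the audio pipelines, and python-multipart is what a FastAPI app needs to parse multipart file uploads. A minimal smoke test of that path might look like the sketch below; the /v1/audio/transcriptions route, port, and model id are assumptions for illustration, not confirmed by this diff.

# Hypothetical request against the transcription endpoint; requests' files=
# argument sends multipart/form-data, which is why python-multipart is required server-side.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("sample.wav", f, "audio/wav")},
        data={"model": "openai/whisper-large-v3"},
    )
print(resp.json())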
38 changes: 28 additions & 10 deletions examples/pytorch/transformers_serve_cb_eval_job.py
@@ -16,18 +16,17 @@

from inspect_ai import eval
from inspect_ai.log import bundle_log_dir
from inspect_evals.gpqa import gpqa_diamond


def wait_for_server_up(server_process, timeout=600):
def wait_for_server_up(server_process, port=8000, timeout=600):
start_time = time.time()

import urllib.error
import urllib.request

while time.time() - start_time < timeout:
try:
req = urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=2)
req = urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=2)
if req.status == 200:
elapsed = time.time() - start_time
print("\n" + "=" * 70)
@@ -69,17 +68,29 @@ def main():
action="store_true",
help="Disable continuous batching (enabled by default)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Port for the transformers serve server (default: 8000)",
)
parser.add_argument(
"--limit",
type=int,
default=10,
help="Number of evaluation samples to run (default: 5)",
help="Number of evaluation samples to run (default: 10)",
)
parser.add_argument(
"--max-connections",
type=int,
default=10,
help="Maximum concurrent connections for evaluation (default: 2)",
help="Maximum concurrent connections for evaluation (default: 10)",
)
parser.add_argument(
"--temperature",
type=float,
default=0,
help="Temperature for generation (default: 0)",
)
parser.add_argument(
"--log-dir",
@@ -121,7 +132,7 @@
)
parser.add_argument(
"--cb-use-cuda-graph",
action="store_true",
action=argparse.BooleanOptionalAction,
help="Enable CUDA graphs for continuous batching performance",
)

@@ -133,6 +144,7 @@
serve_cmd = [
"transformers",
"serve",
args.model,
]

# Add continuous batching if not disabled
@@ -150,10 +162,14 @@

serve_cmd.extend(["--cb-max-memory-percent", str(args.cb_max_memory_percent)])

serve_cmd.append("--cb-use-cuda-graph")
if args.cb_use_cuda_graph is True:
serve_cmd.append("--cb-use-cuda-graph")
elif args.cb_use_cuda_graph is False:
serve_cmd.append("--no-cb-use-cuda-graph")

# Always use the kernels-community/flash-attn2 attention implementation
serve_cmd.extend(["--attn-implementation", "kernels-community/flash-attn2"])
serve_cmd.extend(["--port", str(args.port)])

print("Starting transformers serve with continuous batching...")
print(f"Model: {args.model}")
@@ -163,6 +179,7 @@
print(f"CB Max Batch Tokens: {args.cb_max_batch_tokens if args.cb_max_batch_tokens else 'auto'}")
print(f"CB Max Memory: {args.cb_max_memory_percent * 100}%")
print(f"CB CUDA Graph: {args.cb_use_cuda_graph}")
print(f"Temperature: {args.temperature}")
print(f"Command: {' '.join(serve_cmd)}")
print("=" * 70)
print("SERVER OUTPUT:")
@@ -171,16 +188,17 @@
# Start server with output going directly to stdout/stderr
server_process = subprocess.Popen(serve_cmd, stdout=None, stderr=None)

wait_for_server_up(server_process, timeout=600)
wait_for_server_up(server_process, port=args.port, timeout=600)

eval(
gpqa_diamond,
"hf/Idavidrein/gpqa/diamond",
model=f"openai-api/transformers-serve/{args.model}",
log_dir=args.log_dir,
model_base_url="http://localhost:8000/v1",
model_base_url=f"http://localhost:{args.port}/v1",
display="plain",
limit=args.limit,
model_args={"stream": False},
temperature=args.temperature,
max_connections=args.max_connections,
max_tokens=2048,
)
21 changes: 17 additions & 4 deletions src/transformers/cli/chat.py
@@ -114,11 +114,17 @@ async def stream_output(self, stream: AsyncIterator[ChatCompletionStreamOutput])
self._console.print(f"[bold blue]<{self.model_id}>:")
with Live(console=self._console, refresh_per_second=4) as live:
text = ""
completion_tokens = 0
start_time = time.time()
finish_reason: str | None = None
async for token in await stream:
outputs = token.choices[0].delta.content
finish_reason = getattr(token.choices[0], "finish_reason", finish_reason)

usage = getattr(token, "usage", None)
if usage is not None:
completion_tokens = getattr(usage, "completion_tokens", completion_tokens)

if not outputs:
continue

@@ -154,6 +160,11 @@ async def stream_output(self, stream: AsyncIterator[ChatCompletionStreamOutput])
# Update the Live console output
live.update(markdown, refresh=True)

elapsed = time.time() - start_time
if elapsed > 0 and completion_tokens > 0:
tok_per_sec = completion_tokens / elapsed
self._console.print()
self._console.print(f"[dim]{completion_tokens} tokens in {elapsed:.1f}s ({tok_per_sec:.1f} tok/s)[/dim]")
self._console.print()

return text, finish_reason
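The per-chunk guard on usage above matches how OpenAI-style streams report token counts: most delta chunks carry usage=None, and a trailing chunk holds the final completion_tokens (in the OpenAI API this requires stream_options={"include_usage": True}; whether this server sends it unconditionally is not shown here). A dict-based sketch of the accumulation, standing in for the real ChatCompletionStreamOutput objects:

completion_tokens = 0
for chunk in [{"usage": None}, {"usage": None}, {"usage": {"completion_tokens": 42}}]:
    usage = chunk["usage"]  # None on ordinary delta chunks
    if usage is not None:
        completion_tokens = usage.get("completion_tokens", completion_tokens)
print(completion_tokens)  # 42, the count behind the tok/s line printed above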
@@ -544,14 +555,16 @@ async def _inner_run(self):
else:
chat.append({"role": "user", "content": user_input})

extra_body = {
"generation_config": config.to_json_string(),
"model": self.model_id,
}

stream = client.chat_completion(
chat,
stream=True,
model=self.model_id,
extra_body={
"generation_config": config.to_json_string(),
"model": self.model_id,
},
extra_body=extra_body,
)

model_output, finish_reason = await interface.stream_output(stream)
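For reference, the extra_body dict above ships the chat session's GenerationConfig to the server as a JSON string so generation runs with the client's sampling settings. A sketch of the payload with hypothetical values (the real config is built from the session state):

from transformers import GenerationConfig

config = GenerationConfig(max_new_tokens=256, temperature=0.7)  # hypothetical values
extra_body = {
    "generation_config": config.to_json_string(),
    "model": "Qwen/Qwen2.5-0.5B-Instruct",  # hypothetical model id
}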