Merged
Changes from all commits
76 commits
d8e7c45
new serve file
SunMarc Mar 16, 2026
f238867
app
SunMarc Mar 16, 2026
be0291d
model_manager done
SunMarc Mar 16, 2026
e84d82e
update serve
SunMarc Mar 17, 2026
fb77305
style
SunMarc Mar 17, 2026
d869d62
poc done
SunMarc Mar 17, 2026
5aadd1a
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 17, 2026
bd734e8
renaming
SunMarc Mar 17, 2026
69d3264
fix
SunMarc Mar 17, 2026
f5afd6c
new tests
SunMarc Mar 17, 2026
fedad8e
update metrics and processor
SunMarc Mar 18, 2026
9b904b1
hardcode n_batch for now
SunMarc Mar 18, 2026
0084b91
add response api + compile
SunMarc Mar 19, 2026
1d5d1cb
more tests
SunMarc Mar 19, 2026
3d64a8c
add it for now but we will move it
SunMarc Mar 19, 2026
74b3593
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 19, 2026
552603c
remove cache impl
SunMarc Mar 19, 2026
3643ece
add back load_model
SunMarc Mar 20, 2026
12c0f55
fix naming
SunMarc Mar 20, 2026
d4ffdf4
add transcription
SunMarc Mar 20, 2026
68cd5bc
tool calls better !
SunMarc Mar 23, 2026
6da3f3c
vlm support for both response and chat endpoints
SunMarc Mar 24, 2026
a92ebe2
update bench
SunMarc Mar 24, 2026
76a5c83
fix vl test
SunMarc Mar 24, 2026
31e59c3
first iteration of cb
SunMarc Mar 26, 2026
962d039
cb tests
SunMarc Mar 26, 2026
13945c1
typing + review
SunMarc Mar 26, 2026
4abb194
update test
SunMarc Mar 26, 2026
1658981
better benchmark
SunMarc Mar 26, 2026
720ecdb
better stream
SunMarc Mar 27, 2026
4424635
update bench
SunMarc Mar 27, 2026
7d0cd77
fix
SunMarc Mar 27, 2026
533233c
serve refactored
SunMarc Mar 27, 2026
880e6e0
merge
SunMarc Mar 27, 2026
4aa7fec
update
SunMarc Mar 27, 2026
3ab4e09
fix
SunMarc Mar 27, 2026
06bacbb
style
SunMarc Mar 27, 2026
ef10618
simpler
SunMarc Mar 27, 2026
09d5fe1
style
SunMarc Mar 27, 2026
96b6b8b
update warmup
SunMarc Mar 27, 2026
07ecd2a
remove llamacpp integration for now
SunMarc Mar 27, 2026
fad7c25
style
SunMarc Mar 27, 2026
feed4cb
style
SunMarc Mar 27, 2026
abd4087
style again
SunMarc Mar 27, 2026
120e37b
Merge branch 'main' into output-callback-cb
SunMarc Mar 27, 2026
d550b9b
remove annotation
SunMarc Mar 27, 2026
ca06e2b
Merge branch 'main' into refactor-serving
SunMarc Mar 27, 2026
ac0d6a1
review !
SunMarc Mar 30, 2026
66314b5
Merge remote-tracking branch 'origin/main' into output-callback-cb
SunMarc Mar 30, 2026
9d52002
style
SunMarc Mar 30, 2026
c48aec3
much cleaner
SunMarc Mar 30, 2026
b13dacc
renamed
SunMarc Mar 30, 2026
7855606
remove bench for now
SunMarc Mar 30, 2026
ef1c710
batch output
SunMarc Mar 30, 2026
caaab6e
style
SunMarc Mar 30, 2026
4c1cd01
type
SunMarc Mar 30, 2026
702ff74
better tests
SunMarc Mar 30, 2026
80b5c78
update test
SunMarc Mar 30, 2026
a8461fc
queue draining
SunMarc Mar 30, 2026
480828d
Merge branch 'main' into output-callback-cb
SunMarc Mar 30, 2026
cb83702
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 30, 2026
9db52a0
add seed
SunMarc Mar 30, 2026
9485f68
Merge branch 'main' into refactor-serving
SunMarc Mar 30, 2026
160b2f6
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 31, 2026
40417ee
some logs
SunMarc Mar 31, 2026
3bd6a09
Merge remote-tracking branch 'origin/main' into refactor-serving
SunMarc Mar 31, 2026
ced96c2
readd nathan feature + some minor fixes
SunMarc Mar 31, 2026
ff02cd7
fix
SunMarc Mar 31, 2026
307498e
guard transcription
SunMarc Mar 31, 2026
ffe4c64
better now
SunMarc Mar 31, 2026
06a7881
fix
SunMarc Mar 31, 2026
052cbc7
adding lock to see if this helps
SunMarc Apr 1, 2026
6799727
remove locks
SunMarc Apr 1, 2026
3a07c86
lock again
SunMarc Apr 1, 2026
7a7abf2
update bench and remove lock for now
SunMarc Apr 1, 2026
bbd5cb0
Merge branch 'main' into refactor-serving
SunMarc Apr 1, 2026
3 changes: 3 additions & 0 deletions docker/transformers-all-latest-gpu/Dockerfile
@@ -105,6 +105,9 @@ RUN python3 -m pip install --no-cache-dir python-Levenshtein
# For `FastSpeech2ConformerTokenizer` tokenizer
RUN python3 -m pip install --no-cache-dir g2p-en

# For serving tests (audio pipelines)
RUN python3 -m pip install --no-cache-dir librosa python-multipart

# For Some bitsandbytes tests
RUN python3 -m pip install --no-cache-dir einops

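Note: librosa is used to decode audio in the audio pipelines, and python-multipart is what a FastAPI app needs to parse multipart file uploads. A minimal smoke test of that path might look like the sketch below; the /v1/audio/transcriptions route, port, and model id are assumptions for illustration, not confirmed by this diff.

# Hypothetical request against the transcription endpoint; requests' files=
# argument sends multipart/form-data, which is why python-multipart is required server-side.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("sample.wav", f, "audio/wav")},
        data={"model": "openai/whisper-large-v3"},
    )
print(resp.json())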
38 changes: 28 additions & 10 deletions examples/pytorch/transformers_serve_cb_eval_job.py
@@ -16,18 +16,17 @@

from inspect_ai import eval
from inspect_ai.log import bundle_log_dir
from inspect_evals.gpqa import gpqa_diamond


def wait_for_server_up(server_process, timeout=600):
def wait_for_server_up(server_process, port=8000, timeout=600):
start_time = time.time()

import urllib.error
import urllib.request

while time.time() - start_time < timeout:
try:
req = urllib.request.urlopen("http://127.0.0.1:8000/health", timeout=2)
req = urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=2)
if req.status == 200:
elapsed = time.time() - start_time
print("\n" + "=" * 70)
@@ -69,17 +68,29 @@ def main():
action="store_true",
help="Disable continuous batching (enabled by default)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Port for the transformers serve server (default: 8000)",
)
parser.add_argument(
"--limit",
type=int,
default=10,
help="Number of evaluation samples to run (default: 5)",
help="Number of evaluation samples to run (default: 10)",
)
parser.add_argument(
"--max-connections",
type=int,
default=10,
help="Maximum concurrent connections for evaluation (default: 2)",
help="Maximum concurrent connections for evaluation (default: 10)",
)
parser.add_argument(
"--temperature",
type=float,
default=0,
help="Temperature for generation (default: 0)",
)
parser.add_argument(
"--log-dir",
@@ -121,7 +132,7 @@
)
parser.add_argument(
"--cb-use-cuda-graph",
action="store_true",
action=argparse.BooleanOptionalAction,
help="Enable CUDA graphs for continuous batching performance",
)

@@ -133,6 +144,7 @@
serve_cmd = [
"transformers",
"serve",
args.model,
]

# Add continuous batching if not disabled
@@ -150,10 +162,14 @@

serve_cmd.extend(["--cb-max-memory-percent", str(args.cb_max_memory_percent)])

serve_cmd.append("--cb-use-cuda-graph")
if args.cb_use_cuda_graph is True:
serve_cmd.append("--cb-use-cuda-graph")
elif args.cb_use_cuda_graph is False:
serve_cmd.append("--no-cb-use-cuda-graph")

# Always use the kernels-community/flash-attn2 attention implementation
serve_cmd.extend(["--attn-implementation", "kernels-community/flash-attn2"])
serve_cmd.extend(["--port", str(args.port)])

print("Starting transformers serve with continuous batching...")
print(f"Model: {args.model}")
@@ -163,6 +179,7 @@
print(f"CB Max Batch Tokens: {args.cb_max_batch_tokens if args.cb_max_batch_tokens else 'auto'}")
print(f"CB Max Memory: {args.cb_max_memory_percent * 100}%")
print(f"CB CUDA Graph: {args.cb_use_cuda_graph}")
print(f"Temperature: {args.temperature}")
print(f"Command: {' '.join(serve_cmd)}")
print("=" * 70)
print("SERVER OUTPUT:")
@@ -171,16 +188,17 @@
# Start server with output going directly to stdout/stderr
server_process = subprocess.Popen(serve_cmd, stdout=None, stderr=None)

wait_for_server_up(server_process, timeout=600)
wait_for_server_up(server_process, port=args.port, timeout=600)

eval(
gpqa_diamond,
"hf/Idavidrein/gpqa/diamond",
model=f"openai-api/transformers-serve/{args.model}",
log_dir=args.log_dir,
model_base_url="http://localhost:8000/v1",
model_base_url=f"http://localhost:{args.port}/v1",
display="plain",
limit=args.limit,
model_args={"stream": False},
temperature=args.temperature,
max_connections=args.max_connections,
max_tokens=2048,
)
21 changes: 17 additions & 4 deletions src/transformers/cli/chat.py
@@ -114,11 +114,17 @@ async def stream_output(self, stream: AsyncIterator[ChatCompletionStreamOutput])
self._console.print(f"[bold blue]<{self.model_id}>:")
with Live(console=self._console, refresh_per_second=4) as live:
text = ""
completion_tokens = 0
start_time = time.time()
finish_reason: str | None = None
async for token in await stream:
outputs = token.choices[0].delta.content
finish_reason = getattr(token.choices[0], "finish_reason", finish_reason)

usage = getattr(token, "usage", None)
if usage is not None:
completion_tokens = getattr(usage, "completion_tokens", completion_tokens)

if not outputs:
continue

@@ -154,6 +160,11 @@ async def stream_output(self, stream: AsyncIterator[ChatCompletionStreamOutput])
# Update the Live console output
live.update(markdown, refresh=True)

elapsed = time.time() - start_time
if elapsed > 0 and completion_tokens > 0:
tok_per_sec = completion_tokens / elapsed
self._console.print()
self._console.print(f"[dim]{completion_tokens} tokens in {elapsed:.1f}s ({tok_per_sec:.1f} tok/s)[/dim]")
self._console.print()

return text, finish_reason
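The per-chunk guard on usage above matches how OpenAI-style streams report token counts: most delta chunks carry usage=None, and a trailing chunk holds the final completion_tokens (in the OpenAI API this requires stream_options={"include_usage": True}; whether this server sends it unconditionally is not shown here). A dict-based sketch of the accumulation, standing in for the real ChatCompletionStreamOutput objects:

completion_tokens = 0
for chunk in [{"usage": None}, {"usage": None}, {"usage": {"completion_tokens": 42}}]:
    usage = chunk["usage"]  # None on ordinary delta chunks
    if usage is not None:
        completion_tokens = usage.get("completion_tokens", completion_tokens)
print(completion_tokens)  # 42, the count behind the tok/s line printed above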
@@ -544,14 +555,16 @@ async def _inner_run(self):
else:
chat.append({"role": "user", "content": user_input})

extra_body = {
"generation_config": config.to_json_string(),
"model": self.model_id,
}

stream = client.chat_completion(
chat,
stream=True,
model=self.model_id,
extra_body={
"generation_config": config.to_json_string(),
"model": self.model_id,
},
extra_body=extra_body,
)

model_output, finish_reason = await interface.stream_output(stream)
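For reference, the extra_body dict above ships the chat session's GenerationConfig to the server as a JSON string so generation runs with the client's sampling settings. A sketch of the payload with hypothetical values (the real config is built from the session state):

from transformers import GenerationConfig

config = GenerationConfig(max_new_tokens=256, temperature=0.7)  # hypothetical values
extra_body = {
    "generation_config": config.to_json_string(),
    "model": "Qwen/Qwen2.5-0.5B-Instruct",  # hypothetical model id
}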