3 changes: 2 additions & 1 deletion examples/multimodal/components/processor.py
@@ -15,7 +15,8 @@
import uvloop
from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.outputs import RequestOutput
from vllm.tokenizers import TokenizerLike as AnyTokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser
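Both the old flat import path and the new per-endpoint protocol modules are visible in this hunk, so downstream code that has to run against more than one vLLM release could wrap the change in a compatibility shim. A minimal sketch (illustrative only, not part of this PR):

```python
# Illustrative fallback: prefer the new vLLM module layout, fall back to the
# old flat vllm.entrypoints.openai.protocol module on older releases.
try:
    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
    from vllm.entrypoints.openai.completion.protocol import CompletionRequest
except ImportError:  # older vLLM
    from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
```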
48 changes: 18 additions & 30 deletions examples/multimodal/utils/chat_processor.py
@@ -20,15 +20,15 @@
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.completion.protocol import CompletionRequest
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.inputs.data import TokensPrompt
from vllm.renderers.registry import renderer_from_config
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike as AnyTokenizer

@@ -41,6 +41,7 @@ class StubEngineClient:

def __init__(self, model_config: ModelConfig):
self.model_config = model_config
self.renderer = renderer_from_config(model_config)
self.input_processor = None
self.io_processor = None
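The stub now also builds a renderer from the model config, presumably so serving code that reads `renderer` off the engine client keeps working without a real engine. A minimal usage sketch, assuming `AsyncEngineArgs.create_model_config()` exists with this spelling (that helper and the model name are assumptions, not part of the PR):

```python
# Hypothetical wiring (not from this PR): build a ModelConfig from engine args,
# wrap it in the stub, and pass the stub wherever a live engine client is expected.
engine_args = AsyncEngineArgs(model="llava-hf/llava-1.5-7b-hf")  # example model, assumed
model_config = engine_args.create_model_config()  # assumed helper on AsyncEngineArgs
stub = StubEngineClient(model_config)
assert stub.renderer is not None  # renderer built via renderer_from_config(model_config)
```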

@@ -154,9 +155,6 @@ def parse_raw_request(
async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)

# TODO: Revisit this later when adding multi-modal support for the frontend.
# If no chat template is provided and tokenizer doesn't have one,
# use a simple format that just concatenates messages
if not request.chat_template and not self.tokenizer.chat_template:
chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}Assistant:"
else:
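The fallback template just concatenates turns and leaves a trailing `Assistant:` cue. Rendering it with jinja2 makes the resulting prompt shape concrete (a worked example, not code from this PR):

```python
from jinja2 import Template

# Same fallback template as in the hunk above, split across literals for readability.
fallback = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}User: {{ message['content'] }}\n"
    "{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n"
    "{% endif %}{% endfor %}Assistant:"
)
messages = [
    {"role": "user", "content": "Describe this image."},
    {"role": "assistant", "content": "It shows a cat on a sofa."},
    {"role": "user", "content": "What color is the cat?"},
]
print(Template(fallback).render(messages=messages))
# User: Describe this image.
# Assistant: It shows a cat on a sofa.
# User: What color is the cat?
# Assistant:
```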
@@ -167,20 +165,14 @@ async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
engine_prompts,
) = await self.openai_serving._preprocess_chat(
request,
self.tokenizer,
request.messages,
chat_template=chat_template,
chat_template_content_format=self.openai_serving.chat_template_content_format,
add_generation_prompt=request.add_generation_prompt,
continue_final_message=request.continue_final_message,
default_template=chat_template,
default_template_content_format=self.openai_serving.chat_template_content_format,
default_template_kwargs=None,
tool_dicts=None,
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
tool_parser=self.openai_serving.tool_parser,
add_special_tokens=request.add_special_tokens,
tool_parser=None,
)

# In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
if not conversation or not engine_prompts:
raise ValueError(
"Preprocessing returned empty conversation or engine_prompts"
@@ -305,19 +297,14 @@ def parse_raw_request(self, raw_request: CompletionRequest) -> CompletionRequest
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)

# In newer vLLM, _preprocess_completion was removed
# Use the renderer approach instead
renderer = self.openai_serving._get_renderer(self.tokenizer)
config = self.openai_serving._build_render_config(request)
engine_prompts = await renderer.render_prompt_and_embeds(
prompt_or_prompts=request.prompt,
engine_prompts = await self.openai_serving._preprocess_completion(
request,
prompt_input=request.prompt,
prompt_embeds=getattr(request, "prompt_embeds", None),
config=config,
)

# engine_prompts is now a list of TokensPrompt
if not engine_prompts:
raise ValueError("Renderer returned empty engine_prompts")
raise ValueError("Preprocessing returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0])
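The renderer-based path (`_get_renderer` / `_build_render_config` / `render_prompt_and_embeds`) is replaced by a direct `_preprocess_completion` call. A hedged sketch of how the surviving lines read together; only arguments visible in the hunk are shown, so the real call may carry more:

```python
# Sketch of the completion preprocessing flow after this hunk (arguments limited
# to those visible in the diff).
engine_prompts = await self.openai_serving._preprocess_completion(
    request,
    prompt_input=request.prompt,
    prompt_embeds=getattr(request, "prompt_embeds", None),
)
if not engine_prompts:
    raise ValueError("Preprocessing returned empty engine_prompts")
return PreprocessResult(None, engine_prompts[0])
```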

async def stream_response(
@@ -332,6 +319,7 @@ async def stream_response(
raise ValueError("Only streaming responses are supported")
async for raw_response in self.openai_serving.completion_stream_generator(
request,
[], # engine_prompts (not needed for streaming output)
result_generator,
request_id,
int(time.time()), # created_time