diff --git a/examples/multimodal/components/processor.py b/examples/multimodal/components/processor.py
index e5654e0c8b39..0588d4396ef6 100644
--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -15,7 +15,8 @@
 import uvloop
 from transformers import AutoTokenizer
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.completion.protocol import CompletionRequest
 from vllm.outputs import RequestOutput
 from vllm.tokenizers import TokenizerLike as AnyTokenizer
 from vllm.utils.argparse_utils import FlexibleArgumentParser
diff --git a/examples/multimodal/utils/chat_processor.py b/examples/multimodal/utils/chat_processor.py
index e70a794040ad..84c644a17f67 100644
--- a/examples/multimodal/utils/chat_processor.py
+++ b/examples/multimodal/utils/chat_processor.py
@@ -20,15 +20,15 @@
 from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.chat_utils import ConversationMessage
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    CompletionRequest,
-    RequestResponseMetadata,
-)
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
-from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.completion.protocol import CompletionRequest
+from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
+from vllm.entrypoints.openai.engine.protocol import RequestResponseMetadata
+from vllm.entrypoints.openai.models.protocol import BaseModelPath
+from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.inputs.data import TokensPrompt
+from vllm.renderers.registry import renderer_from_config
 from vllm.sampling_params import SamplingParams
 from vllm.tokenizers import TokenizerLike as AnyTokenizer
 
@@ -41,6 +41,7 @@
 class StubEngineClient:
 
     def __init__(self, model_config: ModelConfig):
         self.model_config = model_config
+        self.renderer = renderer_from_config(model_config)
         self.input_processor = None
         self.io_processor = None
@@ -154,9 +155,6 @@ def parse_raw_request(
 
     async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
         request = self.parse_raw_request(raw_request)
-        # TODO: Revisit this later when adding multi-modal support for the frontend.
-        # If no chat template is provided and tokenizer doesn't have one,
-        # use a simple format that just concatenates messages
         if not request.chat_template and not self.tokenizer.chat_template:
             chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}Assistant:"
         else:
@@ -167,20 +165,14 @@ async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResu
             engine_prompts,
         ) = await self.openai_serving._preprocess_chat(
             request,
-            self.tokenizer,
             request.messages,
-            chat_template=chat_template,
-            chat_template_content_format=self.openai_serving.chat_template_content_format,
-            add_generation_prompt=request.add_generation_prompt,
-            continue_final_message=request.continue_final_message,
+            default_template=chat_template,
+            default_template_content_format=self.openai_serving.chat_template_content_format,
+            default_template_kwargs=None,
             tool_dicts=None,
-            documents=request.documents,
-            chat_template_kwargs=request.chat_template_kwargs,
-            tool_parser=self.openai_serving.tool_parser,
-            add_special_tokens=request.add_special_tokens,
+            tool_parser=None,
         )
-        # In newer vLLM, _preprocess_chat returns (conversation, engine_prompts) - 2 values
         if not conversation or not engine_prompts:
             raise ValueError(
                 "Preprocessing returned empty conversation or engine_prompts"
             )
@@ -305,19 +297,14 @@ def parse_raw_request(self, raw_request: CompletionRequest) -> CompletionRequest
 
     async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
         request = self.parse_raw_request(raw_request)
 
-        # In newer vLLM, _preprocess_completion was removed
-        # Use the renderer approach instead
-        renderer = self.openai_serving._get_renderer(self.tokenizer)
-        config = self.openai_serving._build_render_config(request)
-        engine_prompts = await renderer.render_prompt_and_embeds(
-            prompt_or_prompts=request.prompt,
+        engine_prompts = await self.openai_serving._preprocess_completion(
+            request,
+            prompt_input=request.prompt,
             prompt_embeds=getattr(request, "prompt_embeds", None),
-            config=config,
         )
-        # engine_prompts is now a list of TokensPrompt
         if not engine_prompts:
-            raise ValueError("Renderer returned empty engine_prompts")
+            raise ValueError("Preprocessing returned empty engine_prompts")
         return PreprocessResult(None, engine_prompts[0])
 
     async def stream_response(
@@ -332,6 +319,7 @@
         raise ValueError("Only streaming responses are supported")
         async for raw_response in self.openai_serving.completion_stream_generator(
             request,
+            [],  # engine_prompts (not needed for streaming output)
             result_generator,
             request_id,
             int(time.time()),  # created_time
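Reviewer note: the sketch below condenses the call-site migration this diff performs, assuming the vLLM tree this PR targets. The `vllm.*` paths and the `_preprocess_chat` / `_preprocess_completion` keyword names are taken from the hunks above, not verified against any released vLLM; `serving_chat`, `serving_completion`, `client`, and `chat_template` are hypothetical stand-ins for the objects the example code builds.

    # Sketch only: mirrors the new-style preprocessing calls introduced above.
    from vllm.config import ModelConfig
    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
    from vllm.entrypoints.openai.completion.protocol import CompletionRequest
    from vllm.renderers.registry import renderer_from_config


    def attach_renderer(client, model_config: ModelConfig) -> None:
        # Engine clients are now expected to carry a renderer; the registry
        # builds one from the model config (as StubEngineClient does above).
        client.renderer = renderer_from_config(model_config)


    async def preprocess_chat(serving_chat, request: ChatCompletionRequest, chat_template):
        # _preprocess_chat no longer takes the tokenizer or the per-request knobs
        # (add_generation_prompt, documents, chat_template_kwargs,
        # add_special_tokens, ...); presumably those are read off `request`
        # internally now. Template settings travel as default_template* kwargs,
        # i.e. defaults that any template carried by the request would override.
        conversation, engine_prompts = await serving_chat._preprocess_chat(
            request,
            request.messages,
            default_template=chat_template,
            default_template_content_format=serving_chat.chat_template_content_format,
            default_template_kwargs=None,
            tool_dicts=None,
            tool_parser=None,
        )
        return conversation, engine_prompts[0]


    async def preprocess_completion(serving_completion, request: CompletionRequest):
        # In this tree, _preprocess_completion is the entry point again; the
        # interim renderer plumbing (_get_renderer / _build_render_config) is
        # gone from the call site.
        engine_prompts = await serving_completion._preprocess_completion(
            request,
            prompt_input=request.prompt,
            prompt_embeds=getattr(request, "prompt_embeds", None),
        )
        return engine_prompts[0]

One behavioral note: completion_stream_generator now takes an engine_prompts argument ahead of the result generator; the diff passes `[]` because the prompts are not needed when only the streamed output is consumed.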