From aafd3dbbb0c52ccc7117a6345ddf505084ee2c42 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Thu, 19 Feb 2026 15:10:23 +0100 Subject: [PATCH 1/3] Fix mix of data and pipeline parallelism --- src/lighteval/models/vllm/vllm_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 8ac6fb0af..d586291d4 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -196,6 +196,7 @@ def __init__( ) self.data_parallel_size = config.data_parallel_size self.tensor_parallel_size = config.tensor_parallel_size + self.pipeline_parallel_size = config.pipeline_parallel_size self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False self._tokenizer = self._create_auto_tokenizer(config) @@ -275,7 +276,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]: self.model_args["load_format"] = config.load_format if config.data_parallel_size > 1: - self.model_args["distributed_executor_backend"] = "ray" + self.model_args["distributed_executor_backend"] = "mp" self._batch_size = "auto" if self._max_length is None: @@ -442,7 +443,7 @@ def _generate( if self.data_parallel_size > 1: - @ray.remote(num_gpus=self.tensor_parallel_size) + @ray.remote(num_gpus=self.tensor_parallel_size * self.pipeline_parallel_size) def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): llm = LLM(**model_args) return llm.generate( From e3fd675cceed9d9955419d9655ec1ed060dcc41e Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Fri, 20 Feb 2026 16:17:55 +0100 Subject: [PATCH 2/3] Add support of context parallelism for versions of VLLM that support it (>= 0.15). Unfortunately, it currently fails with VLLM 0.15.1 in our env: File ".../vllm/v1/worker/gpu_worker.py", line 412, in initialize_from_config self.model_runner.initialize_kv_cache(kv_cache_config) File ".../vllm/v1/worker/gpu_model_runner.py", line 5874, in initialize_kv_cache self.initialize_attn_backend(kv_cache_config) File ".../vllm/v1/worker/gpu_model_runner.py", line 5225, in initialize_attn_backend check_attention_cp_compatibility(self.vllm_config) File ".../vllm/v1/worker/cp_utils.py", line 39, in check_attention_cp_compatibility assert layer_impl.supports_pcp, ( AssertionError: PCP requires attention impls' support, but the impl FlashAttentionImpl does not support PCP. --- src/lighteval/models/vllm/vllm_model.py | 56 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index d586291d4..0271e4fc0 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -28,7 +28,8 @@ from typing import Coroutine, Optional import torch -from pydantic import NonNegativeFloat, NonNegativeInt, PositiveInt +from packaging.version import Version +from pydantic import NonNegativeFloat, NonNegativeInt, PositiveInt, model_validator from tqdm import tqdm from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset @@ -98,6 +99,16 @@ class VLLMModelConfig(ModelConfig): Number of GPUs to use for data parallelism. Defaults to 1. pipeline_parallel_size (PositiveInt): Number of GPUs to use for pipeline parallelism. Defaults to 1. + prefill_context_parallel_size (PositiveInt): + Number of GPUs to use for prefill context parallelism. Splits long sequences across GPUs + during the prefill phase, reducing peak KV-cache memory. Requires vllm >= 0.15.0 and an + attention backend that sets supports_pcp=True (not available in vllm 0.15.1). + Increases total GPU count by this factor. Defaults to 1 (disabled). + decode_context_parallel_size (PositiveInt): + Number of context parallel groups for the decode phase. Shards the KV cache along + the token dimension, reusing the existing TP GPUs (does not require extra GPUs). + tensor_parallel_size must be divisible by this value. Requires vllm >= 0.15.0. + Defaults to 1 (disabled). gpu_memory_utilization (NonNegativeFloat): Fraction of GPU memory to use. Lower this if running out of memory. Defaults to 0.9. enable_prefix_caching (bool): @@ -161,6 +172,18 @@ class VLLMModelConfig(ModelConfig): tensor_parallel_size: PositiveInt = 1 # how many GPUs to use for tensor parallelism data_parallel_size: PositiveInt = 1 # how many GPUs to use for data parallelism pipeline_parallel_size: PositiveInt = 1 # how many GPUs to use for pipeline parallelism + prefill_context_parallel_size: PositiveInt = 1 # context parallelism for prefill phase (requires vllm >= 0.15.0) + decode_context_parallel_size: PositiveInt = 1 # context parallelism for decode phase (requires vllm >= 0.15.0) + + @model_validator(mode="after") + def validate_context_parallelism(self) -> "VLLMModelConfig": + if self.decode_context_parallel_size > 1: + if self.tensor_parallel_size % self.decode_context_parallel_size != 0: + raise ValueError( + f"tensor_parallel_size ({self.tensor_parallel_size}) must be divisible by " + f"decode_context_parallel_size ({self.decode_context_parallel_size})." + ) + return self gpu_memory_utilization: NonNegativeFloat = 0.9 # lower this if you are running out of memory enable_prefix_caching: bool = None # whether to enable prefix caching to speed up generation. May use more memory. Should be disabled for LFM2 max_model_length: PositiveInt | None = ( @@ -197,6 +220,7 @@ def __init__( self.data_parallel_size = config.data_parallel_size self.tensor_parallel_size = config.tensor_parallel_size self.pipeline_parallel_size = config.pipeline_parallel_size + self.prefill_context_parallel_size = config.prefill_context_parallel_size self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False self._tokenizer = self._create_auto_tokenizer(config) @@ -275,6 +299,34 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]: if config.load_format is not None: self.model_args["load_format"] = config.load_format + if config.prefill_context_parallel_size > 1 or config.decode_context_parallel_size > 1: + from importlib.metadata import version as get_package_version + + _VLLM_MIN_VERSION_CP = Version("0.15.0") + _vllm_version = Version(get_package_version("vllm")) + if _vllm_version < _VLLM_MIN_VERSION_CP: + raise ValueError( + f"Context parallelism (prefill_context_parallel_size / decode_context_parallel_size) " + f"requires vllm >= {_VLLM_MIN_VERSION_CP}, but the installed version is {_vllm_version}." + ) + if config.prefill_context_parallel_size > 1: + # PCP requires attention backends to set supports_pcp=True. Check this early + # to avoid failing after several minutes of model loading. + try: + from vllm.v1.attention.backend import AttentionImplBase + + if not AttentionImplBase.supports_pcp: + raise NotImplementedError( + f"prefill_context_parallel_size > 1 is not supported by any attention " + f"backend in the installed vllm {_vllm_version}. " + f"Consider using tensor_parallel_size or decode_context_parallel_size instead." + ) + except ImportError: + pass # older vllm layout; let vllm raise its own error + self.model_args["prefill_context_parallel_size"] = config.prefill_context_parallel_size + if config.decode_context_parallel_size > 1: + self.model_args["decode_context_parallel_size"] = config.decode_context_parallel_size + if config.data_parallel_size > 1: self.model_args["distributed_executor_backend"] = "mp" self._batch_size = "auto" @@ -443,7 +495,7 @@ def _generate( if self.data_parallel_size > 1: - @ray.remote(num_gpus=self.tensor_parallel_size * self.pipeline_parallel_size) + @ray.remote(num_gpus=self.tensor_parallel_size * self.pipeline_parallel_size * self.prefill_context_parallel_size) def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): llm = LLM(**model_args) return llm.generate( From 637d2effae4dd481dfb8017d739ab88ce82cdf52 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Fri, 20 Feb 2026 16:19:35 +0100 Subject: [PATCH 3/3] remove unnecessary deps (already there) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index afa83e33c..98dc0d400 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,7 +122,7 @@ multilingual = [ "pyvi", # for vietnamese tokenizer ] math = ["latex2sympy2_extended==1.0.6"] -translation = ["unbabel-comet>=2.2.0", "sentencepiece"] +translation = ["unbabel-comet>=2.2.0"] wandb = ["wandb"] trackio = ["trackio"]