diff --git a/docs/guides/eval.md b/docs/guides/eval.md new file mode 100644 index 0000000000..8ac5ab5675 --- /dev/null +++ b/docs/guides/eval.md @@ -0,0 +1,33 @@ +# Evaluation + +## Start Evaluation + +### Start Script +```sh +# To run the evaluation with default config (examples/configs/eval.yaml) +uv run python examples/run_eval.py + +# Specify a custom config file +uv run python examples/run_eval.py --config path/to/custom_config.yaml + +# Override specific config values via command line +uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct" +``` + +### Example Output + +``` +============================================================ +model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime_2024' +score=0.10 (3.0/30) +============================================================ +``` + +## Configuration + +An example Evaluation configuration file can be found [here](../../examples/configs/eval.yaml). + +### Prompt Template Configuration +Always remember to use the same `prompt_file` and `system_prompt_file` that were used during training. + +For open-source models, we recommend setting `prompt_file=null` and `system_prompt_file=null` to allow them to use their native chat templates. diff --git a/docs/index.md b/docs/index.md index 56cd64ac1b..0628f19953 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,6 +18,7 @@ cluster.md adding_new_models.md guides/sft.md guides/grpo.md +guides/eval.md ``` ```{toctree} diff --git a/examples/configs/eval.yaml b/examples/configs/eval.yaml new file mode 100644 index 0000000000..a867e9617f --- /dev/null +++ b/examples/configs/eval.yaml @@ -0,0 +1,30 @@ +# Evaluation Configuration +generation: + backend: "vllm" # only vllm is supported for evaluation + max_new_tokens: ${generation.vllm_cfg.max_model_len} + temperature: 0.0 + top_p: 1.0 + top_k: -1 # disable + num_prompts_per_step: -1 # -1 means pass all prompts at once + model_name: "Qwen/Qwen2.5-Math-1.5B-Instruct" + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 2048 + +data: + max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation + prompt_file: null + system_prompt_file: null + dataset_name: "HuggingFaceH4/aime_2024" + dataset_key: "train" + problem_key: "problem" + solution_key: "answer" + +env: + math: + num_workers: 8 + +cluster: + gpus_per_node: 1 + num_nodes: 1 diff --git a/examples/run_eval.py b/examples/run_eval.py new file mode 100644 index 0000000000..54358ad260 --- /dev/null +++ b/examples/run_eval.py @@ -0,0 +1,145 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
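+
+# Example invocations (mirroring docs/guides/eval.md):
+#   uv run python examples/run_eval.py
+#   uv run python examples/run_eval.py --config path/to/custom_config.yaml
+#   uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct"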
+ +import argparse +import os +import pprint +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from datasets import load_dataset +from omegaconf import OmegaConf +from transformers import AutoTokenizer + +from examples.run_grpo_math import math_data_processor +from nemo_reinforcer.data import MathDataConfig +from nemo_reinforcer.data.datasets import AllTaskProcessedDataset +from nemo_reinforcer.data.interfaces import TaskDataSpec +from nemo_reinforcer.data.llm_message_utils import remap_dataset_keys +from nemo_reinforcer.distributed.virtual_cluster import init_ray +from nemo_reinforcer.environments.math_environment import MathEnvironment +from nemo_reinforcer.evals.eval import MasterConfig, run_env_eval, setup +from nemo_reinforcer.models.generation.interfaces import GenerationConfig + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Run Evaluation with configuration") + parser.add_argument( + "--config", type=str, default=None, help="Path to YAML config file" + ) + + # Parse known args for the script + args, remaining = parser.parse_known_args() + + # Convert remaining args to OmegaConf format + overrides = OmegaConf.from_dotlist(remaining) + + return args, overrides + + +def setup_data( + data_config: MathDataConfig, generation_config: GenerationConfig, env_configs +): + print("\n▶ Setting up data...") + math_task_spec = TaskDataSpec( + task_name="math", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + + # load dataset + base_dataset = load_dataset(data_config["dataset_name"]) + if data_config["dataset_key"] is not None: + base_dataset = base_dataset[data_config["dataset_key"]] + # remap problem and solution keys + remapped_dataset = remap_dataset_keys( + base_dataset, + mapping_dict={ + data_config["problem_key"]: "problem", + data_config["solution_key"]: "expected_answer", + }, + ) + + tokenizer = AutoTokenizer.from_pretrained(generation_config["model_name"]) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + math_env = MathEnvironment.options( + runtime_env={"py_executable": MathEnvironment.DEFAULT_PY_EXECUTABLE} + ).remote(env_configs["math"]) + + dataset = AllTaskProcessedDataset( + dataset=remapped_dataset, + tokenizer=tokenizer, + default_task_data_spec=math_task_spec, + task_data_processors=math_data_processor, + max_seq_length=data_config["max_input_seq_length"], + ) + + return dataset, math_env, tokenizer + + +def main(): + """Main entry point.""" + # Parse arguments + args, overrides = parse_args() + + if not args.config: + args.config = os.path.join(os.path.dirname(__file__), "configs", "eval.yaml") + + config = OmegaConf.load(args.config) + print(f"Loaded configuration from: {args.config}") + + if overrides: + override_conf = OmegaConf.from_cli() + print(f"Overrides: {override_conf}") + config = OmegaConf.merge(config, override_conf) + + config: MasterConfig = OmegaConf.to_container(config, resolve=True) + print("Applied CLI overrides") + + # Print config + print("Final config:") + pprint.pprint(config) + + # Init ray + init_ray() + + # Setup data + ( + dataset, + math_env, + tokenizer, + ) = setup_data(config["data"], config["generation"], config["env"]) + + # Setup + ( + vllm_generation, + dataloader, + master_config, + ) = setup(config, tokenizer, dataset) + + # Run evaluation + run_env_eval( + vllm_generation, + dataloader, + math_env, + master_config, + ) + + +if __name__ == 
"__main__": + main() diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index 7e2f3e693d..b87c7037b3 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -122,6 +122,8 @@ def math_data_processor( template = task_data_spec.custom_template message_log: LLMMessageLogType = [] + + # system prompt if task_data_spec.system_prompt: sys_message = {"role": "system", "content": task_data_spec.system_prompt} message = tokenizer.apply_chat_template( @@ -135,10 +137,11 @@ def math_data_processor( 0 ] message_log.append(sys_message) - user_message = { - "role": "user", - "content": task_data_spec.prompt.format(problem), - } + + # user prompt + if task_data_spec.prompt: + problem = task_data_spec.prompt.format(problem) + user_message = {"role": "user", "content": problem} message = tokenizer.apply_chat_template( [user_message], chat_template=template, @@ -167,8 +170,9 @@ def math_data_processor( "extra_env_info": extra_env_info, "loss_multiplier": loss_multiplier, "idx": idx, - "task_name": datum_dict["task_name"], } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] return output diff --git a/nemo_reinforcer/data/__init__.py b/nemo_reinforcer/data/__init__.py index 63aad516b2..09eaf35fb5 100644 --- a/nemo_reinforcer/data/__init__.py +++ b/nemo_reinforcer/data/__init__.py @@ -21,3 +21,8 @@ class DataConfig(TypedDict): system_prompt_file: Optional[str] dataset_name: str val_dataset_name: Optional[str] + + +class MathDataConfig(DataConfig): + problem_key: str + solution_key: str diff --git a/nemo_reinforcer/data/datasets.py b/nemo_reinforcer/data/datasets.py index 033cee05d7..8a81c85fb2 100644 --- a/nemo_reinforcer/data/datasets.py +++ b/nemo_reinforcer/data/datasets.py @@ -130,3 +130,54 @@ def rl_collate_fn(data_batch: List[DatumSpec]) -> BatchedDataDict: batch_max_length=batch_max_length, ) return output + + +def eval_collate_fn(data_batch: List[DatumSpec]) -> BatchedDataDict: + """Collate function for evaluation. + + Takes a list of data samples and combines them into a single batched dictionary + for model evaluation. + + Args: + data_batch: List of data samples with message_log, extra_env_info, and idx fields. + + Returns: + BatchedDataDict with message_log, extra_env_info, and idx fields. + + Examples: + ```{doctest} + >>> import torch + >>> from nemo_reinforcer.data.datasets import eval_collate_fn + >>> from nemo_reinforcer.data.interfaces import DatumSpec + >>> data_batch = [ + ... DatumSpec( + ... message_log=[{"role": "user", "content": "Hello", "token_ids": torch.tensor([1, 2, 3])}], + ... extra_env_info={'ground_truth': '1'}, + ... idx=0, + ... ), + ... DatumSpec( + ... message_log=[{"role": "assistant", "content": "Hi there", "token_ids": torch.tensor([4, 5, 6, 7])}], + ... extra_env_info={'ground_truth': '2'}, + ... idx=1, + ... ), + ... 
] + >>> output = eval_collate_fn(data_batch) + >>> output['message_log'][0] + [{'role': 'user', 'content': 'Hello', 'token_ids': tensor([1, 2, 3])}] + >>> output['message_log'][1] + [{'role': 'assistant', 'content': 'Hi there', 'token_ids': tensor([4, 5, 6, 7])}] + >>> output['extra_env_info'] + [{'ground_truth': '1'}, {'ground_truth': '2'}] + >>> output['idx'] + [0, 1] + ``` + """ + message_log = [datum_spec["message_log"] for datum_spec in data_batch] + extra_env_info = [datum_spec["extra_env_info"] for datum_spec in data_batch] + idx = [datum_spec["idx"] for datum_spec in data_batch] + + output = BatchedDataDict( + message_log=message_log, + extra_env_info=extra_env_info, + idx=idx, + ) + return output diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index 43e24fc1ce..5ae8bee9a8 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Union - +from typing import Dict, List import torch +from datasets import Dataset from nemo_reinforcer.data.interfaces import ( LLMMessageLogType, @@ -390,3 +390,27 @@ def get_formatted_message_log( prev_formatted_message = formatted_message return message_log + + +def remap_dataset_keys( + dataset: Dataset, + mapping_dict: Dict[str, str], +) -> Dataset: + """Remap dataset keys as per mapping. + + Args: + dataset: The input dataset to remap keys in + mapping_dict: A dictionary mapping input keys to output keys + + Returns: + Dataset: A new dataset with remapped keys + """ + # no need to remap if the keys are already correct + if all(k == v for k, v in mapping_dict.items()): + return dataset + + # return the remapped dataset + return dataset.map( + lambda x: {v: x[k] for k, v in mapping_dict.items()}, + remove_columns=list(mapping_dict.keys()), + ) diff --git a/nemo_reinforcer/evals/eval.py b/nemo_reinforcer/evals/eval.py new file mode 100644 index 0000000000..33d486a4d5 --- /dev/null +++ b/nemo_reinforcer/evals/eval.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
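+
+# Evaluation driver for the vLLM backend: `setup` builds the Ray cluster, the
+# vLLM generation workers, and a DataLoader over the processed eval dataset;
+# `run_env_eval` generates a response for every prompt, scores the responses
+# with the environment, and prints the average reward over the dataset.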
+ +import os +from typing import Tuple, TypedDict + +import ray +from torch.utils.data import DataLoader +from transformers import AutoTokenizer + +from nemo_reinforcer.data import MathDataConfig +from nemo_reinforcer.data.datasets import AllTaskProcessedDataset, eval_collate_fn +from nemo_reinforcer.data.llm_message_utils import get_keys_from_message_log +from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict +from nemo_reinforcer.distributed.virtual_cluster import ClusterConfig, RayVirtualCluster +from nemo_reinforcer.environments.math_environment import MathEnvConfig +from nemo_reinforcer.models.generation.interfaces import GenerationConfig +from nemo_reinforcer.models.generation.vllm import VllmGeneration + + +# =============================================================================== +# Configuration +# =============================================================================== + + +class MasterConfig(TypedDict): + generation: GenerationConfig + data: MathDataConfig + env: MathEnvConfig + cluster: ClusterConfig + + +# =============================================================================== +# Setup & Initialization +# =============================================================================== + + +def setup( + master_config: MasterConfig, + tokenizer: AutoTokenizer, + dataset: AllTaskProcessedDataset, +) -> Tuple[ + VllmGeneration, + DataLoader, + MasterConfig, +]: + """Set up components for model evaluation. + + Initializes the vLLM generation backend and data loader. + + Args: + master_config: Configuration settings. + tokenizer: Tokenizer providing the pad and eos token ids used for generation. + dataset: Dataset to evaluate on. + + Returns: + vLLM generation interface, data loader, and config. + """ + # Extract individual configs for easier access + generation_config = master_config["generation"] + cluster_config = master_config["cluster"] + + # ========================== + # Data + # ========================== + if generation_config["num_prompts_per_step"] == -1: + generation_config["num_prompts_per_step"] = len(dataset) + dataloader = DataLoader( + dataset, + batch_size=generation_config["num_prompts_per_step"], + shuffle=False, + collate_fn=eval_collate_fn, + ) + print(f" ✓ Evaluation dataset loaded with {len(dataset)} samples") + + # ========================== + # Cluster + # ========================== + print("\n▶ Setting up compute cluster...") + cluster = RayVirtualCluster( + name="eval_cluster", + bundle_ct_per_node_list=[cluster_config["gpus_per_node"]] + * cluster_config["num_nodes"], + use_gpus=True, + num_gpus_per_node=cluster_config["gpus_per_node"], + max_colocated_worker_groups=1, + ) + print(f" ✓ Ray cluster initialized with {cluster_config['num_nodes']} nodes") + + # ========================== + # Model + # ========================== + print("\n▶ Setting up model...") + # check backend + backend = generation_config["backend"] + assert backend == "vllm", "Only vLLM backend is supported for evaluation" + + # set vllm config + generation_config["vllm_cfg"]["load_format"] = "auto" + generation_config["vllm_cfg"]["skip_tokenizer_init"] = False + generation_config["stop_token_ids"] = [tokenizer.eos_token_id] + generation_config["pad_token"] = tokenizer.pad_token_id + + # initialize vllm generation + vllm_generation = VllmGeneration(cluster=cluster, config=generation_config) + print( + f" ✓ Using vLLM backend for generation with {generation_config['model_name']}" + ) + + print("\n" + "=" * 60) + print(" " * 18 + "SETUP COMPLETE") + print("=" * 60 + "\n") + + return ( + vllm_generation, + dataloader, + master_config, + ) + + +# 
=============================================================================== +# Evaluation +# =============================================================================== + + +def run_env_eval(vllm_generation, dataloader, env, master_config): + """Main entry point for running evaluation using environment. + + Generates model responses and evaluates them by env. + + Args: + vllm_generation: Model for generating responses. + dataloader: Data loader with evaluation samples. + env: Environment that scores responses. + master_config: Configuration settings. + """ + # Run evaluation loop + score, count = 0.0, 0 + for batch in dataloader: + # get input prompt from message_log + prompts = [] + for message_log in batch["message_log"]: + content = [message["content"] for message in message_log] + content = "\n".join(content) + prompts.append(content) + # generate by vllm + inputs = BatchedDataDict({"prompts": prompts}) + outputs = vllm_generation.generate_text(inputs)["texts"] + + # append to message_log + for idx, output in enumerate(outputs): + batch["message_log"][idx].append( + { + "role": "assistant", + "content": output, + } + ) + + # evaluate generations with the environment + to_env = [ + get_keys_from_message_log(batch["message_log"][i], ["role", "content"]) + for i in range(len(batch["message_log"])) + ] + _, _, rewards, _ = ray.get(env.step.remote(to_env, batch["extra_env_info"])) + + score += rewards.sum().item() + count += len(rewards) + + # Cleanup before printing results + ray.get(env.shutdown.remote()) + vllm_generation.shutdown() + + # Print results + dataset_name = os.path.basename(master_config["data"]["dataset_name"]) + model_name = os.path.basename(master_config["generation"]["model_name"]) + average_score = score / count + + print("\n" + "=" * 60) + print(f"{model_name=} {dataset_name=}") + print(f"score={average_score:.2f} ({score}/{count})") + print("=" * 60 + "\n") diff --git a/nemo_reinforcer/evals/run_env_eval.py b/nemo_reinforcer/evals/run_env_eval.py deleted file mode 100644 index 341a77c5bc..0000000000 --- a/nemo_reinforcer/evals/run_env_eval.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
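The scoring loop in `run_env_eval` above is easiest to follow on toy data: each sample's message contents are joined into one prompt string, the prompts are batched for `generate_text`, and the per-sample rewards returned by the environment are averaged into the reported score. A minimal sketch of that bookkeeping (illustrative only; the hard-coded `rewards` stand in for the output of `env.step`, and the binary values are an assumption consistent with the example output in docs/guides/eval.md):

```python
# Illustrative sketch of run_env_eval's bookkeeping; the real loop calls
# vllm_generation.generate_text(BatchedDataDict({"prompts": prompts})) and
# ray.get(env.step.remote(...)) on a Ray cluster.
message_logs = [
    [{"role": "user", "content": "What is 1 + 1?"}],
    [{"role": "user", "content": "Compute 2 * 3."}],
]

# Prompt construction mirrors run_env_eval: join every message's content.
prompts = ["\n".join(m["content"] for m in log) for log in message_logs]
print(prompts)  # ['What is 1 + 1?', 'Compute 2 * 3.']

# Hypothetical per-sample rewards (binary correctness).
rewards = [1.0, 0.0]

score, count = sum(rewards), len(rewards)
print(f"score={score / count:.2f} ({score}/{count})")  # score=0.50 (1.0/2)
```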
diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index cb60b7fe8c..2395b65e34 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -200,6 +200,7 @@ def generate( Args: data: BatchedDataDict containing input_ids and input_lengths tensors + greedy: Whether to use greedy decoding instead of sampling Returns: BatchedDataDict conforming to GenerationOutputSpec: @@ -246,7 +247,7 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 sampling_params = self.SamplingParams( - temperature=self.cfg["temperature"], + temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], top_k=top_k if not greedy @@ -330,6 +331,37 @@ def generate( return return_data + def generate_text( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate text responses using vLLM generation. + + Args: + data: BatchedDataDict containing prompts with text strings + greedy: Whether to use greedy decoding instead of sampling + + Returns: + BatchedDataDict containing: + - texts: List of generated text responses + """ + # Read generation parameters from config + top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( + temperature=self.cfg["temperature"] if not greedy else 0, + top_p=self.cfg["top_p"], + top_k=top_k if not greedy else 1, + max_tokens=self.cfg["max_new_tokens"], + stop=self.cfg.get("stop_sequences", None), + ) + + # Generate outputs + outputs = self.llm.generate(data["prompts"], sampling_params) + texts = [output.outputs[0].text for output in outputs] + + # Convert to BatchedDataDict + return_data = BatchedDataDict({"texts": texts}) + return return_data + def shutdown(self): """Clean up vLLM resources.""" try: @@ -537,6 +569,42 @@ def generate( return combined + def generate_text( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate text responses using vLLM.""" + assert isinstance(data, BatchedDataDict), ( + f"data must be a BatchedDataDict, got type: {type(data)}" + ) + + # Get total batch size + batch_size = len(data["prompts"]) + + # Shard the data across the tied worker groups + sharded_data = data.shard_by_batch_size(self.dp_size, batch_size=batch_size) + future_bundle = self.worker_group.run_all_workers_multiple_data( + "generate_text", + sharded_data, + common_kwargs={"greedy": greedy}, + respect_tied_workers=True, + ) + + # Get results from the workers, respecting tied worker groups (only one result per tied worker group) + results = self.worker_group.get_all_worker_results(future_bundle) + + # Combine results from all tied worker groups + combined = BatchedDataDict.from_batches(results) + + # Verify the output has all required fields + required_keys = ["texts"] + missing_keys = [key for key in required_keys if key not in combined] + if missing_keys: + raise ValueError( + f"Missing required keys for GenerationOutputSpec: {missing_keys}" + ) + + return combined + def prepare_for_generation(self, *args, **kwargs): """Abstract method that must be implemented by subclasses.""" try: diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index af8b5698e3..8c810e31dc 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ 
b/tests/unit/models/generation/test_vllm_generation.py @@ -40,10 +40,14 @@ } -def configure_vllm_with_tokenizer(vllm_config, tokenizer): +def configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=False): """Apply tokenizer-specific configurations to vLLM config.""" - vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True - vllm_config["vllm_cfg"]["load_format"] = "dummy" + if is_eval: + vllm_config["vllm_cfg"]["skip_tokenizer_init"] = False + vllm_config["vllm_cfg"]["load_format"] = "auto" + else: + vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True + vllm_config["vllm_cfg"]["load_format"] = "dummy" vllm_config["pad_token"] = tokenizer.pad_token_id vllm_config["stop_token_ids"] = [tokenizer.eos_token_id] return vllm_config @@ -532,3 +536,38 @@ def test_vllm_policy_weight_update(cluster, tokenizer, tensor_parallel_size): # Clean up vllm_policy.shutdown() + + +def test_vllm_generate_text(cluster, tokenizer): + """Test that vLLM can generate text.""" + # Prepare test data + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + test_prompts = BatchedDataDict({"prompts": test_prompts}) + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=True) + + # Ensure we can get same output + assert vllm_config["model_name"] == "meta-llama/Llama-3.2-1B", ( + "Model name should be meta-llama/Llama-3.2-1B to get expected output" + ) + assert vllm_config["vllm_cfg"]["tensor_parallel_size"] == 1, ( + "Tensor parallel size should be 1 to get expected output" + ) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + output = vllm_generation.generate_text(test_prompts, greedy=True) + assert output["texts"] == [ + " Kelsey and I am a 2018 graduate", + " Paris. The city is located in the north of", + ], "Output should be the same as the expected output" + + # Clean up + vllm_generation.shutdown()
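Taken together, the new `generate_text` path is the piece evaluation relies on: it accepts a `BatchedDataDict` with raw prompt strings under `"prompts"` and returns the generated strings under `"texts"`, with `greedy=True` forcing temperature 0 and top_k 1. A minimal usage sketch, assuming a Ray cluster and an eval-style vLLM config (`load_format="auto"`, `skip_tokenizer_init=False`, pad/stop tokens set) have already been prepared as in the test above:

```python
from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict
from nemo_reinforcer.models.generation.vllm import VllmGeneration

# `cluster` and `vllm_config` are assumed to exist and to be configured for
# evaluation, as in examples/run_eval.py and the test above.
vllm_generation = VllmGeneration(cluster, vllm_config)

inputs = BatchedDataDict({"prompts": ["The capital of France is"]})
outputs = vllm_generation.generate_text(inputs, greedy=True)

for prompt, text in zip(inputs["prompts"], outputs["texts"]):
    print(f"{prompt!r} -> {text!r}")

vllm_generation.shutdown()
```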