From f06c51b2f9a4c823c6ef5a080e1bee06514b3ed6 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 25 Sep 2024 11:27:32 -0700 Subject: [PATCH 01/21] Add evaluate method and other minor fixes Signed-off-by: Abhishree --- nemo/collections/llm/__init__.py | 7 ++ nemo/collections/llm/api.py | 117 +++++++++++++++++++++++--- nemo/deploy/service/rest_model_api.py | 7 +- scripts/deploy/nlp/deploy_triton.py | 3 +- 4 files changed, 119 insertions(+), 15 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2051f844d888..a67d24f5bbc8 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -231,3 +231,10 @@ __all__.append("deploy") except ImportError as error: logging.warning(f"The deploy module could not be imported: {error}") + +try: + from nemo.collections.llm.api import evaluate + + __all__.append("evaluate") +except ImportError as error: + logging.warning(f"The evaluate module could not be imported: {error}") diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index fdceff5d959e..7f1bdf94c876 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -333,7 +333,7 @@ def deploy( model_type: str = "llama", triton_model_name: str = "xxx", triton_model_version: Optional[int] = 1, - triton_port: int = 8080, + triton_port: int = 8000, triton_http_address: str = "0.0.0.0", triton_request_timeout: int = 60, triton_model_repository: Path = None, @@ -348,6 +348,7 @@ def deploy( rest_service_http_address: str = "0.0.0.0", rest_service_port: int = 8000, openai_format_response: bool = False, + ckpt_type: str = "nemo" ): from nemo.deploy import DeployPyTriton @@ -358,18 +359,28 @@ def deploy( # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) - triton_deployable = get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, - ) + #TODO: directly support deploy of trtllm engine wo exporting to TRTLLM + if ckpt_type == "trtllm": + triton_deployable = get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + ) + elif ckpt_type == "nemo": + if nemo_checkpoint is None: + raise ValueError("In-Framework deployment requires a .nemo checkpoint") + try: + from nemo.deploy.nlp import MegatronLLMDeployable + except Exception as e: + raise ValueError("MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}") + triton_deployable = MegatronLLMDeployable(nemo_checkpoint, num_gpus) try: nm = DeployPyTriton( @@ -383,6 +394,7 @@ def deploy( logging.info("Triton deploy function will be called.") nm.deploy() + nm.run() except Exception as error: logging.error("Error message has occurred during deploy function. 
Error message: " + str(error)) return @@ -415,6 +427,85 @@ def deploy( logging.info("Model serving will be stopped.") nm.stop() +def evaluate( + url: str = "http://0.0.0.0:1234/v1", + model_name: str = "xxxx", + eval_task: str = "gsm8k", + num_fewshot: Optional[int] = None, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + ): + + from lm_eval import tasks, evaluator + from lm_eval.api.model import LM + import requests + class CustomModel(LM): + def __init__(self, model_name, api_url): + self.model_name = model_name + self.api_url = api_url + + super().__init__() + + def loglikelihood(self, requests): + # Implement log likelihood calculation logic here + pass + + def loglikelihood_rolling(self, requests): + # Implement log likelihood calculation logic here + pass + + def generate_until(self, inputs): + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance + prompt = instance.arguments[0] # This should be the prompt string + + # Extract other parameters from the 'arguments' or 'doc' as needed + max_tokens = 50 # Set a default or extract from instance if available + #temperature = instance.arguments[1].get('temperature', 1.0) + # top_p = instance.arguments[1].get('top_p', 1.0) + # top_k = instance.arguments[1].get('top_k', 0) + temperature = 1.0 + top_p = 0 + top_k = 1.0 + + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p, + "n": top_k + } + + response = requests.post(f"{self.api_url}/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") + + # Assuming the response is in OpenAI format + generated_text = response_data['choices'][0]['text'] + results.append(generated_text) + + return results + model = CustomModel(model_name, url) + #task = tasks.get_task_dict(eval_task) + # Run evaluation + # results = evaluator.evaluate( + # lm=model, + # limit=1, + # task_dict=task + # ) + results = evaluator.simple_evaluate( + model=model, + tasks=eval_task, + limit=limit, + num_fewshot=num_fewshot, + bootstrap_iters=bootstrap_iters + ) + + print("--score---",results['results']['gsm8k']) @run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index fbc774883faa..6342da7e185a 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -76,13 +76,18 @@ class CompletionRequest(BaseModel): frequency_penalty: float = 1.0 +@app.get("/hello") +def root(): + return {"message": "Hello World"} + @app.get("/triton_health") async def check_triton_health(): """ This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. Verify by running: curl http://service_http_address:service_port/triton_health and the returned status should inform if the server is accessible. 
""" - triton_url = f"triton_settings.triton_service_ip:str(triton_settings.triton_service_port)/v2/health/ready" + triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + print(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) if response.status_code == 200: diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index e3394726fa1c..154ffc90dc9c 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -419,13 +419,14 @@ def nemo_deploy(argv): LOGGER.info("Triton deploy function will be called.") nm.deploy() + nm.run() except Exception as error: LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) return try: LOGGER.info("Model serving on Triton is will be started.") - if args.start_rest_service == "True": + if args.start_rest_service: try: LOGGER.info("REST service will be started.") uvicorn.run( From 25302b4814ccd9c0541647792e85a3a1758652e1 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 25 Sep 2024 15:29:33 -0700 Subject: [PATCH 02/21] Add inference params to evaluate method Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 38 ++++++++++++--------------- nemo/deploy/service/rest_model_api.py | 4 +-- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 7f1bdf94c876..65e4f07828ef 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -434,16 +434,24 @@ def evaluate( num_fewshot: Optional[int] = None, limit: Optional[Union[int, float]] = None, bootstrap_iters: int = 100000, + # inference params + max_tokens_to_generate: Optional[int] = 256, + temperature: Optional[float] = None, + top_p: Optional[float] = 0.0, + top_k: Optional[int] = 1, ): from lm_eval import tasks, evaluator from lm_eval.api.model import LM import requests class CustomModel(LM): - def __init__(self, model_name, api_url): + def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url - + self.max_tokens_to_generate = max_tokens_to_generate + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k super().__init__() def loglikelihood(self, requests): @@ -460,22 +468,16 @@ def generate_until(self, inputs): # Access the 'arguments' attribute of the Instance prompt = instance.arguments[0] # This should be the prompt string - # Extract other parameters from the 'arguments' or 'doc' as needed - max_tokens = 50 # Set a default or extract from instance if available - #temperature = instance.arguments[1].get('temperature', 1.0) - # top_p = instance.arguments[1].get('top_p', 1.0) - # top_k = instance.arguments[1].get('top_k', 0) - temperature = 1.0 - top_p = 0 - top_k = 1.0 + # Extract default temperature from instance of the benchmark or use the uder defined value + temperature = instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature payload = { "model": self.model_name, "prompt": prompt, - "max_tokens": max_tokens, + "max_tokens": self.max_tokens_to_generate, "temperature": temperature, - "top_p": top_p, - "n": top_k + "top_p": self.top_p, + "top_k": self.top_k } response = requests.post(f"{self.api_url}/completions/", json=payload) @@ -489,14 +491,8 @@ def generate_until(self, inputs): results.append(generated_text) return results - model = 
CustomModel(model_name, url) - #task = tasks.get_task_dict(eval_task) - # Run evaluation - # results = evaluator.evaluate( - # lm=model, - # limit=1, - # task_dict=task - # ) + model = CustomModel(model_name, url, temperature, top_p, top_k) + results = evaluator.simple_evaluate( model=model, tasks=eval_task, diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 6342da7e185a..f7b470c00b34 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -70,7 +70,7 @@ class CompletionRequest(BaseModel): max_tokens: int = 512 temperature: float = 1.0 top_p: float = 0.0 - n: int = 1 + top_k: int = 1 stream: bool = False stop: str | None = None frequency_penalty: float = 1.0 @@ -106,7 +106,7 @@ def completions_v1(request: CompletionRequest): output = nq.query_llm( prompts=[request.prompt], max_output_len=request.max_tokens, - top_k=request.n, + top_k=request.top_k, top_p=request.top_p, temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, From 02e9edb7d5d713fd78da853690249a51b995c615 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 30 Sep 2024 13:54:16 -0700 Subject: [PATCH 03/21] Add wait_for_rest_service fn to evaluate method Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 47 +++++++- nemo/deploy/service/rest_model_api.py | 12 +- scripts/export/convert_nemo2_for_export.py | 123 --------------------- 3 files changed, 51 insertions(+), 131 deletions(-) delete mode 100644 scripts/export/convert_nemo2_for_export.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 65e4f07828ef..670b13bdc62e 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -443,7 +443,51 @@ def evaluate( from lm_eval import tasks, evaluator from lm_eval.api.model import LM + import time import requests + from requests.exceptions import RequestException + + def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2): + """ + Wait for REST service to be ready. + + Args: + rest_url (str): URL of the REST service's health endpoint + max_retries (int): Maximum number of retry attempts + retry_interval (int): Time to wait between retries in seconds + + Returns: + bool: True if rest service is ready, False otherwise + """ + for _ in range(max_retries): + rest_ready = check_service(rest_url) + + if rest_ready: + print("REST service is ready.") + return True + + print(f"REST Service not ready yet. Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + + print("Timeout: One or both services did not become ready.") + return False + + def check_service(url): + """ + Check if a service is ready by making a GET request to its health endpoint. 
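+        A 200 status code is treated as ready; a non-200 response or any connection error
+        (requests.exceptions.RequestException) is treated as not ready.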
+ + Args: + url (str): URL of the service's health endpoint + + Returns: + bool: True if the service is ready, False otherwise + """ + try: + response = requests.get(url, timeout=5) + return response.status_code == 200 + except RequestException: + return False + class CustomModel(LM): def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name @@ -491,8 +535,9 @@ def generate_until(self, inputs): results.append(generated_text) return results - model = CustomModel(model_name, url, temperature, top_p, top_k) + wait_for_rest_service(rest_url=f"{url}/health") + model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k) results = evaluator.simple_evaluate( model=model, tasks=eval_task, diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index f7b470c00b34..9b330088487f 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -20,7 +20,6 @@ from nemo.deploy.nlp import NemoQueryLLM - class TritonSettings(BaseSettings): _triton_service_port: int _triton_service_ip: str @@ -63,7 +62,6 @@ def openai_format_response(self): app = FastAPI() triton_settings = TritonSettings() - class CompletionRequest(BaseModel): model: str prompt: str @@ -76,15 +74,15 @@ class CompletionRequest(BaseModel): frequency_penalty: float = 1.0 -@app.get("/hello") -def root(): - return {"message": "Hello World"} +@app.get("/v1/health") +def health_check(): + return {"status": "ok"} -@app.get("/triton_health") +@app.get("/v1/triton_health") async def check_triton_health(): """ This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. - Verify by running: curl http://service_http_address:service_port/triton_health and the returned status should inform if the server is accessible. + Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible. """ triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" print(f"Attempting to connect to Triton server at: {triton_url}") diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py deleted file mode 100644 index 0703322cd854..000000000000 --- a/scripts/export/convert_nemo2_for_export.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert a NeMo 2.0 checkpoint to NeMo 1.0 for TRTLLM export. 
-Example to run this conversion script: -``` - python /opt/NeMo/scripts/scripts/export/convert_nemo2_for_export.py \ - --input_path /path/to/nemo2/ckpt \ - --output_path /path/to/output \ - --tokenizer_type huggingface \ - --tokenizer_name meta-llama/Meta-Llama-3.1-8B \ - --symbolic_link=True -``` -""" - -import os -import shutil -from argparse import ArgumentParser - -from omegaconf import OmegaConf - -from nemo.lightning import io - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--input_path", - type=str, - required=True, - help="Path to nemo 2.0 checkpoint", - ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Output path", - ) - parser.add_argument( - "--tokenizer_type", - type=str, - default="huggingface", - help="Type of tokenizer", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default="meta-llama/Meta-Llama-3.1-8B", - help="Name or path of tokenizer", - ) - parser.add_argument( - "--symbolic_link", - type=bool, - default=True, - help="Whether to use symbiloc link for model weights", - ) - - args = parser.parse_args() - return args - - -def main(args): - input_path = args.input_path - output_path = args.output_path - weight_path = os.path.join(output_path, "model_weights") - - if os.path.exists(output_path): - shutil.rmtree(output_path) - print(f"Remove existing {output_path}") - - os.makedirs(output_path, exist_ok=True) - - config = io.load_context(input_path, subpath="model.config") - - config_dict = {} - for k, v in config.__dict__.items(): - if isinstance(v, (float, int, str, bool)): - config_dict[k] = v - elif k == "activation_func": - config_dict["activation"] = v.__name__ - - if config_dict.get("num_moe_experts") is None: - config_dict["num_moe_experts"] = 0 - config_dict["moe_router_topk"] = 0 - if config_dict["activation"] == "silu": - config_dict["activation"] = "fast-swiglu" - - config_dict["mcore_gpt"] = True - config_dict["max_position_embeddings"] = config_dict.get("seq_length") - config_dict["tokenizer"] = { - "library": args.tokenizer_type, - "type": args.tokenizer_name, - "use_fast": True, - } - - yaml_config = OmegaConf.create(config_dict) - OmegaConf.save(config=yaml_config, f=os.path.join(output_path, "model_config.yaml")) - - if args.symbolic_link: - os.symlink(input_path, weight_path) - else: - os.makedirs(weight_path, exist_ok=True) - for file in os.listdir(input_path): - source_path = os.path.join(input_path, file) - target_path = os.path.join(weight_path, file) - shutil.copy(source_path, target_path) - - -if __name__ == "__main__": - args = get_args() - main(args) From 3ee94addb68cfb5907438ac6c7101cdb8ef6b76e Mon Sep 17 00:00:00 2001 From: athitten Date: Mon, 30 Sep 2024 20:56:08 +0000 Subject: [PATCH 04/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/collections/llm/api.py | 35 +++++++++++++++------------ nemo/deploy/service/rest_model_api.py | 7 +++++- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 670b13bdc62e..a9d3cfb8d2b6 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -348,7 +348,7 @@ def deploy( rest_service_http_address: str = "0.0.0.0", rest_service_port: int = 8000, openai_format_response: bool = False, - ckpt_type: str = "nemo" + ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton @@ -359,7 +359,7 @@ def deploy( # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py 
store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) - #TODO: directly support deploy of trtllm engine wo exporting to TRTLLM + # TODO: directly support deploy of trtllm engine wo exporting to TRTLLM if ckpt_type == "trtllm": triton_deployable = get_trtllm_deployable( nemo_checkpoint, @@ -379,7 +379,9 @@ def deploy( try: from nemo.deploy.nlp import MegatronLLMDeployable except Exception as e: - raise ValueError("MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}") + raise ValueError( + "MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}" + ) triton_deployable = MegatronLLMDeployable(nemo_checkpoint, num_gpus) try: @@ -427,8 +429,9 @@ def deploy( logging.info("Model serving will be stopped.") nm.stop() + def evaluate( - url: str = "http://0.0.0.0:1234/v1", + url: str = "http://0.0.0.0:1234/v1", model_name: str = "xxxx", eval_task: str = "gsm8k", num_fewshot: Optional[int] = None, @@ -439,12 +442,13 @@ def evaluate( temperature: Optional[float] = None, top_p: Optional[float] = 0.0, top_k: Optional[int] = 1, - ): +): - from lm_eval import tasks, evaluator - from lm_eval.api.model import LM import time + import requests + from lm_eval import evaluator, tasks + from lm_eval.api.model import LM from requests.exceptions import RequestException def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2): @@ -513,7 +517,9 @@ def generate_until(self, inputs): prompt = instance.arguments[0] # This should be the prompt string # Extract default temperature from instance of the benchmark or use the uder defined value - temperature = instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + temperature = ( + instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + ) payload = { "model": self.model_name, @@ -521,7 +527,7 @@ def generate_until(self, inputs): "max_tokens": self.max_tokens_to_generate, "temperature": temperature, "top_p": self.top_p, - "top_k": self.top_k + "top_k": self.top_k, } response = requests.post(f"{self.api_url}/completions/", json=payload) @@ -539,14 +545,11 @@ def generate_until(self, inputs): wait_for_rest_service(rest_url=f"{url}/health") model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k) results = evaluator.simple_evaluate( - model=model, - tasks=eval_task, - limit=limit, - num_fewshot=num_fewshot, - bootstrap_iters=bootstrap_iters - ) + model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters + ) + + print("--score---", results['results']['gsm8k']) - print("--score---",results['results']['gsm8k']) @run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 9b330088487f..a3b0594fb020 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -20,6 +20,7 @@ from nemo.deploy.nlp import NemoQueryLLM + class TritonSettings(BaseSettings): _triton_service_port: int _triton_service_ip: str @@ -62,6 +63,7 @@ def openai_format_response(self): app = FastAPI() triton_settings = TritonSettings() + class CompletionRequest(BaseModel): model: str prompt: str @@ -78,13 +80,16 @@ class CompletionRequest(BaseModel): def health_check(): return {"status": "ok"} + @app.get("/v1/triton_health") async def check_triton_health(): """ This method 
exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible. """ - triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + triton_url = ( + f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + ) print(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) From 7d79edc0bb66357369bba379cb80c4c00a834bd7 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 7 Oct 2024 22:23:28 -0700 Subject: [PATCH 05/21] Add logprobs to be returned by Pytriton for trtllm models Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 133 ++++++++++++++++++++---- nemo/deploy/nlp/query_llm.py | 4 +- nemo/deploy/service/rest_model_api.py | 1 + nemo/export/tensorrt_llm.py | 13 ++- nemo/export/trt_llm/tensorrt_llm_run.py | 2 + 5 files changed, 129 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index a9d3cfb8d2b6..0d353ecba3aa 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -447,7 +447,9 @@ def evaluate( import time import requests - from lm_eval import evaluator, tasks + from lm_eval import evaluator + ## This may change, how to deal with it ? In the past Instance class was in lm_eval.base + from lm_eval.api.instance import Instance from lm_eval.api.model import LM from requests.exceptions import RequestException @@ -493,6 +495,9 @@ def check_service(url): return False class CustomModel(LM): + """ + Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md + """ def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url @@ -502,42 +507,130 @@ def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top self.top_k = top_k super().__init__() - def loglikelihood(self, requests): - # Implement log likelihood calculation logic here - pass + def _generate_tokens_logprobs(self, payload, + return_text: bool = False, + return_logprobs: bool = False): + response = requests.post(f"{self.api_url}/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") - def loglikelihood_rolling(self, requests): - # Implement log likelihood calculation logic here - pass + # Assuming the response is in OpenAI format + if return_text: + return response_data['choices'][0]['text'] - def generate_until(self, inputs): + if return_logprobs: + return response_data['choices'][0]['log_probs'] + + def loglikelihood(self, requests: list[Instance]): + # log likelihood calculation logic here results = [] - for instance in inputs: + for request in requests: + context = request.arguments[0] + continuation = request.arguments[1] + full_text = context + continuation + instance = Instance( + request_type="loglikelihood", + #doc={'text': full_text}, + doc=request.doc, + arguments=(full_text,), + idx=0, + ) # Access the 'arguments' attribute of the Instance prompt = instance.arguments[0] # This should be the prompt string - # Extract default temperature from instance of the benchmark or use the uder defined value - temperature = ( - 
instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # Extract default temperature from instance of the benchmark or use the user defined value + # Does not work for MMLU since the input instance does not contain temp key + # temperature = ( + # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # ) + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + #"compute_logprob": True ##TODO Do we want to have this as an + # user defined value or set it to True by default ? + } + + log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) + + # Assuming log_probs is a list of log probabilities for each token + # TODO : why is log_prbs returned as list of list ? Change it to just a list maybe in query_llm ? + continuation_log_prob = sum(log_probs[0][0][-len(continuation):]) + results.append((continuation_log_prob, False)) + + return results + + def loglikelihood_rolling(self, requests: list[Instance]): + # log likelihood rolling calculation logic here + results = [] + for request in requests: + context = request.arguments[0] + continuation = request.arguments[1] + full_text = context + continuation + instance = Instance( + request_type="loglikelihood_rolling", + #doc={'text': full_text}, + doc=request.doc, + arguments=(full_text,), + idx=0, ) + # Access the 'arguments' attribute of the Instance + prompt = instance.arguments[0] # This should be the prompt string + # Extract default temperature from instance of the benchmark or use the user defined value + # Does not work for MMLU since the input instance does not contain temp key + # temperature = ( + # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # ) payload = { "model": self.model_name, "prompt": prompt, "max_tokens": self.max_tokens_to_generate, - "temperature": temperature, + "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, + #"compute_logprob": True ##TODO Do we want to have this as an + # user defined value or set it to True by default ? 
} - response = requests.post(f"{self.api_url}/completions/", json=payload) - response_data = response.json() + log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) + + # Assuming log_probs is a list of log probabilities for each token + continuation_log_probs = log_probs[0][0][-len(continuation):] + results.append((continuation_log_probs, False)) + + return results + + def generate_until(self, inputs: list[Instance]): + # `Instance` is a dataclass defined in [`lm_eval.api.instance`] https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/lm_eval/api/instance.py + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance + prompt = instance.arguments[0] # This should be the prompt string + + # Extract default temperature from instance of the benchmark or use the user defined value + # Does not work for MMLU since the input instance does not contain temp key + # temperature = ( + # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # ) + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + #"compute_logprob": True ##TODO Do we want to have this as an + # user defined value or set it to True by default ? + } - if 'error' in response_data: - raise Exception(f"API Error: {response_data['error']}") + generated_text = self._generate_tokens_logprobs(payload, return_text=True) - # Assuming the response is in OpenAI format - generated_text = response_data['choices'][0]['text'] results.append(generated_text) return results @@ -548,7 +641,7 @@ def generate_until(self, inputs): model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) - print("--score---", results['results']['gsm8k']) + print("--results---", results['results'][eval_task]) @run.cli.entrypoint(name="import", namespace="llm") diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 7e873db6b5b1..1f01e228bb8d 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -267,7 +267,9 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - "choices": [{"text": str(sentences)}], + #TODO if compute_logprobs is True then add log_probs + ## Convert log_probs to a list to make it json serializable + "choices": [{"text": str(sentences), "log_probs":result_dict["log_probs"].tolist()}] } return openai_response else: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index a3b0594fb020..fdd68ecb1000 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -114,6 +114,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, + compute_logprob=True ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 08b0b822cad4..5b298696e07d 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -862,12 +862,13 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), + 
Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True) ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),) + outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single)) return outputs @batch @@ -898,14 +899,20 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "lora_uids" in inputs: lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8") infer_input["lora_uids"] = lora_uids[0].tolist() + if "compute_logprob" in inputs: + infer_input["output_log_probs"] = inputs.pop("compute_logprob")[0][0] - output_texts = self.forward(**infer_input) + if infer_input["output_log_probs"]: + output_texts, log_probs = self.forward(**infer_input) + log_probs = np.array(log_probs.cpu().numpy()) + else: + output_texts = self.forward(**infer_input) output = cast_output(output_texts, np.bytes_) except Exception as error: err_msg = "An error occurred: {0}".format(str(error)) output = cast_output([err_msg], np.bytes_) - return {"outputs": output} + return {"outputs": output, "log_probs": log_probs} @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index bd7b8abd5f9e..110b7c0f1558 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -279,6 +279,7 @@ def _forward( streaming=streaming, output_sequence_lengths=True, return_dict=True, + output_log_probs=sampling_kwargs.get('output_log_probs', False), ) torch.cuda.synchronize() @@ -699,6 +700,7 @@ def generate( output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] input_lengths = [t.shape[0] for t in input_tensors] + log_probs = outputs['log_probs'] output_lines_list = [ tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]]) From 8563cc3545c9b263dd1aaa01a458da64afcf534b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 14 Oct 2024 16:45:27 -0700 Subject: [PATCH 06/21] Increase max_retries in wait_for_rest_service method Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 0d353ecba3aa..5f3759e79638 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -351,7 +351,6 @@ def deploy( ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton - if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -453,7 +452,7 @@ def evaluate( from lm_eval.api.model import LM from requests.exceptions import RequestException - def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2): + def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): """ Wait for REST service to be ready. 
From a35d8d72f9ee7c280cbd06dd36bc8785dfd519b1 Mon Sep 17 00:00:00 2001 From: athitten Date: Tue, 15 Oct 2024 00:14:29 +0000 Subject: [PATCH 07/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/collections/llm/api.py | 21 +++++++++++---------- nemo/deploy/nlp/query_llm.py | 4 ++-- nemo/deploy/service/rest_model_api.py | 2 +- nemo/export/tensorrt_llm.py | 7 +++++-- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 5f3759e79638..6ddc4c9db403 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -351,6 +351,7 @@ def deploy( ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton + if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -447,6 +448,7 @@ def evaluate( import requests from lm_eval import evaluator + ## This may change, how to deal with it ? In the past Instance class was in lm_eval.base from lm_eval.api.instance import Instance from lm_eval.api.model import LM @@ -497,6 +499,7 @@ class CustomModel(LM): """ Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ + def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url @@ -506,9 +509,7 @@ def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top self.top_k = top_k super().__init__() - def _generate_tokens_logprobs(self, payload, - return_text: bool = False, - return_logprobs: bool = False): + def _generate_tokens_logprobs(self, payload, return_text: bool = False, return_logprobs: bool = False): response = requests.post(f"{self.api_url}/completions/", json=payload) response_data = response.json() @@ -531,7 +532,7 @@ def loglikelihood(self, requests: list[Instance]): full_text = context + continuation instance = Instance( request_type="loglikelihood", - #doc={'text': full_text}, + # doc={'text': full_text}, doc=request.doc, arguments=(full_text,), idx=0, @@ -551,7 +552,7 @@ def loglikelihood(self, requests: list[Instance]): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, - #"compute_logprob": True ##TODO Do we want to have this as an + # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? } @@ -559,7 +560,7 @@ def loglikelihood(self, requests: list[Instance]): # Assuming log_probs is a list of log probabilities for each token # TODO : why is log_prbs returned as list of list ? Change it to just a list maybe in query_llm ? - continuation_log_prob = sum(log_probs[0][0][-len(continuation):]) + continuation_log_prob = sum(log_probs[0][0][-len(continuation) :]) results.append((continuation_log_prob, False)) return results @@ -573,7 +574,7 @@ def loglikelihood_rolling(self, requests: list[Instance]): full_text = context + continuation instance = Instance( request_type="loglikelihood_rolling", - #doc={'text': full_text}, + # doc={'text': full_text}, doc=request.doc, arguments=(full_text,), idx=0, @@ -593,14 +594,14 @@ def loglikelihood_rolling(self, requests: list[Instance]): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, - #"compute_logprob": True ##TODO Do we want to have this as an + # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? 
} log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) # Assuming log_probs is a list of log probabilities for each token - continuation_log_probs = log_probs[0][0][-len(continuation):] + continuation_log_probs = log_probs[0][0][-len(continuation) :] results.append((continuation_log_probs, False)) return results @@ -624,7 +625,7 @@ def generate_until(self, inputs: list[Instance]): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, - #"compute_logprob": True ##TODO Do we want to have this as an + # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? } diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 1f01e228bb8d..a96c0e3a25eb 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -267,9 +267,9 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - #TODO if compute_logprobs is True then add log_probs + # TODO if compute_logprobs is True then add log_probs ## Convert log_probs to a list to make it json serializable - "choices": [{"text": str(sentences), "log_probs":result_dict["log_probs"].tolist()}] + "choices": [{"text": str(sentences), "log_probs": result_dict["log_probs"].tolist()}], } return openai_response else: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index fdd68ecb1000..119220da8ace 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -114,7 +114,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - compute_logprob=True + compute_logprob=True, ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 5b298696e07d..21f3cc599ad6 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -862,13 +862,16 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), - Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True) + Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single)) + outputs = ( + Tensor(name="outputs", shape=(-1,), dtype=bytes), + Tensor(name="log_probs", shape=(-1,), dtype=np.single), + ) return outputs @batch From 25fced220873bed911d0f01d1b03767fce3b87a1 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 1 Nov 2024 21:20:04 -0700 Subject: [PATCH 08/21] Add unset slurm vars and use env vars for Triton args Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 26 +++++++++++++++++-- nemo/deploy/service/rest_model_api.py | 10 +++---- nemo/lightning/pytorch/callbacks/debugging.py | 2 +- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 6ddc4c9db403..3f23ceee22e6 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -326,6 +326,24 @@ def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, with 
open("nemo/deploy/service/config.json", "w") as f: json.dump(args_dict, f) +def unset_environment_variables(): + import subprocess + print("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars_with_prefix(prefix) + + print("Variables unset successfully") @run.cli.entrypoint(namespace="llm") def deploy( @@ -351,13 +369,17 @@ def deploy( ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton - + unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") return # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py - store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + #store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address + os.environ['TRITON_PORT'] = str(triton_port) + os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout) + os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) # TODO: directly support deploy of trtllm engine wo exporting to TRTLLM if ckpt_type == "trtllm": diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 119220da8ace..0c11a04a876c 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -29,12 +29,10 @@ class TritonSettings(BaseSettings): def __init__(self): super(TritonSettings, self).__init__() try: - with open(os.path.join(Path.cwd(), 'nemo/deploy/service/config.json')) as config: - config_json = json.load(config) - self._triton_service_port = config_json["triton_service_port"] - self._triton_service_ip = config_json["triton_service_ip"] - self._triton_request_timeout = config_json["triton_request_timeout"] - self._openai_format_response = config_json["openai_format_response"] + self._triton_service_port = int(os.environ.get('TRITON_PORT', 8080)) + self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') + self._triton_request_timeout = int(os.environ.get('TRITON_REQUEST_TIMEOUT', 60)) + self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' except Exception as error: print("An exception occurred:", error) return diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index 5f6e722ef89b..b4f80ac89608 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -116,7 +116,7 @@ def _apply_user_funcs(self, trainer: pl.Trainer, pl_module: pl.LightningModule) Iterate over model parameters, find gradient tensor, apply and collect outputs of param_fn and grad_fn, and log outputs in a table. 
""" - + from prettytable import PrettyTable def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: """If using MCore optimizer, search the grad buckets for param's grad tensor.""" if not isinstance(pl_module.optim, MegatronOptimizerModule): From 41eb551b3a116bf4a7f8b44edb2fd24a14e92fec Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 1 Nov 2024 16:30:10 -0700 Subject: [PATCH 09/21] Add logic to get logProbs from logits Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 72 ++++++++++++----------- nemo/deploy/nlp/query_llm.py | 5 +- nemo/deploy/service/rest_model_api.py | 5 +- nemo/export/tensorrt_llm.py | 15 +++-- nemo/export/trt_llm/tensorrt_llm_build.py | 2 +- nemo/export/trt_llm/tensorrt_llm_run.py | 5 +- 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 3f23ceee22e6..3a3f1e61dae9 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -453,6 +453,7 @@ def deploy( def evaluate( + nemo_checkpoint_path: Path, url: str = "http://0.0.0.0:1234/v1", model_name: str = "xxxx", eval_task: str = "gsm8k", @@ -521,10 +522,10 @@ class CustomModel(LM): """ Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ - - def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url + self.tokenizer = tokenizer self.max_tokens_to_generate = max_tokens_to_generate self.temperature = temperature self.top_p = top_p @@ -543,33 +544,30 @@ def _generate_tokens_logprobs(self, payload, return_text: bool = False, return_l return response_data['choices'][0]['text'] if return_logprobs: - return response_data['choices'][0]['log_probs'] + # generation_logits is needed only for loglikelihood tasks + return response_data['choices'][0]['log_probs'], response_data['choices'][0]['generation_logits'] + def loglikelihood(self, requests: list[Instance]): - # log likelihood calculation logic here + import numpy as np + import torch + import torch.nn.functional as F + + special_tokens_kwargs = {'add_special_tokens': False} ## Hardcode for now. TODO Infer add_bos from input. 
results = [] for request in requests: context = request.arguments[0] continuation = request.arguments[1] - full_text = context + continuation - instance = Instance( - request_type="loglikelihood", - # doc={'text': full_text}, - doc=request.doc, - arguments=(full_text,), - idx=0, - ) - # Access the 'arguments' attribute of the Instance - prompt = instance.arguments[0] # This should be the prompt string + context_enc = self.tokenizer.tokenizer.encode(context) #, **special_tokens_kwargs) #errors for SentencePeicetokenizer + continuation_enc = self.tokenizer.tokenizer.encode(continuation) #, **special_tokens_kwargs) + continuation_enc = continuation_enc[1:] #for SentencePeice since first encoded token is space, comment this for HF tokenizer + num_cont_tokens = len(continuation_enc) + ## Update self.max_tokens_to_generate with number of continuation tokens in the request + self.max_tokens_to_generate = num_cont_tokens - # Extract default temperature from instance of the benchmark or use the user defined value - # Does not work for MMLU since the input instance does not contain temp key - # temperature = ( - # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature - # ) payload = { "model": self.model_name, - "prompt": prompt, + "prompt": context, "max_tokens": self.max_tokens_to_generate, "temperature": self.temperature, "top_p": self.top_p, @@ -577,17 +575,24 @@ def loglikelihood(self, requests: list[Instance]): # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? } - - log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) - - # Assuming log_probs is a list of log probabilities for each token - # TODO : why is log_prbs returned as list of list ? Change it to just a list maybe in query_llm ? 
- continuation_log_prob = sum(log_probs[0][0][-len(continuation) :]) - results.append((continuation_log_prob, False)) + log_probs, generation_logits = self._generate_tokens_logprobs(payload, return_logprobs=True) + # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation + multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) + greedy_tokens = multi_logits.argmax(dim=-1) + max_equal = (greedy_tokens == cont_toks).all() + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) + result = (float(logits.sum()), bool(max_equal)) + + results.append(result) return results def loglikelihood_rolling(self, requests: list[Instance]): + ## Note: loglikelihood_rolling does not have correct implementation yet, + # the tasks we have working so far: gsm8k, mmlu, lambada dont need loglikelihood_rolling # log likelihood rolling calculation logic here results = [] for request in requests: @@ -635,11 +640,6 @@ def generate_until(self, inputs: list[Instance]): # Access the 'arguments' attribute of the Instance prompt = instance.arguments[0] # This should be the prompt string - # Extract default temperature from instance of the benchmark or use the user defined value - # Does not work for MMLU since the input instance does not contain temp key - # temperature = ( - # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature - # ) payload = { "model": self.model_name, "prompt": prompt, @@ -657,13 +657,17 @@ def generate_until(self, inputs: list[Instance]): return results + ## Get tokenizer from nemo 2.0 model, in case of 1.0 please add appropriate code to get + ## tokenizer from 1.0 ckpt and pass it to CustomModel + model = io.load_context(nemo_checkpoint_path, subpath="model") + wait_for_rest_service(rest_url=f"{url}/health") - model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k) + model = CustomModel(model_name, url, model.tokenizer, max_tokens_to_generate, temperature, top_p, top_k) results = evaluator.simple_evaluate( model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) - print("--results---", results['results'][eval_task]) + print("score", results['results'][eval_task]) @run.cli.entrypoint(name="import", namespace="llm") diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index a96c0e3a25eb..6aa9ed79813a 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -269,7 +269,10 @@ def query_llm( "model": self.model_name, # TODO if compute_logprobs is True then add log_probs ## Convert log_probs to a list to make it json serializable - "choices": [{"text": str(sentences), "log_probs": result_dict["log_probs"].tolist()}], + "choices": [{"text": str(sentences), + "log_probs":result_dict["log_probs"].tolist(), + "generation_logits": result_dict["generation_logits"].tolist() + }] } return openai_response else: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 0c11a04a876c..aaf9f3b6c0a0 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -107,12 +107,15 @@ def completions_v1(request: CompletionRequest): output = nq.query_llm( prompts=[request.prompt], max_output_len=request.max_tokens, + # when these below params are passed as None top_k=request.top_k, top_p=request.top_p, temperature=request.temperature, 
init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - compute_logprob=True, + # TODO make these two user configurable ?? + all_probs=True, + compute_logprob=True ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 21f3cc599ad6..76ea171bdc29 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -863,19 +863,19 @@ def get_triton_input(self): Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), + Tensor(name="all_probs", shape=(-1,), dtype=np.bool_, optional=True) ) return inputs @property def get_triton_output(self): - outputs = ( - Tensor(name="outputs", shape=(-1,), dtype=bytes), - Tensor(name="log_probs", shape=(-1,), dtype=np.single), - ) + outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single), + Tensor(name="generation_logits", shape=(-1,), dtype=np.single)) return outputs @batch def triton_infer_fn(self, **inputs: np.ndarray): + log_probs = None try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} if "max_output_len" in inputs: @@ -904,10 +904,13 @@ def triton_infer_fn(self, **inputs: np.ndarray): infer_input["lora_uids"] = lora_uids[0].tolist() if "compute_logprob" in inputs: infer_input["output_log_probs"] = inputs.pop("compute_logprob")[0][0] + if "all_probs" in inputs: + infer_input["all_probs"] = inputs.pop("all_probs")[0][0] if infer_input["output_log_probs"]: - output_texts, log_probs = self.forward(**infer_input) + output_texts, log_probs, generation_logits = self.forward(**infer_input) log_probs = np.array(log_probs.cpu().numpy()) + generation_logits = np.array(generation_logits.cpu().numpy()) else: output_texts = self.forward(**infer_input) output = cast_output(output_texts, np.bytes_) @@ -915,7 +918,7 @@ def triton_infer_fn(self, **inputs: np.ndarray): err_msg = "An error occurred: {0}".format(str(error)) output = cast_output([err_msg], np.bytes_) - return {"outputs": output, "log_probs": log_probs} + return {"outputs": output, "log_probs": log_probs, "generation_logits": generation_logits} @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 4be2d42ebe4d..424e4c3f27d9 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -97,7 +97,7 @@ def build_and_save_engine( 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, 'gather_context_logits': False, - 'gather_generation_logits': False, + 'gather_generation_logits': True, 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 110b7c0f1558..7cbb038c5b0f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -280,6 +280,7 @@ def _forward( output_sequence_lengths=True, return_dict=True, output_log_probs=sampling_kwargs.get('output_log_probs', False), + all_probs=sampling_kwargs.get('all_probs', False), ) torch.cuda.synchronize() @@ -693,6 +694,7 @@ def generate( multiprocessed_env=multiprocessed_env, **sampling_kwargs, ) + assert outputs is not None if 
tensorrt_llm.mpi_rank() != 0: return None @@ -701,6 +703,7 @@ def generate( sequence_lengths = outputs['sequence_lengths'] input_lengths = [t.shape[0] for t in input_tensors] log_probs = outputs['log_probs'] + generation_logits = outputs['generation_logits'] output_lines_list = [ tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]]) @@ -708,7 +711,7 @@ def generate( ] if output_log_probs: - return output_lines_list, log_probs + return output_lines_list, log_probs, generation_logits return output_lines_list From d4ca0e10f051372ab99db4097f1c8169979f9902 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Thu, 7 Nov 2024 20:00:06 -0800 Subject: [PATCH 10/21] Refactor, clean and organize the code 1) Refactors the code and creates an evaluation folder where all util methods live 2) Add doctsrings, comments 3) Expose gather_context_logits, gather_generation_logits in trtllm and add output_generation_logits flag to return generation logits and remove output_logporbs as its not getting used anymore Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 420 +++++------------- nemo/collections/llm/evaluation/__init__.py | 3 + nemo/collections/llm/evaluation/eval_utils.py | 267 +++++++++++ nemo/deploy/nlp/query_llm.py | 15 +- nemo/deploy/service/rest_model_api.py | 17 +- nemo/export/tensorrt_llm.py | 37 +- nemo/export/trt_llm/tensorrt_llm_build.py | 6 +- nemo/export/trt_llm/tensorrt_llm_run.py | 7 +- 8 files changed, 418 insertions(+), 354 deletions(-) create mode 100644 nemo/collections/llm/evaluation/__init__.py create mode 100644 nemo/collections/llm/evaluation/eval_utils.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 3a3f1e61dae9..4ffc93b8d3b9 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -256,100 +256,11 @@ def validate( return app_state.exp_dir -def get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, -): - from nemo.export.tensorrt_llm import TensorRTLLM - - if triton_model_repository is None: - trt_llm_path = "/tmp/trt_llm_model_dir/" - Path(trt_llm_path).mkdir(parents=True, exist_ok=True) - else: - trt_llm_path = triton_model_repository - - if nemo_checkpoint is None and triton_model_repository is None: - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." - ) - - if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." 
- ) - - if nemo_checkpoint is not None and model_type is None: - raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - - trt_llm_exporter = TensorRTLLM( - model_dir=trt_llm_path, - load_model=(nemo_checkpoint is None), - ) - - if nemo_checkpoint is not None: - try: - logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") - trt_llm_exporter.export( - nemo_checkpoint_path=nemo_checkpoint, - model_type=model_type, - n_gpus=num_gpus, - tensor_parallelism_size=tensor_parallelism_size, - pipeline_parallelism_size=pipeline_parallelism_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - dtype=dtype, - ) - except Exception as error: - raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) - - return trt_llm_exporter - - -def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response): - args_dict = { - "triton_service_ip": triton_http_address, - "triton_service_port": triton_port, - "triton_request_timeout": triton_request_timeout, - "openai_format_response": openai_format_response, - } - with open("nemo/deploy/service/config.json", "w") as f: - json.dump(args_dict, f) - -def unset_environment_variables(): - import subprocess - print("Unsetting all SLURM_, PMI_, PMIX_ Variables") - - # Function to unset variables with a specific prefix - def unset_vars_with_prefix(prefix): - cmd = f"env | grep ^{prefix} | cut -d= -f1" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - vars_to_unset = result.stdout.strip().split('\n') - for var in vars_to_unset: - if var: # Check if the variable name is not empty - os.environ.pop(var, None) - - # Unset variables for each prefix - for prefix in ['SLURM_', 'PMI_', 'PMIX_']: - unset_vars_with_prefix(prefix) - - print("Variables unset successfully") - @run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, model_type: str = "llama", - triton_model_name: str = "xxx", + triton_model_name: str = 'triton_model', triton_model_version: Optional[int] = 1, triton_port: int = 8000, triton_http_address: str = "0.0.0.0", @@ -362,49 +273,72 @@ def deploy( max_input_len: int = 256, max_output_len: int = 256, max_batch_size: int = 8, - start_rest_service: bool = False, + start_rest_service: bool = True, rest_service_http_address: str = "0.0.0.0", - rest_service_port: int = 8000, - openai_format_response: bool = False, - ckpt_type: str = "nemo", + rest_service_port: int = 8080, + openai_format_response: bool = True, + output_generation_logits: bool = True ): + """ + Deploys nemo model on a PyTriton server by converting the nemo ckpt to trtllm. + Also starts rest service that is used to send OpenAI API compatible input request + to the PyTiton server. + + Args: + nemo_checkpoint (Path): Path for nemo checkpoint. + model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. + triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model name + is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. + triton_model_version (Optional[int]): Version for the triton model. Default: 1. + triton_port (int): Port for the PyTriton server. Default: 8000. + triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". 
+ triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60, + triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engin gets saved in this path specified. Default: None. + num_gpus (int): Number of GPUs for export to trtllm and deploy. Default: 1. + tensor_parallelism_size (int): Tensor parallelism size. Default: 1. + pipeline_parallelism_size (int): Pipeline parallelism size. Default: 1. + dtype (str): dtype of the TensorRT-LLM model. Default: "bfloat16". + max_input_len (int): Max input length of the model. Default: 256. + max_output_len (int): Max output length of the model. Default: 256. + max_batch_size (int): Max batch size of the model. Default: 8. + start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. Needs to be True + to be able to run evaluation . Default: True. + rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". + rest_service_port (int): Port for the rest service. Ensure the rest service port is the port fowarded between host machine and docker + when running locally inside a docker container. Default: 8080. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be True while running evaluation. + Default: True. + output_generation_logits (bool): If true builds trtllm engine with gather_generation_logits set to True. generation_logits are used to compute the + logProb of the output token. Default: True. + """ from nemo.deploy import DeployPyTriton - unset_environment_variables() + from nemo.collections.llm import evaluation + + evaluation.unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") return - # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py - #store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + # Store triton ip, port and other args relevant for REST API as env vars to be accessible by rest_model_api.py os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address os.environ['TRITON_PORT'] = str(triton_port) os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout) os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) - - # TODO: directly support deploy of trtllm engine wo exporting to TRTLLM - if ckpt_type == "trtllm": - triton_deployable = get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, - ) - elif ckpt_type == "nemo": - if nemo_checkpoint is None: - raise ValueError("In-Framework deployment requires a .nemo checkpoint") - try: - from nemo.deploy.nlp import MegatronLLMDeployable - except Exception as e: - raise ValueError( - "MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}" - ) - triton_deployable = MegatronLLMDeployable(nemo_checkpoint, num_gpus) + os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits) + + triton_deployable = evaluation.get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits + ) try: nm = 
DeployPyTriton( @@ -455,214 +389,60 @@ def deploy( def evaluate( nemo_checkpoint_path: Path, url: str = "http://0.0.0.0:1234/v1", - model_name: str = "xxxx", + model_name: str = "triton_model", eval_task: str = "gsm8k", num_fewshot: Optional[int] = None, limit: Optional[Union[int, float]] = None, bootstrap_iters: int = 100000, # inference params max_tokens_to_generate: Optional[int] = 256, - temperature: Optional[float] = None, + temperature: Optional[float] = 0.000000001, top_p: Optional[float] = 0.0, top_k: Optional[int] = 1, + add_bos: Optional[bool] = False, ): + """ + Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is + required to tokenize the evaluation input and output prompts. + url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. + Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able + to launch evaluation. + eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, + but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. + num_fewshot (int): number of examples in few-shot context. Default: None. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. + If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. + # inference params + max_tokens_to_generate (int): max tokens to generate. Default: 256. + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. + Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider + the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token + for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for + CausalLM its set to False. If needed set add_bos to True. 
- import time - - import requests - from lm_eval import evaluator - - ## This may change, how to deal with it ? In the past Instance class was in lm_eval.base - from lm_eval.api.instance import Instance - from lm_eval.api.model import LM - from requests.exceptions import RequestException - - def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): - """ - Wait for REST service to be ready. - - Args: - rest_url (str): URL of the REST service's health endpoint - max_retries (int): Maximum number of retry attempts - retry_interval (int): Time to wait between retries in seconds - - Returns: - bool: True if rest service is ready, False otherwise - """ - for _ in range(max_retries): - rest_ready = check_service(rest_url) - - if rest_ready: - print("REST service is ready.") - return True - - print(f"REST Service not ready yet. Retrying in {retry_interval} seconds...") - time.sleep(retry_interval) - - print("Timeout: One or both services did not become ready.") - return False - - def check_service(url): - """ - Check if a service is ready by making a GET request to its health endpoint. - - Args: - url (str): URL of the service's health endpoint - - Returns: - bool: True if the service is ready, False otherwise - """ - try: - response = requests.get(url, timeout=5) - return response.status_code == 200 - except RequestException: - return False - - class CustomModel(LM): - """ - Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md - """ - def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k): - self.model_name = model_name - self.api_url = api_url - self.tokenizer = tokenizer - self.max_tokens_to_generate = max_tokens_to_generate - self.temperature = temperature - self.top_p = top_p - self.top_k = top_k - super().__init__() - - def _generate_tokens_logprobs(self, payload, return_text: bool = False, return_logprobs: bool = False): - response = requests.post(f"{self.api_url}/completions/", json=payload) - response_data = response.json() - - if 'error' in response_data: - raise Exception(f"API Error: {response_data['error']}") - - # Assuming the response is in OpenAI format - if return_text: - return response_data['choices'][0]['text'] - - if return_logprobs: - # generation_logits is needed only for loglikelihood tasks - return response_data['choices'][0]['log_probs'], response_data['choices'][0]['generation_logits'] - - - def loglikelihood(self, requests: list[Instance]): - import numpy as np - import torch - import torch.nn.functional as F - - special_tokens_kwargs = {'add_special_tokens': False} ## Hardcode for now. TODO Infer add_bos from input. 
- results = [] - for request in requests: - context = request.arguments[0] - continuation = request.arguments[1] - context_enc = self.tokenizer.tokenizer.encode(context) #, **special_tokens_kwargs) #errors for SentencePeicetokenizer - continuation_enc = self.tokenizer.tokenizer.encode(continuation) #, **special_tokens_kwargs) - continuation_enc = continuation_enc[1:] #for SentencePeice since first encoded token is space, comment this for HF tokenizer - num_cont_tokens = len(continuation_enc) - ## Update self.max_tokens_to_generate with number of continuation tokens in the request - self.max_tokens_to_generate = num_cont_tokens - - payload = { - "model": self.model_name, - "prompt": context, - "max_tokens": self.max_tokens_to_generate, - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - # "compute_logprob": True ##TODO Do we want to have this as an - # user defined value or set it to True by default ? - } - log_probs, generation_logits = self._generate_tokens_logprobs(payload, return_logprobs=True) - # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation - multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) - cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) - greedy_tokens = multi_logits.argmax(dim=-1) - max_equal = (greedy_tokens == cont_toks).all() - logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( - -1 - ) - result = (float(logits.sum()), bool(max_equal)) - - results.append(result) - - return results - - def loglikelihood_rolling(self, requests: list[Instance]): - ## Note: loglikelihood_rolling does not have correct implementation yet, - # the tasks we have working so far: gsm8k, mmlu, lambada dont need loglikelihood_rolling - # log likelihood rolling calculation logic here - results = [] - for request in requests: - context = request.arguments[0] - continuation = request.arguments[1] - full_text = context + continuation - instance = Instance( - request_type="loglikelihood_rolling", - # doc={'text': full_text}, - doc=request.doc, - arguments=(full_text,), - idx=0, - ) - # Access the 'arguments' attribute of the Instance - prompt = instance.arguments[0] # This should be the prompt string - - # Extract default temperature from instance of the benchmark or use the user defined value - # Does not work for MMLU since the input instance does not contain temp key - # temperature = ( - # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature - # ) - payload = { - "model": self.model_name, - "prompt": prompt, - "max_tokens": self.max_tokens_to_generate, - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - # "compute_logprob": True ##TODO Do we want to have this as an - # user defined value or set it to True by default ? 
- } - - log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) - - # Assuming log_probs is a list of log probabilities for each token - continuation_log_probs = log_probs[0][0][-len(continuation) :] - results.append((continuation_log_probs, False)) - - return results - - def generate_until(self, inputs: list[Instance]): - # `Instance` is a dataclass defined in [`lm_eval.api.instance`] https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/lm_eval/api/instance.py - results = [] - for instance in inputs: - # Access the 'arguments' attribute of the Instance - prompt = instance.arguments[0] # This should be the prompt string - - payload = { - "model": self.model_name, - "prompt": prompt, - "max_tokens": self.max_tokens_to_generate, - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - # "compute_logprob": True ##TODO Do we want to have this as an - # user defined value or set it to True by default ? - } - - generated_text = self._generate_tokens_logprobs(payload, return_text=True) - - results.append(generated_text) - - return results - - ## Get tokenizer from nemo 2.0 model, in case of 1.0 please add appropriate code to get - ## tokenizer from 1.0 ckpt and pass it to CustomModel - model = io.load_context(nemo_checkpoint_path, subpath="model") - - wait_for_rest_service(rest_url=f"{url}/health") - model = CustomModel(model_name, url, model.tokenizer, max_tokens_to_generate, temperature, top_p, top_k) + """ + try: + # lm-evaluation-harness import + from lm_eval import evaluator + except ImportError: + raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required to run evaluations") + + from nemo.collections.llm import evaluation + + # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt. 
+ tokenizer = io.load_context(nemo_checkpoint_path + '/context', subpath="model").tokenizer + # Wait for rest service to be ready before starting evaluation + evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health") + # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate + model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos) results = evaluator.simple_evaluate( model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py new file mode 100644 index 000000000000..bfe66b3e0ee4 --- /dev/null +++ b/nemo/collections/llm/evaluation/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.evaluation.eval_utils import NeMoFWLMEval, unset_environment_variables, get_trtllm_deployable, wait_for_rest_service + +__all__ = ["NeMoFWLMEval", "unset_environment_variables", "get_trtllm_deployable", "wait_for_rest_service"] \ No newline at end of file diff --git a/nemo/collections/llm/evaluation/eval_utils.py b/nemo/collections/llm/evaluation/eval_utils.py new file mode 100644 index 000000000000..0287d1e3378f --- /dev/null +++ b/nemo/collections/llm/evaluation/eval_utils.py @@ -0,0 +1,267 @@ +import time +import requests +from requests.exceptions import RequestException +import subprocess +import os +from pathlib import Path + +import torch +import torch.nn.functional as F + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer +from nemo.utils import logging + +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM + +class NeMoFWLMEval(LM): + """ + NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with our model deployed on PyTriton server. + Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md + """ + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): + self.model_name = model_name + self.api_url = api_url + self.tokenizer = tokenizer + self.max_tokens_to_generate = max_tokens_to_generate + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k + self.add_bos = add_bos + super().__init__() + + def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): + """ + A private method that sends post request to the model on PyTriton server and returns either generated text or logits. 
+ """ + # send a post request to /v1/completions/ endpoint with the payload + response = requests.post(f"{self.api_url}/v1/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") + + # Assuming the response is in OpenAI format + if return_text: + # in case of generate_until tasks return just the text + return response_data['choices'][0]['text'] + + if return_logits: + # in case of loglikelihood tasks return the logits + return response_data['choices'][0]['generation_logits'] + + def tokenizer_type(self, tokenizer): + if isinstance(tokenizer, AutoTokenizer): + return "AutoTokenizer" + elif isinstance(tokenizer, SentencePieceTokenizer): + return "SentencePieceTokenizer" + else: + return "Unknown tokenizer type" + + def loglikelihood(self, requests: list[Instance]): + """ + Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. + Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + """ + if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + special_tokens_kwargs = {'add_bos': self.add_bos} + elif self.tokenizer_type(self.tokenizer) == "AutoTokenizer": + special_tokens_kwargs = {'add_special_tokens': self.add_bos} ## Hardcode for now. TODO Infer add_bos from input. + + results = [] + for request in requests: + # get the input prompt from the request + context = request.arguments[0] + # get the output prompt from the request + continuation = request.arguments[1] + # get encoded tokens of continuation + continuation_enc = self.tokenizer.tokenizer.encode(continuation, **special_tokens_kwargs) + # for SentencePeice consider the encoded tokens from the 2nd token since first encoded token is space. 
+ if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": continuation_enc = continuation_enc[1:] + num_cont_tokens = len(continuation_enc) + # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request + self.max_tokens_to_generate = num_cont_tokens + # Create payload to query the model deployed on PyTriton server + payload = { + "model": self.model_name, + "prompt": context, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + } + # Get the logits from the model + generation_logits = self._generate_tokens_logits(payload, return_logits=True) + # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation of log_softmax + multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + # Convert encoded continuation tokens to torch tensor + cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) + # Get the greedy token from the logits (i.e token with the highest prob) + greedy_tokens = multi_logits.argmax(dim=-1) + # Check if all greedy_tokens match the the actual continuation tokens + is_greedy = (greedy_tokens == cont_toks).all() + # Get the logits corresponding to the actual continuation tokens + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) + # result is tuple of logProb of generating the continuation token and is_greedy + result = (float(logits.sum()), bool(is_greedy)) + + results.append(result) + + return results + + def loglikelihood_rolling(self, requests: list[Instance]): + pass + + def generate_until(self, inputs: list[Instance]): + """ + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. + Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + """ + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance which contains the input prompt string + prompt = instance.arguments[0] + # Create payload to query the model deployed on PyTriton server + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + } + # Get the text generated by the model + generated_text = self._generate_tokens_logits(payload, return_text=True) + + results.append(generated_text) + + return results + +def unset_environment_variables(): + """ + SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work + on clusters. This method takes care of unsetting these env variables + # TODO maybe move this to NeMo-Run script ? 
+ """ + logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars_with_prefix(prefix) + + logging.info("Variables unset successfully") + +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits +): + from nemo.export.tensorrt_llm import TensorRTLLM + + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." + ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + gather_generation_logits=output_generation_logits + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) + + return trt_llm_exporter + +def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): + """ + Wait for REST service to be ready. + + Args: + rest_url (str): URL of the REST service's health endpoint + max_retries (int): Maximum number of retry attempts. Defaul: 60. + retry_interval (int): Time to wait between retries in seconds. Default: 2. + + Returns: + bool: True if rest service is ready, False otherwise + """ + + def check_service(url): + """ + Check if the service is ready by making a GET request to its health endpoint. + + Args: + url (str): URL of the service's health endpoint + + Returns: + bool: True if the service is ready, False otherwise + """ + try: + response = requests.get(url, timeout=5) + return response.status_code == 200 + except RequestException: + return False + + for _ in range(max_retries): + rest_ready = check_service(rest_url) + + if rest_ready: + logging.info("REST service is ready.") + return True + + logging.info(f"REST Service not ready yet. 
Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + + logging.info("Timeout: REST service did not become ready.") + return False \ No newline at end of file diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 6aa9ed79813a..62215917733b 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -174,6 +174,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, + output_generation_logits: bool = False ): """ Query the Triton server synchronously and return a list of responses. @@ -190,6 +191,8 @@ def query_llm( no_repeat_ngram_size (int): no repeat ngram size. task_id (str): downstream task id if virtual tokens are used. init_timeout (flat): timeout for the connection. + openai_format_response: return response similar to OpenAI API format + output_generation_logits: return generation logits from model on PyTriton """ prompts = str_list2numpy(prompts) @@ -248,6 +251,9 @@ def query_llm( if end_strings is not None: inputs["end_strings"] = str_list2numpy(end_strings) + if output_generation_logits is not None: + inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_) + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: result_dict = client.infer_batch(**inputs) output_type = client.model_config.outputs[0].dtype @@ -267,13 +273,12 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - # TODO if compute_logprobs is True then add log_probs - ## Convert log_probs to a list to make it json serializable "choices": [{"text": str(sentences), - "log_probs":result_dict["log_probs"].tolist(), - "generation_logits": result_dict["generation_logits"].tolist() - }] + #"generation_logits": result_dict["generation_logits"].tolist() + }] } + # Convert gneration logits to a list to make it json serializable and add it to openai_response dict + if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() return openai_response else: return sentences diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index aaf9f3b6c0a0..6218cd2ed6f4 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -8,8 +8,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - -import json import os from pathlib import Path import requests @@ -33,6 +31,7 @@ def __init__(self): self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') self._triton_request_timeout = int(os.environ.get('TRITON_REQUEST_TIMEOUT', 60)) self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' + self._output_generation_logits = os.environ.get('OUTPUT_GENERATION_LOGITS', 'False').lower() == 'true' except Exception as error: print("An exception occurred:", error) return @@ -52,11 +51,17 @@ def triton_request_timeout(self): @property def openai_format_response(self): """ - Retuns the response from Triton server in OpenAI compatible formar if set to True, - default set in config.json is false. + Retuns the response from Triton server in OpenAI compatible formar if set to True. """ return self._openai_format_response + @property + def output_generation_logits(self): + """ + Retuns the generation logits along with text in Triton server output if set to True. 
+ """ + return self._output_generation_logits + app = FastAPI() triton_settings = TritonSettings() @@ -113,9 +118,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - # TODO make these two user configurable ?? - all_probs=True, - compute_logprob=True + output_generation_logits=triton_settings.output_generation_logits ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 76ea171bdc29..e16275b2208d 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -180,6 +180,8 @@ def export( reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, + gather_context_logits: Optional[bool] = False, + gather_generation_logits: Optional[bool] = False ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -218,6 +220,8 @@ def export( reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. + gather_context_logits (Optional[bool]): if True, enables gather_context_logits while building trtllm engine. Default: False + gather_generation_logits (Optional[bool]): if True, enables gather_generation_logits while building trtllm engine. Default: False """ if n_gpus is not None: warnings.warn( @@ -495,6 +499,8 @@ def get_transformer_config(nemo_model_config): multiple_profiles=multiple_profiles, gpt_attention_plugin=gpt_attention_plugin, gemm_plugin=gemm_plugin, + gather_context_logits=gather_context_logits, + gather_generation_logits=gather_generation_logits ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -688,6 +694,7 @@ def forward( prompt_embeddings_checkpoint_path: str = None, streaming: bool = False, output_log_probs: bool = False, + output_generation_logits: bool = False, **sampling_kwargs, ): """ @@ -706,6 +713,7 @@ def forward( task_ids (List(str)): list of the task ids for the prompt tables. prompt_embeddings_table (List(float)): prompt embeddings table. prompt_embeddings_checkpoint_path (str): path for the nemo checkpoint for the prompt embedding table. + output_generation_logits (bool): if True returns generation_logits in the outout of generate method. sampling_kwargs: Additional kwargs to set in the SamplingConfig. 
""" @@ -784,6 +792,7 @@ def forward( no_repeat_ngram_size=no_repeat_ngram_size, output_log_probs=output_log_probs, multiprocessed_env=multiprocessed_env, + output_generation_logits=output_generation_logits, **sampling_kwargs, ) else: @@ -862,20 +871,19 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), - Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), - Tensor(name="all_probs", shape=(-1,), dtype=np.bool_, optional=True) + Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False), ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single), + outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="generation_logits", shape=(-1,), dtype=np.single)) return outputs @batch def triton_infer_fn(self, **inputs: np.ndarray): - log_probs = None + output_dict = {} try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} if "max_output_len" in inputs: @@ -902,23 +910,20 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "lora_uids" in inputs: lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8") infer_input["lora_uids"] = lora_uids[0].tolist() - if "compute_logprob" in inputs: - infer_input["output_log_probs"] = inputs.pop("compute_logprob")[0][0] - if "all_probs" in inputs: - infer_input["all_probs"] = inputs.pop("all_probs")[0][0] - - if infer_input["output_log_probs"]: - output_texts, log_probs, generation_logits = self.forward(**infer_input) - log_probs = np.array(log_probs.cpu().numpy()) - generation_logits = np.array(generation_logits.cpu().numpy()) + if "output_generation_logits" in inputs: + infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0] + + if infer_input["output_generation_logits"]: + output_texts, generation_logits = self.forward(**infer_input) + output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy()) else: output_texts = self.forward(**infer_input) - output = cast_output(output_texts, np.bytes_) + output_dict["outputs"] = cast_output(output_texts, np.bytes_) except Exception as error: err_msg = "An error occurred: {0}".format(str(error)) - output = cast_output([err_msg], np.bytes_) + output_dict["outputs"] = cast_output([err_msg], np.bytes_) - return {"outputs": output, "log_probs": log_probs, "generation_logits": generation_logits} + return output_dict @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 424e4c3f27d9..88767917301e 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -54,6 +54,8 @@ def build_and_save_engine( gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", reduce_fusion: bool = False, + gather_context_logits: bool = False, + gather_generation_logits: bool = False ): architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: @@ -96,8 +98,8 @@ def build_and_save_engine( 'max_num_tokens': max_num_tokens, 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, - 'gather_context_logits': False, - 
'gather_generation_logits': True, + 'gather_context_logits': gather_context_logits, + 'gather_generation_logits': gather_generation_logits, 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 7cbb038c5b0f..84c4be7a616f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -649,6 +649,7 @@ def generate( streaming: bool = False, output_log_probs=False, multiprocessed_env=False, + output_generation_logits=False, **sampling_kwargs, ) -> Optional[List[List[str]]]: """Generate the output sequence from the input sequence. @@ -702,16 +703,14 @@ def generate( output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] input_lengths = [t.shape[0] for t in input_tensors] - log_probs = outputs['log_probs'] - generation_logits = outputs['generation_logits'] output_lines_list = [ tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]]) for b in range(output_ids.shape[0]) ] - if output_log_probs: - return output_lines_list, log_probs, generation_logits + if output_generation_logits: + return output_lines_list, outputs['generation_logits'] return output_lines_list From b6bdf90e2d57301df6d47b0d79421ced2396a8a0 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 8 Nov 2024 16:50:21 -0800 Subject: [PATCH 11/21] Add copyright and initialize special_tokens_kwargs in eval_utils.py Signed-off-by: Abhishree --- nemo/collections/llm/evaluation/eval_utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/evaluation/eval_utils.py b/nemo/collections/llm/evaluation/eval_utils.py index 0287d1e3378f..cb35dec698bc 100644 --- a/nemo/collections/llm/evaluation/eval_utils.py +++ b/nemo/collections/llm/evaluation/eval_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time import requests from requests.exceptions import RequestException @@ -64,10 +78,11 @@ def loglikelihood(self, requests: list[Instance]): Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. """ + special_tokens_kwargs = {} if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": - special_tokens_kwargs = {'add_bos': self.add_bos} + special_tokens_kwargs['add_bos'] = self.add_bos elif self.tokenizer_type(self.tokenizer) == "AutoTokenizer": - special_tokens_kwargs = {'add_special_tokens': self.add_bos} ## Hardcode for now. TODO Infer add_bos from input. 
+ special_tokens_kwargs['add_special_tokens'] = self.add_bos results = [] for request in requests: From 32a9d9add27b3a4b72643ceed5b144b67e4e6b30 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 08:11:48 -0800 Subject: [PATCH 12/21] Add the following chanes 1) Move get_trtllm_deployable and unset_environment_variables to deploy base.py 2) Rename eval_utils.py to base.py 3) REstore scripts/export/convert_nemo2_for_export.py Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 6 +- nemo/collections/llm/deploy/__init__.py | 3 + nemo/collections/llm/deploy/base.py | 102 +++++++++++++++ nemo/collections/llm/evaluation/__init__.py | 4 +- .../llm/evaluation/{eval_utils.py => base.py} | 86 ------------ nemo/deploy/nlp/query_llm.py | 5 +- nemo/lightning/pytorch/callbacks/debugging.py | 1 - scripts/export/convert_nemo2_for_export.py | 123 ++++++++++++++++++ 8 files changed, 235 insertions(+), 95 deletions(-) create mode 100644 nemo/collections/llm/deploy/__init__.py create mode 100644 nemo/collections/llm/deploy/base.py rename nemo/collections/llm/evaluation/{eval_utils.py => base.py} (72%) create mode 100644 scripts/export/convert_nemo2_for_export.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 4ffc93b8d3b9..a870b55c9574 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -312,9 +312,9 @@ def deploy( logProb of the output token. Default: True. """ from nemo.deploy import DeployPyTriton - from nemo.collections.llm import evaluation + from nemo.collections.llm import deploy - evaluation.unset_environment_variables() + deploy.unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -326,7 +326,7 @@ def deploy( os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits) - triton_deployable = evaluation.get_trtllm_deployable( + triton_deployable = deploy.get_trtllm_deployable( nemo_checkpoint, model_type, triton_model_repository, diff --git a/nemo/collections/llm/deploy/__init__.py b/nemo/collections/llm/deploy/__init__.py new file mode 100644 index 000000000000..312cfb93ca1c --- /dev/null +++ b/nemo/collections/llm/deploy/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.deploy.base import unset_environment_variables, get_trtllm_deployable + +__all__ = ["unset_environment_variables", "get_trtllm_deployable"] diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py new file mode 100644 index 000000000000..2ae87c1f3a46 --- /dev/null +++ b/nemo/collections/llm/deploy/base.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo.utils import logging +import subprocess +import os +from pathlib import Path + +def unset_environment_variables(): + """ + SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work + on clusters. This method takes care of unsetting these env variables + # TODO maybe move this to NeMo-Run script ? + """ + logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars_with_prefix(prefix) + + logging.info("Variables unset successfully") + +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits +): + from nemo.export.tensorrt_llm import TensorRTLLM + + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." + ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + gather_generation_logits=output_generation_logits + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) + + return trt_llm_exporter \ No newline at end of file diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py index bfe66b3e0ee4..bca7c6251588 100644 --- a/nemo/collections/llm/evaluation/__init__.py +++ b/nemo/collections/llm/evaluation/__init__.py @@ -1,3 +1,3 @@ -from nemo.collections.llm.evaluation.eval_utils import NeMoFWLMEval, unset_environment_variables, get_trtllm_deployable, wait_for_rest_service +from nemo.collections.llm.evaluation.base import NeMoFWLMEval, wait_for_rest_service -__all__ = ["NeMoFWLMEval", "unset_environment_variables", "get_trtllm_deployable", "wait_for_rest_service"] \ No newline at end of file +__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] \ No newline at end of file diff --git a/nemo/collections/llm/evaluation/eval_utils.py b/nemo/collections/llm/evaluation/base.py similarity index 72% rename from nemo/collections/llm/evaluation/eval_utils.py rename to nemo/collections/llm/evaluation/base.py index cb35dec698bc..145d70b5c6fc 100644 --- a/nemo/collections/llm/evaluation/eval_utils.py +++ b/nemo/collections/llm/evaluation/base.py @@ -15,9 +15,6 @@ import time import requests from requests.exceptions import RequestException -import subprocess -import os -from pathlib import Path import torch import torch.nn.functional as F @@ -155,89 +152,6 @@ def generate_until(self, inputs: list[Instance]): return results -def unset_environment_variables(): - """ - SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work - on clusters. This method takes care of unsetting these env variables - # TODO maybe move this to NeMo-Run script ? - """ - logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") - - # Function to unset variables with a specific prefix - def unset_vars_with_prefix(prefix): - cmd = f"env | grep ^{prefix} | cut -d= -f1" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - vars_to_unset = result.stdout.strip().split('\n') - for var in vars_to_unset: - if var: # Check if the variable name is not empty - os.environ.pop(var, None) - - # Unset variables for each prefix - for prefix in ['SLURM_', 'PMI_', 'PMIX_']: - unset_vars_with_prefix(prefix) - - logging.info("Variables unset successfully") - -def get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, - output_generation_logits -): - from nemo.export.tensorrt_llm import TensorRTLLM - - if triton_model_repository is None: - trt_llm_path = "/tmp/trt_llm_model_dir/" - Path(trt_llm_path).mkdir(parents=True, exist_ok=True) - else: - trt_llm_path = triton_model_repository - - if nemo_checkpoint is None and triton_model_repository is None: - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." - ) - - if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." 
- ) - - if nemo_checkpoint is not None and model_type is None: - raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - - trt_llm_exporter = TensorRTLLM( - model_dir=trt_llm_path, - load_model=(nemo_checkpoint is None), - ) - - if nemo_checkpoint is not None: - try: - logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") - trt_llm_exporter.export( - nemo_checkpoint_path=nemo_checkpoint, - model_type=model_type, - n_gpus=num_gpus, - tensor_parallelism_size=tensor_parallelism_size, - pipeline_parallelism_size=pipeline_parallelism_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - dtype=dtype, - gather_generation_logits=output_generation_logits - ) - except Exception as error: - raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) - - return trt_llm_exporter def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): """ diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 62215917733b..4c55cf3b2c15 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -273,9 +273,8 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - "choices": [{"text": str(sentences), - #"generation_logits": result_dict["generation_logits"].tolist() - }] + "choices": [{"text": str(sentences) + }] } # Convert gneration logits to a list to make it json serializable and add it to openai_response dict if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index b4f80ac89608..b5da06dfbf53 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -116,7 +116,6 @@ def _apply_user_funcs(self, trainer: pl.Trainer, pl_module: pl.LightningModule) Iterate over model parameters, find gradient tensor, apply and collect outputs of param_fn and grad_fn, and log outputs in a table. """ - from prettytable import PrettyTable def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: """If using MCore optimizer, search the grad buckets for param's grad tensor.""" if not isinstance(pl_module.optim, MegatronOptimizerModule): diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py new file mode 100644 index 000000000000..f1eea3cfa6b8 --- /dev/null +++ b/scripts/export/convert_nemo2_for_export.py @@ -0,0 +1,123 @@ +opyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert a NeMo 2.0 checkpoint to NeMo 1.0 for TRTLLM export. 
+Example to run this conversion script: +``` + python /opt/NeMo/scripts/scripts/export/convert_nemo2_for_export.py \ + --input_path /path/to/nemo2/ckpt \ + --output_path /path/to/output \ + --tokenizer_type huggingface \ + --tokenizer_name meta-llama/Meta-Llama-3.1-8B \ + --symbolic_link=True +``` +""" + +import os +import shutil +from argparse import ArgumentParser + +from omegaconf import OmegaConf + +from nemo.lightning import io + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_path", + type=str, + required=True, + help="Path to nemo 2.0 checkpoint", + ) + parser.add_argument( + "--output_path", + type=str, + required=True, + help="Output path", + ) + parser.add_argument( + "--tokenizer_type", + type=str, + default="huggingface", + help="Type of tokenizer", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default="meta-llama/Meta-Llama-3.1-8B", + help="Name or path of tokenizer", + ) + parser.add_argument( + "--symbolic_link", + type=bool, + default=True, + help="Whether to use symbiloc link for model weights", + ) + + args = parser.parse_args() + return args + + +def main(args): + input_path = args.input_path + output_path = args.output_path + weight_path = os.path.join(output_path, "model_weights") + + if os.path.exists(output_path): + shutil.rmtree(output_path) + print(f"Remove existing {output_path}") + + os.makedirs(output_path, exist_ok=True) + + config = io.load_context(input_path, subpath="model.config") + + config_dict = {} + for k, v in config.__dict__.items(): + if isinstance(v, (float, int, str, bool)): + config_dict[k] = v + elif k == "activation_func": + config_dict["activation"] = v.__name__ + + if config_dict.get("num_moe_experts") is None: + config_dict["num_moe_experts"] = 0 + config_dict["moe_router_topk"] = 0 + if config_dict["activation"] == "silu": + config_dict["activation"] = "fast-swiglu" + + config_dict["mcore_gpt"] = True + config_dict["max_position_embeddings"] = config_dict.get("seq_length") + config_dict["tokenizer"] = { + "library": args.tokenizer_type, + "type": args.tokenizer_name, + "use_fast": True, + } + + yaml_config = OmegaConf.create(config_dict) + OmegaConf.save(config=yaml_config, f=os.path.join(output_path, "model_config.yaml")) + + if args.symbolic_link: + os.symlink(input_path, weight_path) + else: + os.makedirs(weight_path, exist_ok=True) + for file in os.listdir(input_path): + source_path = os.path.join(input_path, file) + target_path = os.path.join(weight_path, file) + shutil.copy(source_path, target_path) + + +if __name__ == "__main__": + args = get_args() + main(args) From 0bdea4b400fb03762741f26c257d46dd09e601f3 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 08:17:29 -0800 Subject: [PATCH 13/21] Fix a minor typo Signed-off-by: Abhishree --- scripts/export/convert_nemo2_for_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py index f1eea3cfa6b8..0703322cd854 100644 --- a/scripts/export/convert_nemo2_for_export.py +++ b/scripts/export/convert_nemo2_for_export.py @@ -1,4 +1,4 @@ -opyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 05428c210102046a0886513858dc98bda3481874 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 11:07:19 -0800 Subject: [PATCH 14/21] Revert output_log_probs and all_probs arg in tensorrt_llm_run.py Signed-off-by: Abhishree --- nemo/export/trt_llm/tensorrt_llm_run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 84c4be7a616f..ef67c918290f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -279,8 +279,6 @@ def _forward( streaming=streaming, output_sequence_lengths=True, return_dict=True, - output_log_probs=sampling_kwargs.get('output_log_probs', False), - all_probs=sampling_kwargs.get('all_probs', False), ) torch.cuda.synchronize() From 8428a6884adceef80bc91bb842c1c3277fd90413 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 11:42:55 -0800 Subject: [PATCH 15/21] Fix docstrings formatting Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 50 +++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index a870b55c9574..22515c25a559 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -403,30 +403,32 @@ def evaluate( ): """ Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). - nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is - required to tokenize the evaluation input and output prompts. - url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. - Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. - The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. - model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able - to launch evaluation. - eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". - These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, - but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. - num_fewshot (int): number of examples in few-shot context. Default: None. - limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. - If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. - bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. - # inference params - max_tokens_to_generate (int): max tokens to generate. Default: 256. - temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. 
- Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. - top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider - the single most likely token for the next prediction. Default: 0.0. - top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token - for the next prediction. Default: 1 - add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for - CausalLM its set to False. If needed set add_bos to True. + + Args: + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is + required to tokenize the evaluation input and output prompts. + url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. + Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able + to launch evaluation. + eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, + but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. + num_fewshot (int): number of examples in few-shot context. Default: None. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. + If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. + # inference params + max_tokens_to_generate (int): max tokens to generate. Default: 256. + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. + Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider + the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token + for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for + CausalLM its set to False. If needed set add_bos to True. 
""" try: From 85b988549240957a71e38de02b779223c89c1c42 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 15:43:35 -0800 Subject: [PATCH 16/21] Pylint and other minor fixes Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 73 +++++++++++++------------ nemo/collections/llm/deploy/base.py | 21 +++++-- nemo/collections/llm/evaluation/base.py | 22 +++++--- nemo/deploy/service/rest_model_api.py | 2 +- 4 files changed, 72 insertions(+), 46 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 22515c25a559..db8556afa072 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -288,12 +288,13 @@ def deploy( nemo_checkpoint (Path): Path for nemo checkpoint. model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model name - is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. + is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. triton_model_version (Optional[int]): Version for the triton model. Default: 1. triton_port (int): Port for the PyTriton server. Default: 8000. triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". - triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60, - triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engin gets saved in this path specified. Default: None. + triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60. + triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engine gets saved in this specified + path. If None, saves it in /tmp dir. Default: None. num_gpus (int): Number of GPUs for export to trtllm and deploy. Default: 1. tensor_parallelism_size (int): Tensor parallelism size. Default: 1. pipeline_parallelism_size (int): Pipeline parallelism size. Default: 1. @@ -301,15 +302,14 @@ def deploy( max_input_len (int): Max input length of the model. Default: 256. max_output_len (int): Max output length of the model. Default: 256. max_batch_size (int): Max batch size of the model. Default: 8. - start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. Needs to be True - to be able to run evaluation . Default: True. + start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. + Needs to be True to be able to run evaluation. Default: True. rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". - rest_service_port (int): Port for the rest service. Ensure the rest service port is the port fowarded between host machine and docker - when running locally inside a docker container. Default: 8080. - openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be True while running evaluation. - Default: True. - output_generation_logits (bool): If true builds trtllm engine with gather_generation_logits set to True. generation_logits are used to compute the - logProb of the output token. Default: True. + rest_service_port (int): Port for the rest service. Default: 8080. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be + True while running evaluation. 
Default: True. + output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. + generation_logits are used to compute the logProb of the output token. Default: True. """ from nemo.deploy import DeployPyTriton from nemo.collections.llm import deploy @@ -388,7 +388,7 @@ def deploy( def evaluate( nemo_checkpoint_path: Path, - url: str = "http://0.0.0.0:1234/v1", + url: str = "http://0.0.0.0:8080/v1", model_name: str = "triton_model", eval_task: str = "gsm8k", num_fewshot: Optional[int] = None, @@ -402,34 +402,39 @@ def evaluate( add_bos: Optional[bool] = False, ): """ - Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). + Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness + (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). Args: - nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is - required to tokenize the evaluation input and output prompts. - url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. - Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. - The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. - model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able - to launch evaluation. + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which + is required to tokenize the evaluation input and output prompts. + url (str): rest serice url and port that were used in the deploy method above in the format: + http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts (from lm-eval-harness) + are sent to this url which is then passed to the model deployed on PyTriton server. The rest service url and port + serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name + passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". - These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, - but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from + lm-evaluation-harness can be run, but only the above mentioned ones are tested. Tasks of type + loglikelihood_rolling are not supported yet. num_fewshot (int): number of examples in few-shot context. Default: None. - limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. - If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. 
- bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit + is a percentage of the total number of examples. If int say x, then run evaluation only on x number of samples + from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 + for no stderr calculations to be performed. Default: 100000. # inference params max_tokens_to_generate (int): max tokens to generate. Default: 256. - temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. - Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. - top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider - the single most likely token for the next prediction. Default: 0.0. - top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token - for the next prediction. Default: 1 - add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for - CausalLM its set to False. If needed set add_bos to True. - + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token + with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM(# TODO to be investigated). + Hence using a very samll value as the default. Default: 0.000000001. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. + top_p=0 means the model will only consider the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will + only consider the single most likely token for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when + encoding a string. Default: False since typically for CausalLM its set to False. If needed set add_bos to True. """ try: # lm-evaluation-harness import diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py index 2ae87c1f3a46..46ce57152f54 100644 --- a/nemo/collections/llm/deploy/base.py +++ b/nemo/collections/llm/deploy/base.py @@ -17,28 +17,38 @@ import os from pathlib import Path -def unset_environment_variables(): +def unset_environment_variables() -> None: """ SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work on clusters. This method takes care of unsetting these env variables - # TODO maybe move this to NeMo-Run script ? 
""" logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") # Function to unset variables with a specific prefix def unset_vars_with_prefix(prefix): + unset_vars = [] cmd = f"env | grep ^{prefix} | cut -d= -f1" result = subprocess.run(cmd, shell=True, capture_output=True, text=True) vars_to_unset = result.stdout.strip().split('\n') for var in vars_to_unset: if var: # Check if the variable name is not empty os.environ.pop(var, None) + unset_vars.append(var) + return unset_vars + + # Collect all unset variables across all prefixes + all_unset_vars = [] # Unset variables for each prefix for prefix in ['SLURM_', 'PMI_', 'PMIX_']: - unset_vars_with_prefix(prefix) + unset_vars = unset_vars_with_prefix(prefix) + all_unset_vars.extend(unset_vars) + + if all_unset_vars: + logging.info(f"Unset env variables: {', '.join(all_unset_vars)}") + else: + logging.info("No env variables were unset.") - logging.info("Variables unset successfully") def get_trtllm_deployable( nemo_checkpoint, @@ -53,6 +63,9 @@ def get_trtllm_deployable( dtype, output_generation_logits ): + """ + Exports the nemo checkpoint to trtllm and returns trt_llm_exporter that is used to deploy on PyTriton. + """ from nemo.export.tensorrt_llm import TensorRTLLM if triton_model_repository is None: diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index 145d70b5c6fc..0fdc41cff06b 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -28,7 +28,8 @@ class NeMoFWLMEval(LM): """ - NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with our model deployed on PyTriton server. + NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with + NeMo model deployed on PyTriton server. Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): @@ -44,7 +45,8 @@ def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, tempe def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): """ - A private method that sends post request to the model on PyTriton server and returns either generated text or logits. + A private method that sends post request to the model on PyTriton server and returns either generated text or + logits. """ # send a post request to /v1/completions/ endpoint with the payload response = requests.post(f"{self.api_url}/v1/completions/", json=payload) @@ -63,17 +65,22 @@ def _generate_tokens_logits(self, payload, return_text: bool = False, return_log return response_data['choices'][0]['generation_logits'] def tokenizer_type(self, tokenizer): + """ + Returns the type of the tokenizer. + """ if isinstance(tokenizer, AutoTokenizer): return "AutoTokenizer" elif isinstance(tokenizer, SentencePieceTokenizer): return "SentencePieceTokenizer" else: - return "Unknown tokenizer type" + raise ValueError("Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " + "how to handle special tokens for this tokenizer") def loglikelihood(self, requests: list[Instance]): """ - Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. 
- Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass + defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here + loglikelihood) and other relevant args like few shot samples. """ special_tokens_kwargs = {} if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": @@ -129,8 +136,9 @@ def loglikelihood_rolling(self, requests: list[Instance]): def generate_until(self, inputs: list[Instance]): """ - Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. - Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass + defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here + loglikelihood) and other relevant args like few shot samples. """ results = [] for instance in inputs: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 6218cd2ed6f4..21c68f18580d 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -51,7 +51,7 @@ def triton_request_timeout(self): @property def openai_format_response(self): """ - Retuns the response from Triton server in OpenAI compatible formar if set to True. + Retuns the response from Triton server in OpenAI compatible format if set to True. """ return self._openai_format_response From 1a2245821afc41d934fcf27bf8d35c5d9115ada9 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 16:01:39 -0800 Subject: [PATCH 17/21] Fix pylint and typos Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 37 +++++++++++++------------ nemo/collections/llm/evaluation/base.py | 9 ++++-- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index db8556afa072..26bcb4275ba7 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os from copy import deepcopy from pathlib import Path @@ -287,8 +286,9 @@ def deploy( Args: nemo_checkpoint (Path): Path for nemo checkpoint. model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. - triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model name - is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. + triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model + name is passed to the evalute method for the model to be accessible while sending evalution requests. + Default: 'triton_model'. triton_model_version (Optional[int]): Version for the triton model. Default: 1. triton_port (int): Port for the PyTriton server. Default: 8000. triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". @@ -306,8 +306,8 @@ def deploy( Needs to be True to be able to run evaluation. Default: True. 
rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". rest_service_port (int): Port for the rest service. Default: 8080. - openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be - True while running evaluation. Default: True. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to + be True while running evaluation. Default: True. output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. generation_logits are used to compute the logProb of the output token. Default: True. """ @@ -406,14 +406,14 @@ def evaluate( (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). Args: - nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which - is required to tokenize the evaluation input and output prompts. - url (str): rest serice url and port that were used in the deploy method above in the format: - http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts (from lm-eval-harness) - are sent to this url which is then passed to the model deployed on PyTriton server. The rest service url and port - serve as the entry point to evaluate model deployed on PyTriton server. - model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name - passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt + which is required to tokenize the evaluation input and output prompts. + url (str): rest service url and port that were used in the deploy method above in the format: + http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts + (from lm-eval-harness) are sent to this url which is then passed to the model deployed on PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as + triton_model_name passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, but only the above mentioned ones are tested. Tasks of type @@ -427,12 +427,12 @@ def evaluate( # inference params max_tokens_to_generate (int): max tokens to generate. Default: 256. temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token - with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM(# TODO to be investigated). - Hence using a very samll value as the default. Default: 0.000000001. + with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM + (# TODO to be investigated). Hence using a very samll value as the default. Default: 0.000000001. top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider the single most likely token for the next prediction. Default: 0.0. 
- top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will - only consider the single most likely token for the next prediction. Default: 1 + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model + will only consider the single most likely token for the next prediction. Default: 1 add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for CausalLM its set to False. If needed set add_bos to True. """ @@ -440,7 +440,8 @@ def evaluate( # lm-evaluation-harness import from lm_eval import evaluator except ImportError: - raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required to run evaluations") + raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required " + "to run evaluations") from nemo.collections.llm import evaluation diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index 0fdc41cff06b..f9dc3debb298 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -132,13 +132,16 @@ def loglikelihood(self, requests: list[Instance]): return results def loglikelihood_rolling(self, requests: list[Instance]): + """ + Defines the loglikelihood_rolling request type. Yet to be implemented. + """ pass def generate_until(self, inputs: list[Instance]): """ - Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass - defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here - loglikelihood) and other relevant args like few shot samples. + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a + dataclass defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request + type(here loglikelihood) and other relevant args like few shot samples. """ results = [] for instance in inputs: From f6654c99e2272c6b1860b2bfa0e3a42b6f4712da Mon Sep 17 00:00:00 2001 From: athitten Date: Thu, 14 Nov 2024 00:03:11 +0000 Subject: [PATCH 18/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/collections/llm/api.py | 15 ++++++---- nemo/collections/llm/deploy/__init__.py | 2 +- nemo/collections/llm/deploy/base.py | 12 ++++---- nemo/collections/llm/evaluation/__init__.py | 2 +- nemo/collections/llm/evaluation/base.py | 30 ++++++++++--------- nemo/deploy/nlp/query_llm.py | 8 ++--- nemo/deploy/service/rest_model_api.py | 2 +- nemo/export/tensorrt_llm.py | 10 ++++--- nemo/export/trt_llm/tensorrt_llm_build.py | 2 +- nemo/lightning/pytorch/callbacks/debugging.py | 1 + 10 files changed, 47 insertions(+), 37 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 26bcb4275ba7..07899b2ee484 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -276,7 +276,7 @@ def deploy( rest_service_http_address: str = "0.0.0.0", rest_service_port: int = 8080, openai_format_response: bool = True, - output_generation_logits: bool = True + output_generation_logits: bool = True, ): """ Deploys nemo model on a PyTriton server by converting the nemo ckpt to trtllm. @@ -311,8 +311,8 @@ def deploy( output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. 
generation_logits are used to compute the logProb of the output token. Default: True. """ - from nemo.deploy import DeployPyTriton from nemo.collections.llm import deploy + from nemo.deploy import DeployPyTriton deploy.unset_environment_variables() if start_rest_service: @@ -337,7 +337,7 @@ def deploy( max_output_len, max_batch_size, dtype, - output_generation_logits + output_generation_logits, ) try: @@ -440,8 +440,9 @@ def evaluate( # lm-evaluation-harness import from lm_eval import evaluator except ImportError: - raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required " - "to run evaluations") + raise ImportError( + "Please ensure that lm-evaluation-harness is installed in your env as it is required " "to run evaluations" + ) from nemo.collections.llm import evaluation @@ -450,7 +451,9 @@ def evaluate( # Wait for rest service to be ready before starting evaluation evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health") # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate - model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos) + model = evaluation.NeMoFWLMEval( + model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos + ) results = evaluator.simple_evaluate( model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) diff --git a/nemo/collections/llm/deploy/__init__.py b/nemo/collections/llm/deploy/__init__.py index 312cfb93ca1c..24c102bfa0d2 100644 --- a/nemo/collections/llm/deploy/__init__.py +++ b/nemo/collections/llm/deploy/__init__.py @@ -1,3 +1,3 @@ -from nemo.collections.llm.deploy.base import unset_environment_variables, get_trtllm_deployable +from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables __all__ = ["unset_environment_variables", "get_trtllm_deployable"] diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py index 46ce57152f54..e21198f5884b 100644 --- a/nemo/collections/llm/deploy/base.py +++ b/nemo/collections/llm/deploy/base.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.utils import logging -import subprocess import os +import subprocess from pathlib import Path +from nemo.utils import logging + + def unset_environment_variables() -> None: """ SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work @@ -61,7 +63,7 @@ def get_trtllm_deployable( max_output_len, max_batch_size, dtype, - output_generation_logits + output_generation_logits, ): """ Exports the nemo checkpoint to trtllm and returns trt_llm_exporter that is used to deploy on PyTriton. @@ -107,9 +109,9 @@ def get_trtllm_deployable( max_output_len=max_output_len, max_batch_size=max_batch_size, dtype=dtype, - gather_generation_logits=output_generation_logits + gather_generation_logits=output_generation_logits, ) except Exception as error: raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) - return trt_llm_exporter \ No newline at end of file + return trt_llm_exporter diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py index bca7c6251588..3012689bb8da 100644 --- a/nemo/collections/llm/evaluation/__init__.py +++ b/nemo/collections/llm/evaluation/__init__.py @@ -1,3 +1,3 @@ from nemo.collections.llm.evaluation.base import NeMoFWLMEval, wait_for_rest_service -__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] \ No newline at end of file +__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index f9dc3debb298..f43e9328cf65 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -13,18 +13,18 @@ # limitations under the License. import time -import requests -from requests.exceptions import RequestException +import requests import torch import torch.nn.functional as F +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from requests.exceptions import RequestException from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.utils import logging -from lm_eval.api.instance import Instance -from lm_eval.api.model import LM class NeMoFWLMEval(LM): """ @@ -32,6 +32,7 @@ class NeMoFWLMEval(LM): NeMo model deployed on PyTriton server. Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): self.model_name = model_name self.api_url = api_url @@ -73,8 +74,10 @@ def tokenizer_type(self, tokenizer): elif isinstance(tokenizer, SentencePieceTokenizer): return "SentencePieceTokenizer" else: - raise ValueError("Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " - "how to handle special tokens for this tokenizer") + raise ValueError( + "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " + "how to handle special tokens for this tokenizer" + ) def loglikelihood(self, requests: list[Instance]): """ @@ -97,11 +100,12 @@ def loglikelihood(self, requests: list[Instance]): # get encoded tokens of continuation continuation_enc = self.tokenizer.tokenizer.encode(continuation, **special_tokens_kwargs) # for SentencePeice consider the encoded tokens from the 2nd token since first encoded token is space. 
- if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": continuation_enc = continuation_enc[1:] + if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + continuation_enc = continuation_enc[1:] num_cont_tokens = len(continuation_enc) # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request self.max_tokens_to_generate = num_cont_tokens - # Create payload to query the model deployed on PyTriton server + # Create payload to query the model deployed on PyTriton server payload = { "model": self.model_name, "prompt": context, @@ -121,9 +125,7 @@ def loglikelihood(self, requests: list[Instance]): # Check if all greedy_tokens match the the actual continuation tokens is_greedy = (greedy_tokens == cont_toks).all() # Get the logits corresponding to the actual continuation tokens - logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( - -1 - ) + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # result is tuple of logProb of generating the continuation token and is_greedy result = (float(logits.sum()), bool(is_greedy)) @@ -147,7 +149,7 @@ def generate_until(self, inputs: list[Instance]): for instance in inputs: # Access the 'arguments' attribute of the Instance which contains the input prompt string prompt = instance.arguments[0] - # Create payload to query the model deployed on PyTriton server + # Create payload to query the model deployed on PyTriton server payload = { "model": self.model_name, "prompt": prompt, @@ -170,7 +172,7 @@ def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): Args: rest_url (str): URL of the REST service's health endpoint - max_retries (int): Maximum number of retry attempts. Defaul: 60. + max_retries (int): Maximum number of retry attempts. Defaul: 60. retry_interval (int): Time to wait between retries in seconds. Default: 2. Returns: @@ -204,4 +206,4 @@ def check_service(url): time.sleep(retry_interval) logging.info("Timeout: REST service did not become ready.") - return False \ No newline at end of file + return False diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 4c55cf3b2c15..e1d21bb54b76 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -174,7 +174,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, - output_generation_logits: bool = False + output_generation_logits: bool = False, ): """ Query the Triton server synchronously and return a list of responses. 
@@ -273,11 +273,11 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - "choices": [{"text": str(sentences) - }] + "choices": [{"text": str(sentences)}], } # Convert gneration logits to a list to make it json serializable and add it to openai_response dict - if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() + if output_generation_logits: + openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() return openai_response else: return sentences diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 21c68f18580d..2a65b19c0d50 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -118,7 +118,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - output_generation_logits=triton_settings.output_generation_logits + output_generation_logits=triton_settings.output_generation_logits, ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index e16275b2208d..a1e6cb0e03c4 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -181,7 +181,7 @@ def export( fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, gather_context_logits: Optional[bool] = False, - gather_generation_logits: Optional[bool] = False + gather_generation_logits: Optional[bool] = False, ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -500,7 +500,7 @@ def get_transformer_config(nemo_model_config): gpt_attention_plugin=gpt_attention_plugin, gemm_plugin=gemm_plugin, gather_context_logits=gather_context_logits, - gather_generation_logits=gather_generation_logits + gather_generation_logits=gather_generation_logits, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -877,8 +877,10 @@ def get_triton_input(self): @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), - Tensor(name="generation_logits", shape=(-1,), dtype=np.single)) + outputs = ( + Tensor(name="outputs", shape=(-1,), dtype=bytes), + Tensor(name="generation_logits", shape=(-1,), dtype=np.single), + ) return outputs @batch diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 88767917301e..38fb80ca3272 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -55,7 +55,7 @@ def build_and_save_engine( gemm_plugin: str = "auto", reduce_fusion: bool = False, gather_context_logits: bool = False, - gather_generation_logits: bool = False + gather_generation_logits: bool = False, ): architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index b5da06dfbf53..5f6e722ef89b 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -116,6 +116,7 @@ def _apply_user_funcs(self, trainer: pl.Trainer, pl_module: pl.LightningModule) Iterate over model parameters, find gradient tensor, apply and collect outputs of param_fn and grad_fn, and log outputs in a table. 
""" + def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: """If using MCore optimizer, search the grad buckets for param's grad tensor.""" if not isinstance(pl_module.optim, MegatronOptimizerModule): From 5f03ceef0c9d0f0aab7c1fbbfb2134b9b389d64b Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Thu, 14 Nov 2024 22:41:53 -0800 Subject: [PATCH 19/21] Avoid multiple calls for tokenizer_type Co-authored-by: Ananth Subramaniam Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- nemo/collections/llm/evaluation/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index f43e9328cf65..b1734d6f4d43 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -86,9 +86,10 @@ def loglikelihood(self, requests: list[Instance]): loglikelihood) and other relevant args like few shot samples. """ special_tokens_kwargs = {} - if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + tokenizer_type = self.tokenizer_type(self.tokenizer) + if tokenizer_type == "SentencePieceTokenizer": special_tokens_kwargs['add_bos'] = self.add_bos - elif self.tokenizer_type(self.tokenizer) == "AutoTokenizer": + elif tokenizer_type == "AutoTokenizer": special_tokens_kwargs['add_special_tokens'] = self.add_bos results = [] From 88842a18dfd02722d4928d365140cdb602f06cfc Mon Sep 17 00:00:00 2001 From: Abhishree Date: Thu, 14 Nov 2024 23:05:21 -0800 Subject: [PATCH 20/21] Replace print statements with logging statements Signed-off-by: Abhishree --- nemo/deploy/service/rest_model_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 2a65b19c0d50..c2f5b394fb3b 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -17,7 +17,7 @@ from pydantic_settings import BaseSettings from nemo.deploy.nlp import NemoQueryLLM - +from nemo.utils import logging class TritonSettings(BaseSettings): _triton_service_port: int @@ -33,7 +33,7 @@ def __init__(self): self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' self._output_generation_logits = os.environ.get('OUTPUT_GENERATION_LOGITS', 'False').lower() == 'true' except Exception as error: - print("An exception occurred:", error) + logging.error("An exception occurred trying to retrieve set args in TritonSettings class. 
Error:", error) return @property @@ -93,7 +93,7 @@ async def check_triton_health(): triton_url = ( f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" ) - print(f"Attempting to connect to Triton server at: {triton_url}") + logging.info(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) if response.status_code == 200: @@ -118,7 +118,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - output_generation_logits=triton_settings.output_generation_logits, + output_generation_logits=triton_settings.output_generation_logits ) if triton_settings.openai_format_response: return output @@ -127,5 +127,5 @@ def completions_v1(request: CompletionRequest): "output": output[0][0], } except Exception as error: - print("An exception occurred:", error) + logging.error("An exception occurred with the post request to /v1/completions/ endpoint:", error) return {"error": "An exception occurred"} From a3adb69009eaaf2145ecf248317a880ff276d50d Mon Sep 17 00:00:00 2001 From: athitten Date: Fri, 15 Nov 2024 07:06:54 +0000 Subject: [PATCH 21/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/deploy/service/rest_model_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index c2f5b394fb3b..64afea167295 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -19,6 +19,7 @@ from nemo.deploy.nlp import NemoQueryLLM from nemo.utils import logging + class TritonSettings(BaseSettings): _triton_service_port: int _triton_service_ip: str @@ -118,7 +119,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - output_generation_logits=triton_settings.output_generation_logits + output_generation_logits=triton_settings.output_generation_logits, ) if triton_settings.openai_format_response: return output
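Taken together, the patches in this series are intended to enable a deploy-then-evaluate workflow through `nemo/collections/llm/api.py`. The sketch below illustrates that flow; the checkpoint path, the evaluation limit, and running the two calls in separate processes are assumptions made for illustration rather than requirements stated by the patches:

```python
# Illustrative sketch of the deploy-then-evaluate workflow; values are placeholders.
from nemo.collections.llm.api import deploy, evaluate

# Process 1: export the NeMo 2.0 checkpoint to TensorRT-LLM, serve it on PyTriton,
# and start the REST service on port 8080 (this call keeps serving until stopped).
deploy(
    nemo_checkpoint="/path/to/nemo2/ckpt",
    model_type="llama",
    triton_model_name="triton_model",
    rest_service_port=8080,
    openai_format_response=True,
    output_generation_logits=True,
)

# Process 2: run lm-evaluation-harness against the deployed model via the REST endpoint.
evaluate(
    nemo_checkpoint_path="/path/to/nemo2/ckpt",  # used to load the tokenizer
    url="http://0.0.0.0:8080/v1",
    model_name="triton_model",  # must match triton_model_name passed to deploy()
    eval_task="gsm8k",
    limit=100,  # evaluate on 100 samples only
)
```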