From 43a104a9e8757716cc0c81e21383f875a9e73ad2 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 10:31:18 -0800
Subject: [PATCH 1/9] Fix llm.deploy api

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index a27829412fe3..c508dc5b9f87 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -377,10 +377,10 @@ def deploy(
         output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True.
             generation_logits are used to compute the logProb of the output token. Default: True.
     """
-    from nemo.collections.llm import deploy
+    from nemo.collections.llm.deploy.base import unset_environment_variables
     from nemo.deploy import DeployPyTriton
 
-    deploy.unset_environment_variables()
+    unset_environment_variables()
     if start_rest_service:
         if triton_port == rest_service_port:
             logging.error("REST service port and Triton server port cannot use the same port.")
             return

From a3536bb1246e99745018bdc2104be5a5f16f59c8 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 10:47:57 -0800
Subject: [PATCH 2/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index c508dc5b9f87..fbba43f87d3d 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -377,7 +377,7 @@ def deploy(
         output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True.
             generation_logits are used to compute the logProb of the output token. Default: True.
     """
-    from nemo.collections.llm.deploy.base import unset_environment_variables
+    from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables
     from nemo.deploy import DeployPyTriton
 
     unset_environment_variables()
@@ -392,7 +392,7 @@ def deploy(
         os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response)
         os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits)
 
-    triton_deployable = deploy.get_trtllm_deployable(
+    triton_deployable = get_trtllm_deployable(
         nemo_checkpoint,
         model_type,
         triton_model_repository,
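
Reviewer note: patches 1 and 2 above replace "from nemo.collections.llm import deploy",
which resolves to the deploy() entrypoint exported by the package rather than the
deploy submodule, with direct imports of unset_environment_variables and
get_trtllm_deployable from nemo.collections.llm.deploy.base. A minimal sketch of
calling the fixed entrypoint, using only parameters that appear in these diffs;
the checkpoint path is a placeholder, not taken from the PR:

    from pathlib import Path

    from nemo.collections import llm

    llm.deploy(
        nemo_checkpoint=Path("/checkpoints/llama"),  # placeholder path
        model_type="llama",                          # default visible in patch 3's signature
        triton_model_name="triton_model",
        triton_port=8000,
    )
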
From 13cb66925b5bdbddc476ad1a7b17cdbf8be5fb75 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 15:52:22 -0800
Subject: [PATCH 3/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index fbba43f87d3d..9768f909c0df 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -325,7 +325,7 @@ def ptq(
 def deploy(
     nemo_checkpoint: Path = None,
     model_type: str = "llama",
-    triton_model_name: str = 'triton_model',
+    triton_model_name: str = "triton_model",
     triton_model_version: Optional[int] = 1,
     triton_port: int = 8000,
     triton_http_address: str = "0.0.0.0",
@@ -386,11 +386,11 @@ def deploy(
             logging.error("REST service port and Triton server port cannot use the same port.")
             return
         # Store triton ip, port and other args relevant for REST API as env vars to be accessible by rest_model_api.py
-        os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address
-        os.environ['TRITON_PORT'] = str(triton_port)
-        os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout)
-        os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response)
-        os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits)
+        os.environ["TRITON_HTTP_ADDRESS"] = triton_http_address
+        os.environ["TRITON_PORT"] = str(triton_port)
+        os.environ["TRITON_REQUEST_TIMEOUT"] = str(triton_request_timeout)
+        os.environ["OPENAI_FORMAT_RESPONSE"] = str(openai_format_response)
+        os.environ["OUTPUT_GENERATION_LOGITS"] = str(output_generation_logits)
 
     triton_deployable = get_trtllm_deployable(
         nemo_checkpoint,
@@ -513,7 +513,7 @@ def evaluate(
     from nemo.collections.llm import evaluation
 
     # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt.
-    tokenizer = io.load_context(nemo_checkpoint_path + '/context', subpath="model").tokenizer
+    tokenizer = io.load_context(nemo_checkpoint_path + "/context", subpath="model.tokenizer")
     # Wait for rest service to be ready before starting evaluation
     evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health")
     # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate
@@ -524,7 +524,7 @@ def evaluate(
         model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters
     )
 
-    print("score", results['results'][eval_task])
+    print("score", results["results"][eval_task])
 
 
 @run.cli.entrypoint(name="import", namespace="llm")

From 19e323e99fee89468f61baadefe1ce4d3875f677 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 16:11:23 -0800
Subject: [PATCH 4/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/evaluation/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py
index b1734d6f4d43..ea598a0e5c47 100644
--- a/nemo/collections/llm/evaluation/base.py
+++ b/nemo/collections/llm/evaluation/base.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import time
+import warnings
+from logging import warning
 
 import requests
 import torch
@@ -74,7 +76,7 @@ def tokenizer_type(self, tokenizer):
         elif isinstance(tokenizer, SentencePieceTokenizer):
             return "SentencePieceTokenizer"
         else:
-            raise ValueError(
+            warnings.warn(
                 "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check "
                 "how to handle special tokens for this tokenizer"
             )

From c152915542c30f4d598e83b1ae026ebfb2c6d5a8 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 18:14:08 -0800
Subject: [PATCH 5/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/evaluation/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py
index ea598a0e5c47..be9e118d7ef5 100644
--- a/nemo/collections/llm/evaluation/base.py
+++ b/nemo/collections/llm/evaluation/base.py
@@ -169,7 +169,7 @@ def generate_until(self, inputs: list[Instance]):
         return results
 
 
-def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2):
+def wait_for_rest_service(rest_url, max_retries=600, retry_interval=2):
     """
     Wait for REST service to be ready.
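
Reviewer note: patch 5 raises the health-check budget of wait_for_rest_service
from 60 to 600 retries, i.e. up to 20 minutes at the default 2-second interval,
presumably to accommodate deployments that are slow to come up. The function
body is not part of this diff, so the following is only a sketch of what such a
polling helper typically looks like, matching the signature shown above, not
the real implementation in evaluation/base.py:

    import time

    import requests


    def wait_for_rest_service(rest_url, max_retries=600, retry_interval=2):
        """Poll rest_url until it answers with HTTP 200 or retries run out."""
        for _ in range(max_retries):
            try:
                if requests.get(rest_url, timeout=retry_interval).status_code == 200:
                    return True
            except requests.RequestException:
                pass  # service not reachable yet; sleep and try again
            time.sleep(retry_interval)
        return False
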
From 810c2a887a734f917f89701c1a2aa42b05ce6e23 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 20:56:03 -0800
Subject: [PATCH 6/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 9768f909c0df..9c6d2d51c030 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -521,7 +521,7 @@ def evaluate(
         model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos
     )
     results = evaluator.simple_evaluate(
-        model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters
+        model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, batch_size=8
     )
 
     print("score", results["results"][eval_task])

From 4abdd38981ee05a80de5194631a2abfcee9313b1 Mon Sep 17 00:00:00 2001
From: hemildesai
Date: Thu, 21 Nov 2024 21:17:56 +0000
Subject: [PATCH 7/9] Apply isort and black reformatting

Signed-off-by: hemildesai
---
 nemo/collections/llm/api.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 9c6d2d51c030..5645ba145c18 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -521,7 +521,12 @@ def evaluate(
         model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos
     )
     results = evaluator.simple_evaluate(
-        model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, batch_size=8
+        model=model,
+        tasks=eval_task,
+        limit=limit,
+        num_fewshot=num_fewshot,
+        bootstrap_iters=bootstrap_iters,
+        batch_size=8,
     )
 
     print("score", results["results"][eval_task])
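
Reviewer note: patch 8 below reverts patch 4 after PR feedback, restoring the
ValueError that patch 4 had downgraded to a warning and dropping the
then-unneeded warnings imports. Failing fast is arguably the safer behavior: a
warning would let an unsupported tokenizer proceed into evaluation and fail
later in a less obvious place. For orientation, a sketch of the dispatch as it
ends up after patch 8, written as a free function for brevity; the
AutoTokenizer branch and both import paths sit outside the visible diff
context and are assumptions:

    # Both import paths are assumptions; only the SentencePieceTokenizer branch
    # and the else clause are visible in the diff context.
    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer


    def tokenizer_type(tokenizer):
        # Map a tokenizer instance to the label used elsewhere in evaluation.
        if isinstance(tokenizer, AutoTokenizer):
            return "AutoTokenizer"
        elif isinstance(tokenizer, SentencePieceTokenizer):
            return "SentencePieceTokenizer"
        else:
            raise ValueError(
                "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check "
                "how to handle special tokens for this tokenizer"
            )
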
Please check " "how to handle special tokens for this tokenizer" ) From 10ed6deeb15ca8a51595a417766e3700744acc5c Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Wed, 11 Dec 2024 11:29:18 -0800 Subject: [PATCH 9/9] fix Signed-off-by: Hemil Desai --- nemo/collections/llm/api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 5645ba145c18..d030eb88863c 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -41,7 +41,6 @@ from nemo.utils import logging from nemo.utils.get_rank import is_global_rank_zero - if TYPE_CHECKING: from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.inference_request import InferenceRequest @@ -526,7 +525,6 @@ def evaluate( limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, - batch_size=8, ) print("score", results["results"][eval_task])