From 43a104a9e8757716cc0c81e21383f875a9e73ad2 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 10:31:18 -0800
Subject: [PATCH 1/9] Fix llm.deploy api

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index a27829412fe3..c508dc5b9f87 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -377,10 +377,10 @@ def deploy(
         output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True.
             generation_logits are used to compute the logProb of the output token. Default: True.
     """
-    from nemo.collections.llm import deploy
+    from nemo.collections.llm.deploy.base import unset_environment_variables
     from nemo.deploy import DeployPyTriton
 
-    deploy.unset_environment_variables()
+    unset_environment_variables()
     if start_rest_service:
         if triton_port == rest_service_port:
             logging.error("REST service port and Triton server port cannot use the same port.")
             return

From a3536bb1246e99745018bdc2104be5a5f16f59c8 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 10:47:57 -0800
Subject: [PATCH 2/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index c508dc5b9f87..fbba43f87d3d 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -377,7 +377,7 @@ def deploy(
         output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True.
             generation_logits are used to compute the logProb of the output token. Default: True.
     """
-    from nemo.collections.llm.deploy.base import unset_environment_variables
+    from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables
     from nemo.deploy import DeployPyTriton
 
     unset_environment_variables()
@@ -392,7 +392,7 @@ def deploy(
         os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response)
         os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits)
 
-    triton_deployable = deploy.get_trtllm_deployable(
+    triton_deployable = get_trtllm_deployable(
         nemo_checkpoint,
         model_type,
         triton_model_repository,
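
Reviewer note: patches 1 and 2 above replace "from nemo.collections.llm import deploy",
which resolves to the deploy() entrypoint exported by the package rather than the
deploy submodule, with direct imports of unset_environment_variables and
get_trtllm_deployable from nemo.collections.llm.deploy.base. A minimal sketch of
calling the fixed entrypoint, using only parameters that appear in these diffs;
the checkpoint path is a placeholder, not taken from the PR:

    from pathlib import Path

    from nemo.collections import llm

    llm.deploy(
        nemo_checkpoint=Path("/checkpoints/llama"),  # placeholder path
        model_type="llama",                          # default visible in patch 3's signature
        triton_model_name="triton_model",
        triton_port=8000,
    )
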
From 13cb66925b5bdbddc476ad1a7b17cdbf8be5fb75 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 15:52:22 -0800
Subject: [PATCH 3/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index fbba43f87d3d..9768f909c0df 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -325,7 +325,7 @@ def ptq(
 def deploy(
     nemo_checkpoint: Path = None,
     model_type: str = "llama",
-    triton_model_name: str = 'triton_model',
+    triton_model_name: str = "triton_model",
     triton_model_version: Optional[int] = 1,
     triton_port: int = 8000,
     triton_http_address: str = "0.0.0.0",
@@ -386,11 +386,11 @@ def deploy(
             logging.error("REST service port and Triton server port cannot use the same port.")
             return
         # Store triton ip, port and other args relevant for REST API as env vars to be accessible by rest_model_api.py
-        os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address
-        os.environ['TRITON_PORT'] = str(triton_port)
-        os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout)
-        os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response)
-        os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits)
+        os.environ["TRITON_HTTP_ADDRESS"] = triton_http_address
+        os.environ["TRITON_PORT"] = str(triton_port)
+        os.environ["TRITON_REQUEST_TIMEOUT"] = str(triton_request_timeout)
+        os.environ["OPENAI_FORMAT_RESPONSE"] = str(openai_format_response)
+        os.environ["OUTPUT_GENERATION_LOGITS"] = str(output_generation_logits)
 
     triton_deployable = get_trtllm_deployable(
         nemo_checkpoint,
@@ -513,7 +513,7 @@ def evaluate(
     from nemo.collections.llm import evaluation
 
     # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt.
-    tokenizer = io.load_context(nemo_checkpoint_path + '/context', subpath="model").tokenizer
+    tokenizer = io.load_context(nemo_checkpoint_path + "/context", subpath="model.tokenizer")
     # Wait for rest service to be ready before starting evaluation
     evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health")
     # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate
@@ -524,7 +524,7 @@ def evaluate(
         model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters
     )
 
-    print("score", results['results'][eval_task])
+    print("score", results["results"][eval_task])
 
 
 @run.cli.entrypoint(name="import", namespace="llm")

From 19e323e99fee89468f61baadefe1ce4d3875f677 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 16:11:23 -0800
Subject: [PATCH 4/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/evaluation/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py
index b1734d6f4d43..ea598a0e5c47 100644
--- a/nemo/collections/llm/evaluation/base.py
+++ b/nemo/collections/llm/evaluation/base.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import time
+import warnings
+from logging import warning
 
 import requests
 import torch
@@ -74,7 +76,7 @@ def tokenizer_type(self, tokenizer):
         elif isinstance(tokenizer, SentencePieceTokenizer):
             return "SentencePieceTokenizer"
         else:
-            raise ValueError(
+            warnings.warn(
                 "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check "
                 "how to handle special tokens for this tokenizer"
             )

From c152915542c30f4d598e83b1ae026ebfb2c6d5a8 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 18:14:08 -0800
Subject: [PATCH 5/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/evaluation/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py
index ea598a0e5c47..be9e118d7ef5 100644
--- a/nemo/collections/llm/evaluation/base.py
+++ b/nemo/collections/llm/evaluation/base.py
@@ -169,7 +169,7 @@ def generate_until(self, inputs: list[Instance]):
         return results
 
 
-def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2):
+def wait_for_rest_service(rest_url, max_retries=600, retry_interval=2):
     """
     Wait for REST service to be ready.
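
Reviewer note: patch 5 raises the health-check budget of wait_for_rest_service
from 60 to 600 retries, i.e. up to 20 minutes at the default 2-second interval,
presumably to accommodate deployments that are slow to come up. The function
body is not part of this diff, so the following is only a sketch of what such a
polling helper typically looks like, matching the signature shown above, not
the real implementation in evaluation/base.py:

    import time

    import requests


    def wait_for_rest_service(rest_url, max_retries=600, retry_interval=2):
        """Poll rest_url until it answers with HTTP 200 or retries run out."""
        for _ in range(max_retries):
            try:
                if requests.get(rest_url, timeout=retry_interval).status_code == 200:
                    return True
            except requests.RequestException:
                pass  # service not reachable yet; sleep and try again
            time.sleep(retry_interval)
        return False
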
From 810c2a887a734f917f89701c1a2aa42b05ce6e23 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Sun, 17 Nov 2024 20:56:03 -0800
Subject: [PATCH 6/9] fix

Signed-off-by: Hemil Desai
---
 nemo/collections/llm/api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 9768f909c0df..9c6d2d51c030 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -521,7 +521,7 @@ def evaluate(
         model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos
     )
     results = evaluator.simple_evaluate(
-        model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters
+        model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, batch_size=8
     )
 
     print("score", results["results"][eval_task])

From 4abdd38981ee05a80de5194631a2abfcee9313b1 Mon Sep 17 00:00:00 2001
From: hemildesai
Date: Thu, 21 Nov 2024 21:17:56 +0000
Subject: [PATCH 7/9] Apply isort and black reformatting

Signed-off-by: hemildesai
---
 nemo/collections/llm/api.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 9c6d2d51c030..5645ba145c18 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -521,7 +521,12 @@ def evaluate(
         model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos
     )
     results = evaluator.simple_evaluate(
-        model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, batch_size=8
+        model=model,
+        tasks=eval_task,
+        limit=limit,
+        num_fewshot=num_fewshot,
+        bootstrap_iters=bootstrap_iters,
+        batch_size=8,
     )
 
     print("score", results["results"][eval_task])
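
Reviewer note: patch 8 below reverts patch 4 after PR feedback, restoring the
ValueError that patch 4 had downgraded to a warning and dropping the
then-unneeded warnings imports. Failing fast is arguably the safer behavior: a
warning would let an unsupported tokenizer proceed into evaluation and fail
later in a less obvious place. For orientation, a sketch of the dispatch as it
ends up after patch 8, written as a free function for brevity; the
AutoTokenizer branch and both import paths sit outside the visible diff
context and are assumptions:

    # Both import paths are assumptions; only the SentencePieceTokenizer branch
    # and the else clause are visible in the diff context.
    from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
    from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer


    def tokenizer_type(tokenizer):
        # Map a tokenizer instance to the label used elsewhere in evaluation.
        if isinstance(tokenizer, AutoTokenizer):
            return "AutoTokenizer"
        elif isinstance(tokenizer, SentencePieceTokenizer):
            return "SentencePieceTokenizer"
        else:
            raise ValueError(
                "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check "
                "how to handle special tokens for this tokenizer"
            )
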
Please check " "how to handle special tokens for this tokenizer" ) From 10ed6deeb15ca8a51595a417766e3700744acc5c Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Wed, 11 Dec 2024 11:29:18 -0800 Subject: [PATCH 9/9] fix Signed-off-by: Hemil Desai --- nemo/collections/llm/api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 5645ba145c18..d030eb88863c 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -41,7 +41,6 @@ from nemo.utils import logging from nemo.utils.get_rank import is_global_rank_zero - if TYPE_CHECKING: from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.inference_request import InferenceRequest @@ -526,7 +525,6 @@ def evaluate( limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters, - batch_size=8, ) print("score", results["results"][eval_task])