From f06c51b2f9a4c823c6ef5a080e1bee06514b3ed6 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 25 Sep 2024 11:27:32 -0700 Subject: [PATCH 01/21] Add evaluate method and other minor fixes Signed-off-by: Abhishree --- nemo/collections/llm/__init__.py | 7 ++ nemo/collections/llm/api.py | 117 +++++++++++++++++++++++--- nemo/deploy/service/rest_model_api.py | 7 +- scripts/deploy/nlp/deploy_triton.py | 3 +- 4 files changed, 119 insertions(+), 15 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 2051f844d888..a67d24f5bbc8 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -231,3 +231,10 @@ __all__.append("deploy") except ImportError as error: logging.warning(f"The deploy module could not be imported: {error}") + +try: + from nemo.collections.llm.api import evaluate + + __all__.append("evaluate") +except ImportError as error: + logging.warning(f"The evaluate module could not be imported: {error}") diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index fdceff5d959e..7f1bdf94c876 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -333,7 +333,7 @@ def deploy( model_type: str = "llama", triton_model_name: str = "xxx", triton_model_version: Optional[int] = 1, - triton_port: int = 8080, + triton_port: int = 8000, triton_http_address: str = "0.0.0.0", triton_request_timeout: int = 60, triton_model_repository: Path = None, @@ -348,6 +348,7 @@ def deploy( rest_service_http_address: str = "0.0.0.0", rest_service_port: int = 8000, openai_format_response: bool = False, + ckpt_type: str = "nemo" ): from nemo.deploy import DeployPyTriton @@ -358,18 +359,28 @@ def deploy( # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) - triton_deployable = get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, - ) + #TODO: directly support deploy of trtllm engine wo exporting to TRTLLM + if ckpt_type == "trtllm": + triton_deployable = get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + ) + elif ckpt_type == "nemo": + if nemo_checkpoint is None: + raise ValueError("In-Framework deployment requires a .nemo checkpoint") + try: + from nemo.deploy.nlp import MegatronLLMDeployable + except Exception as e: + raise ValueError("MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}") + triton_deployable = MegatronLLMDeployable(nemo_checkpoint, num_gpus) try: nm = DeployPyTriton( @@ -383,6 +394,7 @@ def deploy( logging.info("Triton deploy function will be called.") nm.deploy() + nm.run() except Exception as error: logging.error("Error message has occurred during deploy function. 
Error message: " + str(error)) return @@ -415,6 +427,85 @@ def deploy( logging.info("Model serving will be stopped.") nm.stop() +def evaluate( + url: str = "http://0.0.0.0:1234/v1", + model_name: str = "xxxx", + eval_task: str = "gsm8k", + num_fewshot: Optional[int] = None, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + ): + + from lm_eval import tasks, evaluator + from lm_eval.api.model import LM + import requests + class CustomModel(LM): + def __init__(self, model_name, api_url): + self.model_name = model_name + self.api_url = api_url + + super().__init__() + + def loglikelihood(self, requests): + # Implement log likelihood calculation logic here + pass + + def loglikelihood_rolling(self, requests): + # Implement log likelihood calculation logic here + pass + + def generate_until(self, inputs): + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance + prompt = instance.arguments[0] # This should be the prompt string + + # Extract other parameters from the 'arguments' or 'doc' as needed + max_tokens = 50 # Set a default or extract from instance if available + #temperature = instance.arguments[1].get('temperature', 1.0) + # top_p = instance.arguments[1].get('top_p', 1.0) + # top_k = instance.arguments[1].get('top_k', 0) + temperature = 1.0 + top_p = 0 + top_k = 1.0 + + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": top_p, + "n": top_k + } + + response = requests.post(f"{self.api_url}/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") + + # Assuming the response is in OpenAI format + generated_text = response_data['choices'][0]['text'] + results.append(generated_text) + + return results + model = CustomModel(model_name, url) + #task = tasks.get_task_dict(eval_task) + # Run evaluation + # results = evaluator.evaluate( + # lm=model, + # limit=1, + # task_dict=task + # ) + results = evaluator.simple_evaluate( + model=model, + tasks=eval_task, + limit=limit, + num_fewshot=num_fewshot, + bootstrap_iters=bootstrap_iters + ) + + print("--score---",results['results']['gsm8k']) @run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index fbc774883faa..6342da7e185a 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -76,13 +76,18 @@ class CompletionRequest(BaseModel): frequency_penalty: float = 1.0 +@app.get("/hello") +def root(): + return {"message": "Hello World"} + @app.get("/triton_health") async def check_triton_health(): """ This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. Verify by running: curl http://service_http_address:service_port/triton_health and the returned status should inform if the server is accessible. 
""" - triton_url = f"triton_settings.triton_service_ip:str(triton_settings.triton_service_port)/v2/health/ready" + triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + print(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) if response.status_code == 200: diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index e3394726fa1c..154ffc90dc9c 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -419,13 +419,14 @@ def nemo_deploy(argv): LOGGER.info("Triton deploy function will be called.") nm.deploy() + nm.run() except Exception as error: LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) return try: LOGGER.info("Model serving on Triton is will be started.") - if args.start_rest_service == "True": + if args.start_rest_service: try: LOGGER.info("REST service will be started.") uvicorn.run( From 25302b4814ccd9c0541647792e85a3a1758652e1 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 25 Sep 2024 15:29:33 -0700 Subject: [PATCH 02/21] Add inference params to evaluate method Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 38 ++++++++++++--------------- nemo/deploy/service/rest_model_api.py | 4 +-- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 7f1bdf94c876..65e4f07828ef 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -434,16 +434,24 @@ def evaluate( num_fewshot: Optional[int] = None, limit: Optional[Union[int, float]] = None, bootstrap_iters: int = 100000, + # inference params + max_tokens_to_generate: Optional[int] = 256, + temperature: Optional[float] = None, + top_p: Optional[float] = 0.0, + top_k: Optional[int] = 1, ): from lm_eval import tasks, evaluator from lm_eval.api.model import LM import requests class CustomModel(LM): - def __init__(self, model_name, api_url): + def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url - + self.max_tokens_to_generate = max_tokens_to_generate + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k super().__init__() def loglikelihood(self, requests): @@ -460,22 +468,16 @@ def generate_until(self, inputs): # Access the 'arguments' attribute of the Instance prompt = instance.arguments[0] # This should be the prompt string - # Extract other parameters from the 'arguments' or 'doc' as needed - max_tokens = 50 # Set a default or extract from instance if available - #temperature = instance.arguments[1].get('temperature', 1.0) - # top_p = instance.arguments[1].get('top_p', 1.0) - # top_k = instance.arguments[1].get('top_k', 0) - temperature = 1.0 - top_p = 0 - top_k = 1.0 + # Extract default temperature from instance of the benchmark or use the uder defined value + temperature = instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature payload = { "model": self.model_name, "prompt": prompt, - "max_tokens": max_tokens, + "max_tokens": self.max_tokens_to_generate, "temperature": temperature, - "top_p": top_p, - "n": top_k + "top_p": self.top_p, + "top_k": self.top_k } response = requests.post(f"{self.api_url}/completions/", json=payload) @@ -489,14 +491,8 @@ def generate_until(self, inputs): results.append(generated_text) return results - model = 
CustomModel(model_name, url) - #task = tasks.get_task_dict(eval_task) - # Run evaluation - # results = evaluator.evaluate( - # lm=model, - # limit=1, - # task_dict=task - # ) + model = CustomModel(model_name, url, temperature, top_p, top_k) + results = evaluator.simple_evaluate( model=model, tasks=eval_task, diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 6342da7e185a..f7b470c00b34 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -70,7 +70,7 @@ class CompletionRequest(BaseModel): max_tokens: int = 512 temperature: float = 1.0 top_p: float = 0.0 - n: int = 1 + top_k: int = 1 stream: bool = False stop: str | None = None frequency_penalty: float = 1.0 @@ -106,7 +106,7 @@ def completions_v1(request: CompletionRequest): output = nq.query_llm( prompts=[request.prompt], max_output_len=request.max_tokens, - top_k=request.n, + top_k=request.top_k, top_p=request.top_p, temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, From 02e9edb7d5d713fd78da853690249a51b995c615 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 30 Sep 2024 13:54:16 -0700 Subject: [PATCH 03/21] Add wait_for_rest_service fn to evaluate method Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 47 +++++++- nemo/deploy/service/rest_model_api.py | 12 +- scripts/export/convert_nemo2_for_export.py | 123 --------------------- 3 files changed, 51 insertions(+), 131 deletions(-) delete mode 100644 scripts/export/convert_nemo2_for_export.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 65e4f07828ef..670b13bdc62e 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -443,7 +443,51 @@ def evaluate( from lm_eval import tasks, evaluator from lm_eval.api.model import LM + import time import requests + from requests.exceptions import RequestException + + def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2): + """ + Wait for REST service to be ready. + + Args: + rest_url (str): URL of the REST service's health endpoint + max_retries (int): Maximum number of retry attempts + retry_interval (int): Time to wait between retries in seconds + + Returns: + bool: True if rest service is ready, False otherwise + """ + for _ in range(max_retries): + rest_ready = check_service(rest_url) + + if rest_ready: + print("REST service is ready.") + return True + + print(f"REST Service not ready yet. Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + + print("Timeout: One or both services did not become ready.") + return False + + def check_service(url): + """ + Check if a service is ready by making a GET request to its health endpoint. 
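+        A 200 status code is treated as ready; a non-200 response or any connection error
+        (requests.exceptions.RequestException) is treated as not ready.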
+ + Args: + url (str): URL of the service's health endpoint + + Returns: + bool: True if the service is ready, False otherwise + """ + try: + response = requests.get(url, timeout=5) + return response.status_code == 200 + except RequestException: + return False + class CustomModel(LM): def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name @@ -491,8 +535,9 @@ def generate_until(self, inputs): results.append(generated_text) return results - model = CustomModel(model_name, url, temperature, top_p, top_k) + wait_for_rest_service(rest_url=f"{url}/health") + model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k) results = evaluator.simple_evaluate( model=model, tasks=eval_task, diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index f7b470c00b34..9b330088487f 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -20,7 +20,6 @@ from nemo.deploy.nlp import NemoQueryLLM - class TritonSettings(BaseSettings): _triton_service_port: int _triton_service_ip: str @@ -63,7 +62,6 @@ def openai_format_response(self): app = FastAPI() triton_settings = TritonSettings() - class CompletionRequest(BaseModel): model: str prompt: str @@ -76,15 +74,15 @@ class CompletionRequest(BaseModel): frequency_penalty: float = 1.0 -@app.get("/hello") -def root(): - return {"message": "Hello World"} +@app.get("/v1/health") +def health_check(): + return {"status": "ok"} -@app.get("/triton_health") +@app.get("/v1/triton_health") async def check_triton_health(): """ This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. - Verify by running: curl http://service_http_address:service_port/triton_health and the returned status should inform if the server is accessible. + Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible. """ triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" print(f"Attempting to connect to Triton server at: {triton_url}") diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py deleted file mode 100644 index 0703322cd854..000000000000 --- a/scripts/export/convert_nemo2_for_export.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert a NeMo 2.0 checkpoint to NeMo 1.0 for TRTLLM export. 
-Example to run this conversion script: -``` - python /opt/NeMo/scripts/scripts/export/convert_nemo2_for_export.py \ - --input_path /path/to/nemo2/ckpt \ - --output_path /path/to/output \ - --tokenizer_type huggingface \ - --tokenizer_name meta-llama/Meta-Llama-3.1-8B \ - --symbolic_link=True -``` -""" - -import os -import shutil -from argparse import ArgumentParser - -from omegaconf import OmegaConf - -from nemo.lightning import io - - -def get_args(): - parser = ArgumentParser() - parser.add_argument( - "--input_path", - type=str, - required=True, - help="Path to nemo 2.0 checkpoint", - ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Output path", - ) - parser.add_argument( - "--tokenizer_type", - type=str, - default="huggingface", - help="Type of tokenizer", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default="meta-llama/Meta-Llama-3.1-8B", - help="Name or path of tokenizer", - ) - parser.add_argument( - "--symbolic_link", - type=bool, - default=True, - help="Whether to use symbiloc link for model weights", - ) - - args = parser.parse_args() - return args - - -def main(args): - input_path = args.input_path - output_path = args.output_path - weight_path = os.path.join(output_path, "model_weights") - - if os.path.exists(output_path): - shutil.rmtree(output_path) - print(f"Remove existing {output_path}") - - os.makedirs(output_path, exist_ok=True) - - config = io.load_context(input_path, subpath="model.config") - - config_dict = {} - for k, v in config.__dict__.items(): - if isinstance(v, (float, int, str, bool)): - config_dict[k] = v - elif k == "activation_func": - config_dict["activation"] = v.__name__ - - if config_dict.get("num_moe_experts") is None: - config_dict["num_moe_experts"] = 0 - config_dict["moe_router_topk"] = 0 - if config_dict["activation"] == "silu": - config_dict["activation"] = "fast-swiglu" - - config_dict["mcore_gpt"] = True - config_dict["max_position_embeddings"] = config_dict.get("seq_length") - config_dict["tokenizer"] = { - "library": args.tokenizer_type, - "type": args.tokenizer_name, - "use_fast": True, - } - - yaml_config = OmegaConf.create(config_dict) - OmegaConf.save(config=yaml_config, f=os.path.join(output_path, "model_config.yaml")) - - if args.symbolic_link: - os.symlink(input_path, weight_path) - else: - os.makedirs(weight_path, exist_ok=True) - for file in os.listdir(input_path): - source_path = os.path.join(input_path, file) - target_path = os.path.join(weight_path, file) - shutil.copy(source_path, target_path) - - -if __name__ == "__main__": - args = get_args() - main(args) From 3ee94addb68cfb5907438ac6c7101cdb8ef6b76e Mon Sep 17 00:00:00 2001 From: athitten Date: Mon, 30 Sep 2024 20:56:08 +0000 Subject: [PATCH 04/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/collections/llm/api.py | 35 +++++++++++++++------------ nemo/deploy/service/rest_model_api.py | 7 +++++- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 670b13bdc62e..a9d3cfb8d2b6 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -348,7 +348,7 @@ def deploy( rest_service_http_address: str = "0.0.0.0", rest_service_port: int = 8000, openai_format_response: bool = False, - ckpt_type: str = "nemo" + ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton @@ -359,7 +359,7 @@ def deploy( # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py 
store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) - #TODO: directly support deploy of trtllm engine wo exporting to TRTLLM + # TODO: directly support deploy of trtllm engine wo exporting to TRTLLM if ckpt_type == "trtllm": triton_deployable = get_trtllm_deployable( nemo_checkpoint, @@ -379,7 +379,9 @@ def deploy( try: from nemo.deploy.nlp import MegatronLLMDeployable except Exception as e: - raise ValueError("MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}") + raise ValueError( + "MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}" + ) triton_deployable = MegatronLLMDeployable(nemo_checkpoint, num_gpus) try: @@ -427,8 +429,9 @@ def deploy( logging.info("Model serving will be stopped.") nm.stop() + def evaluate( - url: str = "http://0.0.0.0:1234/v1", + url: str = "http://0.0.0.0:1234/v1", model_name: str = "xxxx", eval_task: str = "gsm8k", num_fewshot: Optional[int] = None, @@ -439,12 +442,13 @@ def evaluate( temperature: Optional[float] = None, top_p: Optional[float] = 0.0, top_k: Optional[int] = 1, - ): +): - from lm_eval import tasks, evaluator - from lm_eval.api.model import LM import time + import requests + from lm_eval import evaluator, tasks + from lm_eval.api.model import LM from requests.exceptions import RequestException def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2): @@ -513,7 +517,9 @@ def generate_until(self, inputs): prompt = instance.arguments[0] # This should be the prompt string # Extract default temperature from instance of the benchmark or use the uder defined value - temperature = instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + temperature = ( + instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + ) payload = { "model": self.model_name, @@ -521,7 +527,7 @@ def generate_until(self, inputs): "max_tokens": self.max_tokens_to_generate, "temperature": temperature, "top_p": self.top_p, - "top_k": self.top_k + "top_k": self.top_k, } response = requests.post(f"{self.api_url}/completions/", json=payload) @@ -539,14 +545,11 @@ def generate_until(self, inputs): wait_for_rest_service(rest_url=f"{url}/health") model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k) results = evaluator.simple_evaluate( - model=model, - tasks=eval_task, - limit=limit, - num_fewshot=num_fewshot, - bootstrap_iters=bootstrap_iters - ) + model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters + ) + + print("--score---", results['results']['gsm8k']) - print("--score---",results['results']['gsm8k']) @run.cli.entrypoint(name="import", namespace="llm") def import_ckpt( diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 9b330088487f..a3b0594fb020 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -20,6 +20,7 @@ from nemo.deploy.nlp import NemoQueryLLM + class TritonSettings(BaseSettings): _triton_service_port: int _triton_service_ip: str @@ -62,6 +63,7 @@ def openai_format_response(self): app = FastAPI() triton_settings = TritonSettings() + class CompletionRequest(BaseModel): model: str prompt: str @@ -78,13 +80,16 @@ class CompletionRequest(BaseModel): def health_check(): return {"status": "ok"} + @app.get("/v1/triton_health") async def check_triton_health(): """ This method 
exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible. """ - triton_url = f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + triton_url = ( + f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" + ) print(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) From 7d79edc0bb66357369bba379cb80c4c00a834bd7 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 7 Oct 2024 22:23:28 -0700 Subject: [PATCH 05/21] Add logprobs to be returned by Pytriton for trtllm models Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 133 ++++++++++++++++++++---- nemo/deploy/nlp/query_llm.py | 4 +- nemo/deploy/service/rest_model_api.py | 1 + nemo/export/tensorrt_llm.py | 13 ++- nemo/export/trt_llm/tensorrt_llm_run.py | 2 + 5 files changed, 129 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index a9d3cfb8d2b6..0d353ecba3aa 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -447,7 +447,9 @@ def evaluate( import time import requests - from lm_eval import evaluator, tasks + from lm_eval import evaluator + ## This may change, how to deal with it ? In the past Instance class was in lm_eval.base + from lm_eval.api.instance import Instance from lm_eval.api.model import LM from requests.exceptions import RequestException @@ -493,6 +495,9 @@ def check_service(url): return False class CustomModel(LM): + """ + Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md + """ def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url @@ -502,42 +507,130 @@ def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top self.top_k = top_k super().__init__() - def loglikelihood(self, requests): - # Implement log likelihood calculation logic here - pass + def _generate_tokens_logprobs(self, payload, + return_text: bool = False, + return_logprobs: bool = False): + response = requests.post(f"{self.api_url}/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") - def loglikelihood_rolling(self, requests): - # Implement log likelihood calculation logic here - pass + # Assuming the response is in OpenAI format + if return_text: + return response_data['choices'][0]['text'] - def generate_until(self, inputs): + if return_logprobs: + return response_data['choices'][0]['log_probs'] + + def loglikelihood(self, requests: list[Instance]): + # log likelihood calculation logic here results = [] - for instance in inputs: + for request in requests: + context = request.arguments[0] + continuation = request.arguments[1] + full_text = context + continuation + instance = Instance( + request_type="loglikelihood", + #doc={'text': full_text}, + doc=request.doc, + arguments=(full_text,), + idx=0, + ) # Access the 'arguments' attribute of the Instance prompt = instance.arguments[0] # This should be the prompt string - # Extract default temperature from instance of the benchmark or use the uder defined value - temperature = ( - 
instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # Extract default temperature from instance of the benchmark or use the user defined value + # Does not work for MMLU since the input instance does not contain temp key + # temperature = ( + # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # ) + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + #"compute_logprob": True ##TODO Do we want to have this as an + # user defined value or set it to True by default ? + } + + log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) + + # Assuming log_probs is a list of log probabilities for each token + # TODO : why is log_prbs returned as list of list ? Change it to just a list maybe in query_llm ? + continuation_log_prob = sum(log_probs[0][0][-len(continuation):]) + results.append((continuation_log_prob, False)) + + return results + + def loglikelihood_rolling(self, requests: list[Instance]): + # log likelihood rolling calculation logic here + results = [] + for request in requests: + context = request.arguments[0] + continuation = request.arguments[1] + full_text = context + continuation + instance = Instance( + request_type="loglikelihood_rolling", + #doc={'text': full_text}, + doc=request.doc, + arguments=(full_text,), + idx=0, ) + # Access the 'arguments' attribute of the Instance + prompt = instance.arguments[0] # This should be the prompt string + # Extract default temperature from instance of the benchmark or use the user defined value + # Does not work for MMLU since the input instance does not contain temp key + # temperature = ( + # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # ) payload = { "model": self.model_name, "prompt": prompt, "max_tokens": self.max_tokens_to_generate, - "temperature": temperature, + "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, + #"compute_logprob": True ##TODO Do we want to have this as an + # user defined value or set it to True by default ? 
} - response = requests.post(f"{self.api_url}/completions/", json=payload) - response_data = response.json() + log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) + + # Assuming log_probs is a list of log probabilities for each token + continuation_log_probs = log_probs[0][0][-len(continuation):] + results.append((continuation_log_probs, False)) + + return results + + def generate_until(self, inputs: list[Instance]): + # `Instance` is a dataclass defined in [`lm_eval.api.instance`] https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/lm_eval/api/instance.py + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance + prompt = instance.arguments[0] # This should be the prompt string + + # Extract default temperature from instance of the benchmark or use the user defined value + # Does not work for MMLU since the input instance does not contain temp key + # temperature = ( + # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature + # ) + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + #"compute_logprob": True ##TODO Do we want to have this as an + # user defined value or set it to True by default ? + } - if 'error' in response_data: - raise Exception(f"API Error: {response_data['error']}") + generated_text = self._generate_tokens_logprobs(payload, return_text=True) - # Assuming the response is in OpenAI format - generated_text = response_data['choices'][0]['text'] results.append(generated_text) return results @@ -548,7 +641,7 @@ def generate_until(self, inputs): model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) - print("--score---", results['results']['gsm8k']) + print("--results---", results['results'][eval_task]) @run.cli.entrypoint(name="import", namespace="llm") diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 7e873db6b5b1..1f01e228bb8d 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -267,7 +267,9 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - "choices": [{"text": str(sentences)}], + #TODO if compute_logprobs is True then add log_probs + ## Convert log_probs to a list to make it json serializable + "choices": [{"text": str(sentences), "log_probs":result_dict["log_probs"].tolist()}] } return openai_response else: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index a3b0594fb020..fdd68ecb1000 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -114,6 +114,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, + compute_logprob=True ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 08b0b822cad4..5b298696e07d 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -862,12 +862,13 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), + 
Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True) ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),) + outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single)) return outputs @batch @@ -898,14 +899,20 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "lora_uids" in inputs: lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8") infer_input["lora_uids"] = lora_uids[0].tolist() + if "compute_logprob" in inputs: + infer_input["output_log_probs"] = inputs.pop("compute_logprob")[0][0] - output_texts = self.forward(**infer_input) + if infer_input["output_log_probs"]: + output_texts, log_probs = self.forward(**infer_input) + log_probs = np.array(log_probs.cpu().numpy()) + else: + output_texts = self.forward(**infer_input) output = cast_output(output_texts, np.bytes_) except Exception as error: err_msg = "An error occurred: {0}".format(str(error)) output = cast_output([err_msg], np.bytes_) - return {"outputs": output} + return {"outputs": output, "log_probs": log_probs} @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index bd7b8abd5f9e..110b7c0f1558 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -279,6 +279,7 @@ def _forward( streaming=streaming, output_sequence_lengths=True, return_dict=True, + output_log_probs=sampling_kwargs.get('output_log_probs', False), ) torch.cuda.synchronize() @@ -699,6 +700,7 @@ def generate( output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] input_lengths = [t.shape[0] for t in input_tensors] + log_probs = outputs['log_probs'] output_lines_list = [ tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]]) From 8563cc3545c9b263dd1aaa01a458da64afcf534b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 14 Oct 2024 16:45:27 -0700 Subject: [PATCH 06/21] Increase max_retries in wait_for_rest_service method Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 0d353ecba3aa..5f3759e79638 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -351,7 +351,6 @@ def deploy( ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton - if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -453,7 +452,7 @@ def evaluate( from lm_eval.api.model import LM from requests.exceptions import RequestException - def wait_for_rest_service(rest_url, max_retries=30, retry_interval=2): + def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): """ Wait for REST service to be ready. 
From a35d8d72f9ee7c280cbd06dd36bc8785dfd519b1 Mon Sep 17 00:00:00 2001 From: athitten Date: Tue, 15 Oct 2024 00:14:29 +0000 Subject: [PATCH 07/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/collections/llm/api.py | 21 +++++++++++---------- nemo/deploy/nlp/query_llm.py | 4 ++-- nemo/deploy/service/rest_model_api.py | 2 +- nemo/export/tensorrt_llm.py | 7 +++++-- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 5f3759e79638..6ddc4c9db403 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -351,6 +351,7 @@ def deploy( ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton + if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -447,6 +448,7 @@ def evaluate( import requests from lm_eval import evaluator + ## This may change, how to deal with it ? In the past Instance class was in lm_eval.base from lm_eval.api.instance import Instance from lm_eval.api.model import LM @@ -497,6 +499,7 @@ class CustomModel(LM): """ Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ + def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url @@ -506,9 +509,7 @@ def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top self.top_k = top_k super().__init__() - def _generate_tokens_logprobs(self, payload, - return_text: bool = False, - return_logprobs: bool = False): + def _generate_tokens_logprobs(self, payload, return_text: bool = False, return_logprobs: bool = False): response = requests.post(f"{self.api_url}/completions/", json=payload) response_data = response.json() @@ -531,7 +532,7 @@ def loglikelihood(self, requests: list[Instance]): full_text = context + continuation instance = Instance( request_type="loglikelihood", - #doc={'text': full_text}, + # doc={'text': full_text}, doc=request.doc, arguments=(full_text,), idx=0, @@ -551,7 +552,7 @@ def loglikelihood(self, requests: list[Instance]): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, - #"compute_logprob": True ##TODO Do we want to have this as an + # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? } @@ -559,7 +560,7 @@ def loglikelihood(self, requests: list[Instance]): # Assuming log_probs is a list of log probabilities for each token # TODO : why is log_prbs returned as list of list ? Change it to just a list maybe in query_llm ? - continuation_log_prob = sum(log_probs[0][0][-len(continuation):]) + continuation_log_prob = sum(log_probs[0][0][-len(continuation) :]) results.append((continuation_log_prob, False)) return results @@ -573,7 +574,7 @@ def loglikelihood_rolling(self, requests: list[Instance]): full_text = context + continuation instance = Instance( request_type="loglikelihood_rolling", - #doc={'text': full_text}, + # doc={'text': full_text}, doc=request.doc, arguments=(full_text,), idx=0, @@ -593,14 +594,14 @@ def loglikelihood_rolling(self, requests: list[Instance]): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, - #"compute_logprob": True ##TODO Do we want to have this as an + # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? 
} log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) # Assuming log_probs is a list of log probabilities for each token - continuation_log_probs = log_probs[0][0][-len(continuation):] + continuation_log_probs = log_probs[0][0][-len(continuation) :] results.append((continuation_log_probs, False)) return results @@ -624,7 +625,7 @@ def generate_until(self, inputs: list[Instance]): "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, - #"compute_logprob": True ##TODO Do we want to have this as an + # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? } diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 1f01e228bb8d..a96c0e3a25eb 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -267,9 +267,9 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - #TODO if compute_logprobs is True then add log_probs + # TODO if compute_logprobs is True then add log_probs ## Convert log_probs to a list to make it json serializable - "choices": [{"text": str(sentences), "log_probs":result_dict["log_probs"].tolist()}] + "choices": [{"text": str(sentences), "log_probs": result_dict["log_probs"].tolist()}], } return openai_response else: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index fdd68ecb1000..119220da8ace 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -114,7 +114,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - compute_logprob=True + compute_logprob=True, ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 5b298696e07d..21f3cc599ad6 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -862,13 +862,16 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), - Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True) + Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single)) + outputs = ( + Tensor(name="outputs", shape=(-1,), dtype=bytes), + Tensor(name="log_probs", shape=(-1,), dtype=np.single), + ) return outputs @batch From 25fced220873bed911d0f01d1b03767fce3b87a1 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 1 Nov 2024 21:20:04 -0700 Subject: [PATCH 08/21] Add unset slurm vars and use env vars for Triton args Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 26 +++++++++++++++++-- nemo/deploy/service/rest_model_api.py | 10 +++---- nemo/lightning/pytorch/callbacks/debugging.py | 2 +- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 6ddc4c9db403..3f23ceee22e6 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -326,6 +326,24 @@ def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, with 
open("nemo/deploy/service/config.json", "w") as f: json.dump(args_dict, f) +def unset_environment_variables(): + import subprocess + print("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars_with_prefix(prefix) + + print("Variables unset successfully") @run.cli.entrypoint(namespace="llm") def deploy( @@ -351,13 +369,17 @@ def deploy( ckpt_type: str = "nemo", ): from nemo.deploy import DeployPyTriton - + unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") return # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py - store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + #store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address + os.environ['TRITON_PORT'] = str(triton_port) + os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout) + os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) # TODO: directly support deploy of trtllm engine wo exporting to TRTLLM if ckpt_type == "trtllm": diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 119220da8ace..0c11a04a876c 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -29,12 +29,10 @@ class TritonSettings(BaseSettings): def __init__(self): super(TritonSettings, self).__init__() try: - with open(os.path.join(Path.cwd(), 'nemo/deploy/service/config.json')) as config: - config_json = json.load(config) - self._triton_service_port = config_json["triton_service_port"] - self._triton_service_ip = config_json["triton_service_ip"] - self._triton_request_timeout = config_json["triton_request_timeout"] - self._openai_format_response = config_json["openai_format_response"] + self._triton_service_port = int(os.environ.get('TRITON_PORT', 8080)) + self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') + self._triton_request_timeout = int(os.environ.get('TRITON_REQUEST_TIMEOUT', 60)) + self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' except Exception as error: print("An exception occurred:", error) return diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index 5f6e722ef89b..b4f80ac89608 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -116,7 +116,7 @@ def _apply_user_funcs(self, trainer: pl.Trainer, pl_module: pl.LightningModule) Iterate over model parameters, find gradient tensor, apply and collect outputs of param_fn and grad_fn, and log outputs in a table. 
""" - + from prettytable import PrettyTable def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: """If using MCore optimizer, search the grad buckets for param's grad tensor.""" if not isinstance(pl_module.optim, MegatronOptimizerModule): From 41eb551b3a116bf4a7f8b44edb2fd24a14e92fec Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 1 Nov 2024 16:30:10 -0700 Subject: [PATCH 09/21] Add logic to get logProbs from logits Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 72 ++++++++++++----------- nemo/deploy/nlp/query_llm.py | 5 +- nemo/deploy/service/rest_model_api.py | 5 +- nemo/export/tensorrt_llm.py | 15 +++-- nemo/export/trt_llm/tensorrt_llm_build.py | 2 +- nemo/export/trt_llm/tensorrt_llm_run.py | 5 +- 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 3f23ceee22e6..3a3f1e61dae9 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -453,6 +453,7 @@ def deploy( def evaluate( + nemo_checkpoint_path: Path, url: str = "http://0.0.0.0:1234/v1", model_name: str = "xxxx", eval_task: str = "gsm8k", @@ -521,10 +522,10 @@ class CustomModel(LM): """ Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ - - def __init__(self, model_name, api_url, max_tokens_to_generate, temperature, top_p, top_k): + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k): self.model_name = model_name self.api_url = api_url + self.tokenizer = tokenizer self.max_tokens_to_generate = max_tokens_to_generate self.temperature = temperature self.top_p = top_p @@ -543,33 +544,30 @@ def _generate_tokens_logprobs(self, payload, return_text: bool = False, return_l return response_data['choices'][0]['text'] if return_logprobs: - return response_data['choices'][0]['log_probs'] + # generation_logits is needed only for loglikelihood tasks + return response_data['choices'][0]['log_probs'], response_data['choices'][0]['generation_logits'] + def loglikelihood(self, requests: list[Instance]): - # log likelihood calculation logic here + import numpy as np + import torch + import torch.nn.functional as F + + special_tokens_kwargs = {'add_special_tokens': False} ## Hardcode for now. TODO Infer add_bos from input. 
results = [] for request in requests: context = request.arguments[0] continuation = request.arguments[1] - full_text = context + continuation - instance = Instance( - request_type="loglikelihood", - # doc={'text': full_text}, - doc=request.doc, - arguments=(full_text,), - idx=0, - ) - # Access the 'arguments' attribute of the Instance - prompt = instance.arguments[0] # This should be the prompt string + context_enc = self.tokenizer.tokenizer.encode(context) #, **special_tokens_kwargs) #errors for SentencePeicetokenizer + continuation_enc = self.tokenizer.tokenizer.encode(continuation) #, **special_tokens_kwargs) + continuation_enc = continuation_enc[1:] #for SentencePeice since first encoded token is space, comment this for HF tokenizer + num_cont_tokens = len(continuation_enc) + ## Update self.max_tokens_to_generate with number of continuation tokens in the request + self.max_tokens_to_generate = num_cont_tokens - # Extract default temperature from instance of the benchmark or use the user defined value - # Does not work for MMLU since the input instance does not contain temp key - # temperature = ( - # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature - # ) payload = { "model": self.model_name, - "prompt": prompt, + "prompt": context, "max_tokens": self.max_tokens_to_generate, "temperature": self.temperature, "top_p": self.top_p, @@ -577,17 +575,24 @@ def loglikelihood(self, requests: list[Instance]): # "compute_logprob": True ##TODO Do we want to have this as an # user defined value or set it to True by default ? } - - log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) - - # Assuming log_probs is a list of log probabilities for each token - # TODO : why is log_prbs returned as list of list ? Change it to just a list maybe in query_llm ? 
- continuation_log_prob = sum(log_probs[0][0][-len(continuation) :]) - results.append((continuation_log_prob, False)) + log_probs, generation_logits = self._generate_tokens_logprobs(payload, return_logprobs=True) + # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation + multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) + greedy_tokens = multi_logits.argmax(dim=-1) + max_equal = (greedy_tokens == cont_toks).all() + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) + result = (float(logits.sum()), bool(max_equal)) + + results.append(result) return results def loglikelihood_rolling(self, requests: list[Instance]): + ## Note: loglikelihood_rolling does not have correct implementation yet, + # the tasks we have working so far: gsm8k, mmlu, lambada dont need loglikelihood_rolling # log likelihood rolling calculation logic here results = [] for request in requests: @@ -635,11 +640,6 @@ def generate_until(self, inputs: list[Instance]): # Access the 'arguments' attribute of the Instance prompt = instance.arguments[0] # This should be the prompt string - # Extract default temperature from instance of the benchmark or use the user defined value - # Does not work for MMLU since the input instance does not contain temp key - # temperature = ( - # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature - # ) payload = { "model": self.model_name, "prompt": prompt, @@ -657,13 +657,17 @@ def generate_until(self, inputs: list[Instance]): return results + ## Get tokenizer from nemo 2.0 model, in case of 1.0 please add appropriate code to get + ## tokenizer from 1.0 ckpt and pass it to CustomModel + model = io.load_context(nemo_checkpoint_path, subpath="model") + wait_for_rest_service(rest_url=f"{url}/health") - model = CustomModel(model_name, url, max_tokens_to_generate, temperature, top_p, top_k) + model = CustomModel(model_name, url, model.tokenizer, max_tokens_to_generate, temperature, top_p, top_k) results = evaluator.simple_evaluate( model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) - print("--results---", results['results'][eval_task]) + print("score", results['results'][eval_task]) @run.cli.entrypoint(name="import", namespace="llm") diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index a96c0e3a25eb..6aa9ed79813a 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -269,7 +269,10 @@ def query_llm( "model": self.model_name, # TODO if compute_logprobs is True then add log_probs ## Convert log_probs to a list to make it json serializable - "choices": [{"text": str(sentences), "log_probs": result_dict["log_probs"].tolist()}], + "choices": [{"text": str(sentences), + "log_probs":result_dict["log_probs"].tolist(), + "generation_logits": result_dict["generation_logits"].tolist() + }] } return openai_response else: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 0c11a04a876c..aaf9f3b6c0a0 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -107,12 +107,15 @@ def completions_v1(request: CompletionRequest): output = nq.query_llm( prompts=[request.prompt], max_output_len=request.max_tokens, + # when these below params are passed as None top_k=request.top_k, top_p=request.top_p, temperature=request.temperature, 
init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - compute_logprob=True, + # TODO make these two user configurable ?? + all_probs=True, + compute_logprob=True ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 21f3cc599ad6..76ea171bdc29 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -863,19 +863,19 @@ def get_triton_input(self): Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), + Tensor(name="all_probs", shape=(-1,), dtype=np.bool_, optional=True) ) return inputs @property def get_triton_output(self): - outputs = ( - Tensor(name="outputs", shape=(-1,), dtype=bytes), - Tensor(name="log_probs", shape=(-1,), dtype=np.single), - ) + outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single), + Tensor(name="generation_logits", shape=(-1,), dtype=np.single)) return outputs @batch def triton_infer_fn(self, **inputs: np.ndarray): + log_probs = None try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} if "max_output_len" in inputs: @@ -904,10 +904,13 @@ def triton_infer_fn(self, **inputs: np.ndarray): infer_input["lora_uids"] = lora_uids[0].tolist() if "compute_logprob" in inputs: infer_input["output_log_probs"] = inputs.pop("compute_logprob")[0][0] + if "all_probs" in inputs: + infer_input["all_probs"] = inputs.pop("all_probs")[0][0] if infer_input["output_log_probs"]: - output_texts, log_probs = self.forward(**infer_input) + output_texts, log_probs, generation_logits = self.forward(**infer_input) log_probs = np.array(log_probs.cpu().numpy()) + generation_logits = np.array(generation_logits.cpu().numpy()) else: output_texts = self.forward(**infer_input) output = cast_output(output_texts, np.bytes_) @@ -915,7 +918,7 @@ def triton_infer_fn(self, **inputs: np.ndarray): err_msg = "An error occurred: {0}".format(str(error)) output = cast_output([err_msg], np.bytes_) - return {"outputs": output, "log_probs": log_probs} + return {"outputs": output, "log_probs": log_probs, "generation_logits": generation_logits} @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 4be2d42ebe4d..424e4c3f27d9 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -97,7 +97,7 @@ def build_and_save_engine( 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, 'gather_context_logits': False, - 'gather_generation_logits': False, + 'gather_generation_logits': True, 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 110b7c0f1558..7cbb038c5b0f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -280,6 +280,7 @@ def _forward( output_sequence_lengths=True, return_dict=True, output_log_probs=sampling_kwargs.get('output_log_probs', False), + all_probs=sampling_kwargs.get('all_probs', False), ) torch.cuda.synchronize() @@ -693,6 +694,7 @@ def generate( multiprocessed_env=multiprocessed_env, **sampling_kwargs, ) + assert outputs is not None if 
tensorrt_llm.mpi_rank() != 0: return None @@ -701,6 +703,7 @@ def generate( sequence_lengths = outputs['sequence_lengths'] input_lengths = [t.shape[0] for t in input_tensors] log_probs = outputs['log_probs'] + generation_logits = outputs['generation_logits'] output_lines_list = [ tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]]) @@ -708,7 +711,7 @@ def generate( ] if output_log_probs: - return output_lines_list, log_probs + return output_lines_list, log_probs, generation_logits return output_lines_list From d4ca0e10f051372ab99db4097f1c8169979f9902 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Thu, 7 Nov 2024 20:00:06 -0800 Subject: [PATCH 10/21] Refactor, clean and organize the code 1) Refactors the code and creates an evaluation folder where all util methods live 2) Add doctsrings, comments 3) Expose gather_context_logits, gather_generation_logits in trtllm and add output_generation_logits flag to return generation logits and remove output_logporbs as its not getting used anymore Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 420 +++++------------- nemo/collections/llm/evaluation/__init__.py | 3 + nemo/collections/llm/evaluation/eval_utils.py | 267 +++++++++++ nemo/deploy/nlp/query_llm.py | 15 +- nemo/deploy/service/rest_model_api.py | 17 +- nemo/export/tensorrt_llm.py | 37 +- nemo/export/trt_llm/tensorrt_llm_build.py | 6 +- nemo/export/trt_llm/tensorrt_llm_run.py | 7 +- 8 files changed, 418 insertions(+), 354 deletions(-) create mode 100644 nemo/collections/llm/evaluation/__init__.py create mode 100644 nemo/collections/llm/evaluation/eval_utils.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 3a3f1e61dae9..4ffc93b8d3b9 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -256,100 +256,11 @@ def validate( return app_state.exp_dir -def get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, -): - from nemo.export.tensorrt_llm import TensorRTLLM - - if triton_model_repository is None: - trt_llm_path = "/tmp/trt_llm_model_dir/" - Path(trt_llm_path).mkdir(parents=True, exist_ok=True) - else: - trt_llm_path = triton_model_repository - - if nemo_checkpoint is None and triton_model_repository is None: - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." - ) - - if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." 
- ) - - if nemo_checkpoint is not None and model_type is None: - raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - - trt_llm_exporter = TensorRTLLM( - model_dir=trt_llm_path, - load_model=(nemo_checkpoint is None), - ) - - if nemo_checkpoint is not None: - try: - logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") - trt_llm_exporter.export( - nemo_checkpoint_path=nemo_checkpoint, - model_type=model_type, - n_gpus=num_gpus, - tensor_parallelism_size=tensor_parallelism_size, - pipeline_parallelism_size=pipeline_parallelism_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - dtype=dtype, - ) - except Exception as error: - raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) - - return trt_llm_exporter - - -def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response): - args_dict = { - "triton_service_ip": triton_http_address, - "triton_service_port": triton_port, - "triton_request_timeout": triton_request_timeout, - "openai_format_response": openai_format_response, - } - with open("nemo/deploy/service/config.json", "w") as f: - json.dump(args_dict, f) - -def unset_environment_variables(): - import subprocess - print("Unsetting all SLURM_, PMI_, PMIX_ Variables") - - # Function to unset variables with a specific prefix - def unset_vars_with_prefix(prefix): - cmd = f"env | grep ^{prefix} | cut -d= -f1" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - vars_to_unset = result.stdout.strip().split('\n') - for var in vars_to_unset: - if var: # Check if the variable name is not empty - os.environ.pop(var, None) - - # Unset variables for each prefix - for prefix in ['SLURM_', 'PMI_', 'PMIX_']: - unset_vars_with_prefix(prefix) - - print("Variables unset successfully") - @run.cli.entrypoint(namespace="llm") def deploy( nemo_checkpoint: Path = None, model_type: str = "llama", - triton_model_name: str = "xxx", + triton_model_name: str = 'triton_model', triton_model_version: Optional[int] = 1, triton_port: int = 8000, triton_http_address: str = "0.0.0.0", @@ -362,49 +273,72 @@ def deploy( max_input_len: int = 256, max_output_len: int = 256, max_batch_size: int = 8, - start_rest_service: bool = False, + start_rest_service: bool = True, rest_service_http_address: str = "0.0.0.0", - rest_service_port: int = 8000, - openai_format_response: bool = False, - ckpt_type: str = "nemo", + rest_service_port: int = 8080, + openai_format_response: bool = True, + output_generation_logits: bool = True ): + """ + Deploys nemo model on a PyTriton server by converting the nemo ckpt to trtllm. + Also starts rest service that is used to send OpenAI API compatible input request + to the PyTiton server. + + Args: + nemo_checkpoint (Path): Path for nemo checkpoint. + model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. + triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model name + is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. + triton_model_version (Optional[int]): Version for the triton model. Default: 1. + triton_port (int): Port for the PyTriton server. Default: 8000. + triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". 
+ triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60, + triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engin gets saved in this path specified. Default: None. + num_gpus (int): Number of GPUs for export to trtllm and deploy. Default: 1. + tensor_parallelism_size (int): Tensor parallelism size. Default: 1. + pipeline_parallelism_size (int): Pipeline parallelism size. Default: 1. + dtype (str): dtype of the TensorRT-LLM model. Default: "bfloat16". + max_input_len (int): Max input length of the model. Default: 256. + max_output_len (int): Max output length of the model. Default: 256. + max_batch_size (int): Max batch size of the model. Default: 8. + start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. Needs to be True + to be able to run evaluation . Default: True. + rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". + rest_service_port (int): Port for the rest service. Ensure the rest service port is the port fowarded between host machine and docker + when running locally inside a docker container. Default: 8080. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be True while running evaluation. + Default: True. + output_generation_logits (bool): If true builds trtllm engine with gather_generation_logits set to True. generation_logits are used to compute the + logProb of the output token. Default: True. + """ from nemo.deploy import DeployPyTriton - unset_environment_variables() + from nemo.collections.llm import evaluation + + evaluation.unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") return - # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py - #store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response) + # Store triton ip, port and other args relevant for REST API as env vars to be accessible by rest_model_api.py os.environ['TRITON_HTTP_ADDRESS'] = triton_http_address os.environ['TRITON_PORT'] = str(triton_port) os.environ['TRITON_REQUEST_TIMEOUT'] = str(triton_request_timeout) os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) - - # TODO: directly support deploy of trtllm engine wo exporting to TRTLLM - if ckpt_type == "trtllm": - triton_deployable = get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, - ) - elif ckpt_type == "nemo": - if nemo_checkpoint is None: - raise ValueError("In-Framework deployment requires a .nemo checkpoint") - try: - from nemo.deploy.nlp import MegatronLLMDeployable - except Exception as e: - raise ValueError( - "MegatronLLMDeployable is not supported in this environment as it was not imported.{type(e).__name__}: {e}" - ) - triton_deployable = MegatronLLMDeployable(nemo_checkpoint, num_gpus) + os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits) + + triton_deployable = evaluation.get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits + ) try: nm = 
DeployPyTriton( @@ -455,214 +389,60 @@ def deploy( def evaluate( nemo_checkpoint_path: Path, url: str = "http://0.0.0.0:1234/v1", - model_name: str = "xxxx", + model_name: str = "triton_model", eval_task: str = "gsm8k", num_fewshot: Optional[int] = None, limit: Optional[Union[int, float]] = None, bootstrap_iters: int = 100000, # inference params max_tokens_to_generate: Optional[int] = 256, - temperature: Optional[float] = None, + temperature: Optional[float] = 0.000000001, top_p: Optional[float] = 0.0, top_k: Optional[int] = 1, + add_bos: Optional[bool] = False, ): + """ + Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is + required to tokenize the evaluation input and output prompts. + url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. + Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able + to launch evaluation. + eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, + but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. + num_fewshot (int): number of examples in few-shot context. Default: None. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. + If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. + # inference params + max_tokens_to_generate (int): max tokens to generate. Default: 256. + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. + Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider + the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token + for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for + CausalLM its set to False. If needed set add_bos to True. 
- import time - - import requests - from lm_eval import evaluator - - ## This may change, how to deal with it ? In the past Instance class was in lm_eval.base - from lm_eval.api.instance import Instance - from lm_eval.api.model import LM - from requests.exceptions import RequestException - - def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): - """ - Wait for REST service to be ready. - - Args: - rest_url (str): URL of the REST service's health endpoint - max_retries (int): Maximum number of retry attempts - retry_interval (int): Time to wait between retries in seconds - - Returns: - bool: True if rest service is ready, False otherwise - """ - for _ in range(max_retries): - rest_ready = check_service(rest_url) - - if rest_ready: - print("REST service is ready.") - return True - - print(f"REST Service not ready yet. Retrying in {retry_interval} seconds...") - time.sleep(retry_interval) - - print("Timeout: One or both services did not become ready.") - return False - - def check_service(url): - """ - Check if a service is ready by making a GET request to its health endpoint. - - Args: - url (str): URL of the service's health endpoint - - Returns: - bool: True if the service is ready, False otherwise - """ - try: - response = requests.get(url, timeout=5) - return response.status_code == 200 - except RequestException: - return False - - class CustomModel(LM): - """ - Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md - """ - def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k): - self.model_name = model_name - self.api_url = api_url - self.tokenizer = tokenizer - self.max_tokens_to_generate = max_tokens_to_generate - self.temperature = temperature - self.top_p = top_p - self.top_k = top_k - super().__init__() - - def _generate_tokens_logprobs(self, payload, return_text: bool = False, return_logprobs: bool = False): - response = requests.post(f"{self.api_url}/completions/", json=payload) - response_data = response.json() - - if 'error' in response_data: - raise Exception(f"API Error: {response_data['error']}") - - # Assuming the response is in OpenAI format - if return_text: - return response_data['choices'][0]['text'] - - if return_logprobs: - # generation_logits is needed only for loglikelihood tasks - return response_data['choices'][0]['log_probs'], response_data['choices'][0]['generation_logits'] - - - def loglikelihood(self, requests: list[Instance]): - import numpy as np - import torch - import torch.nn.functional as F - - special_tokens_kwargs = {'add_special_tokens': False} ## Hardcode for now. TODO Infer add_bos from input. 
- results = [] - for request in requests: - context = request.arguments[0] - continuation = request.arguments[1] - context_enc = self.tokenizer.tokenizer.encode(context) #, **special_tokens_kwargs) #errors for SentencePeicetokenizer - continuation_enc = self.tokenizer.tokenizer.encode(continuation) #, **special_tokens_kwargs) - continuation_enc = continuation_enc[1:] #for SentencePeice since first encoded token is space, comment this for HF tokenizer - num_cont_tokens = len(continuation_enc) - ## Update self.max_tokens_to_generate with number of continuation tokens in the request - self.max_tokens_to_generate = num_cont_tokens - - payload = { - "model": self.model_name, - "prompt": context, - "max_tokens": self.max_tokens_to_generate, - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - # "compute_logprob": True ##TODO Do we want to have this as an - # user defined value or set it to True by default ? - } - log_probs, generation_logits = self._generate_tokens_logprobs(payload, return_logprobs=True) - # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation - multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) - cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) - greedy_tokens = multi_logits.argmax(dim=-1) - max_equal = (greedy_tokens == cont_toks).all() - logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( - -1 - ) - result = (float(logits.sum()), bool(max_equal)) - - results.append(result) - - return results - - def loglikelihood_rolling(self, requests: list[Instance]): - ## Note: loglikelihood_rolling does not have correct implementation yet, - # the tasks we have working so far: gsm8k, mmlu, lambada dont need loglikelihood_rolling - # log likelihood rolling calculation logic here - results = [] - for request in requests: - context = request.arguments[0] - continuation = request.arguments[1] - full_text = context + continuation - instance = Instance( - request_type="loglikelihood_rolling", - # doc={'text': full_text}, - doc=request.doc, - arguments=(full_text,), - idx=0, - ) - # Access the 'arguments' attribute of the Instance - prompt = instance.arguments[0] # This should be the prompt string - - # Extract default temperature from instance of the benchmark or use the user defined value - # Does not work for MMLU since the input instance does not contain temp key - # temperature = ( - # instance.arguments[1].get('temperature', 1.0) if not self.temperature else self.temperature - # ) - payload = { - "model": self.model_name, - "prompt": prompt, - "max_tokens": self.max_tokens_to_generate, - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - # "compute_logprob": True ##TODO Do we want to have this as an - # user defined value or set it to True by default ? 
- } - - log_probs = self._generate_tokens_logprobs(payload, return_logprobs=True) - - # Assuming log_probs is a list of log probabilities for each token - continuation_log_probs = log_probs[0][0][-len(continuation) :] - results.append((continuation_log_probs, False)) - - return results - - def generate_until(self, inputs: list[Instance]): - # `Instance` is a dataclass defined in [`lm_eval.api.instance`] https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/lm_eval/api/instance.py - results = [] - for instance in inputs: - # Access the 'arguments' attribute of the Instance - prompt = instance.arguments[0] # This should be the prompt string - - payload = { - "model": self.model_name, - "prompt": prompt, - "max_tokens": self.max_tokens_to_generate, - "temperature": self.temperature, - "top_p": self.top_p, - "top_k": self.top_k, - # "compute_logprob": True ##TODO Do we want to have this as an - # user defined value or set it to True by default ? - } - - generated_text = self._generate_tokens_logprobs(payload, return_text=True) - - results.append(generated_text) - - return results - - ## Get tokenizer from nemo 2.0 model, in case of 1.0 please add appropriate code to get - ## tokenizer from 1.0 ckpt and pass it to CustomModel - model = io.load_context(nemo_checkpoint_path, subpath="model") - - wait_for_rest_service(rest_url=f"{url}/health") - model = CustomModel(model_name, url, model.tokenizer, max_tokens_to_generate, temperature, top_p, top_k) + """ + try: + # lm-evaluation-harness import + from lm_eval import evaluator + except ImportError: + raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required to run evaluations") + + from nemo.collections.llm import evaluation + + # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt. 
+ tokenizer = io.load_context(nemo_checkpoint_path + '/context', subpath="model").tokenizer + # Wait for rest service to be ready before starting evaluation + evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health") + # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate + model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos) results = evaluator.simple_evaluate( model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py new file mode 100644 index 000000000000..bfe66b3e0ee4 --- /dev/null +++ b/nemo/collections/llm/evaluation/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.evaluation.eval_utils import NeMoFWLMEval, unset_environment_variables, get_trtllm_deployable, wait_for_rest_service + +__all__ = ["NeMoFWLMEval", "unset_environment_variables", "get_trtllm_deployable", "wait_for_rest_service"] \ No newline at end of file diff --git a/nemo/collections/llm/evaluation/eval_utils.py b/nemo/collections/llm/evaluation/eval_utils.py new file mode 100644 index 000000000000..0287d1e3378f --- /dev/null +++ b/nemo/collections/llm/evaluation/eval_utils.py @@ -0,0 +1,267 @@ +import time +import requests +from requests.exceptions import RequestException +import subprocess +import os +from pathlib import Path + +import torch +import torch.nn.functional as F + +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer +from nemo.utils import logging + +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM + +class NeMoFWLMEval(LM): + """ + NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with our model deployed on PyTriton server. + Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md + """ + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): + self.model_name = model_name + self.api_url = api_url + self.tokenizer = tokenizer + self.max_tokens_to_generate = max_tokens_to_generate + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k + self.add_bos = add_bos + super().__init__() + + def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): + """ + A private method that sends post request to the model on PyTriton server and returns either generated text or logits. 
+ """ + # send a post request to /v1/completions/ endpoint with the payload + response = requests.post(f"{self.api_url}/v1/completions/", json=payload) + response_data = response.json() + + if 'error' in response_data: + raise Exception(f"API Error: {response_data['error']}") + + # Assuming the response is in OpenAI format + if return_text: + # in case of generate_until tasks return just the text + return response_data['choices'][0]['text'] + + if return_logits: + # in case of loglikelihood tasks return the logits + return response_data['choices'][0]['generation_logits'] + + def tokenizer_type(self, tokenizer): + if isinstance(tokenizer, AutoTokenizer): + return "AutoTokenizer" + elif isinstance(tokenizer, SentencePieceTokenizer): + return "SentencePieceTokenizer" + else: + return "Unknown tokenizer type" + + def loglikelihood(self, requests: list[Instance]): + """ + Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. + Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + """ + if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + special_tokens_kwargs = {'add_bos': self.add_bos} + elif self.tokenizer_type(self.tokenizer) == "AutoTokenizer": + special_tokens_kwargs = {'add_special_tokens': self.add_bos} ## Hardcode for now. TODO Infer add_bos from input. + + results = [] + for request in requests: + # get the input prompt from the request + context = request.arguments[0] + # get the output prompt from the request + continuation = request.arguments[1] + # get encoded tokens of continuation + continuation_enc = self.tokenizer.tokenizer.encode(continuation, **special_tokens_kwargs) + # for SentencePeice consider the encoded tokens from the 2nd token since first encoded token is space. 
+ if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": continuation_enc = continuation_enc[1:] + num_cont_tokens = len(continuation_enc) + # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request + self.max_tokens_to_generate = num_cont_tokens + # Create payload to query the model deployed on PyTriton server + payload = { + "model": self.model_name, + "prompt": context, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + } + # Get the logits from the model + generation_logits = self._generate_tokens_logits(payload, return_logits=True) + # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation of log_softmax + multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + # Convert encoded continuation tokens to torch tensor + cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) + # Get the greedy token from the logits (i.e token with the highest prob) + greedy_tokens = multi_logits.argmax(dim=-1) + # Check if all greedy_tokens match the the actual continuation tokens + is_greedy = (greedy_tokens == cont_toks).all() + # Get the logits corresponding to the actual continuation tokens + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( + -1 + ) + # result is tuple of logProb of generating the continuation token and is_greedy + result = (float(logits.sum()), bool(is_greedy)) + + results.append(result) + + return results + + def loglikelihood_rolling(self, requests: list[Instance]): + pass + + def generate_until(self, inputs: list[Instance]): + """ + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. + Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + """ + results = [] + for instance in inputs: + # Access the 'arguments' attribute of the Instance which contains the input prompt string + prompt = instance.arguments[0] + # Create payload to query the model deployed on PyTriton server + payload = { + "model": self.model_name, + "prompt": prompt, + "max_tokens": self.max_tokens_to_generate, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + } + # Get the text generated by the model + generated_text = self._generate_tokens_logits(payload, return_text=True) + + results.append(generated_text) + + return results + +def unset_environment_variables(): + """ + SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work + on clusters. This method takes care of unsetting these env variables + # TODO maybe move this to NeMo-Run script ? 
+ """ + logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars_with_prefix(prefix) + + logging.info("Variables unset successfully") + +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits +): + from nemo.export.tensorrt_llm import TensorRTLLM + + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." + ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + gather_generation_logits=output_generation_logits + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) + + return trt_llm_exporter + +def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): + """ + Wait for REST service to be ready. + + Args: + rest_url (str): URL of the REST service's health endpoint + max_retries (int): Maximum number of retry attempts. Defaul: 60. + retry_interval (int): Time to wait between retries in seconds. Default: 2. + + Returns: + bool: True if rest service is ready, False otherwise + """ + + def check_service(url): + """ + Check if the service is ready by making a GET request to its health endpoint. + + Args: + url (str): URL of the service's health endpoint + + Returns: + bool: True if the service is ready, False otherwise + """ + try: + response = requests.get(url, timeout=5) + return response.status_code == 200 + except RequestException: + return False + + for _ in range(max_retries): + rest_ready = check_service(rest_url) + + if rest_ready: + logging.info("REST service is ready.") + return True + + logging.info(f"REST Service not ready yet. 
Retrying in {retry_interval} seconds...") + time.sleep(retry_interval) + + logging.info("Timeout: REST service did not become ready.") + return False \ No newline at end of file diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 6aa9ed79813a..62215917733b 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -174,6 +174,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, + output_generation_logits: bool = False ): """ Query the Triton server synchronously and return a list of responses. @@ -190,6 +191,8 @@ def query_llm( no_repeat_ngram_size (int): no repeat ngram size. task_id (str): downstream task id if virtual tokens are used. init_timeout (flat): timeout for the connection. + openai_format_response: return response similar to OpenAI API format + output_generation_logits: return generation logits from model on PyTriton """ prompts = str_list2numpy(prompts) @@ -248,6 +251,9 @@ def query_llm( if end_strings is not None: inputs["end_strings"] = str_list2numpy(end_strings) + if output_generation_logits is not None: + inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_) + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: result_dict = client.infer_batch(**inputs) output_type = client.model_config.outputs[0].dtype @@ -267,13 +273,12 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - # TODO if compute_logprobs is True then add log_probs - ## Convert log_probs to a list to make it json serializable "choices": [{"text": str(sentences), - "log_probs":result_dict["log_probs"].tolist(), - "generation_logits": result_dict["generation_logits"].tolist() - }] + #"generation_logits": result_dict["generation_logits"].tolist() + }] } + # Convert gneration logits to a list to make it json serializable and add it to openai_response dict + if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() return openai_response else: return sentences diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index aaf9f3b6c0a0..6218cd2ed6f4 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -8,8 +8,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - -import json import os from pathlib import Path import requests @@ -33,6 +31,7 @@ def __init__(self): self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') self._triton_request_timeout = int(os.environ.get('TRITON_REQUEST_TIMEOUT', 60)) self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' + self._output_generation_logits = os.environ.get('OUTPUT_GENERATION_LOGITS', 'False').lower() == 'true' except Exception as error: print("An exception occurred:", error) return @@ -52,11 +51,17 @@ def triton_request_timeout(self): @property def openai_format_response(self): """ - Retuns the response from Triton server in OpenAI compatible formar if set to True, - default set in config.json is false. + Retuns the response from Triton server in OpenAI compatible formar if set to True. """ return self._openai_format_response + @property + def output_generation_logits(self): + """ + Retuns the generation logits along with text in Triton server output if set to True. 
+ """ + return self._output_generation_logits + app = FastAPI() triton_settings = TritonSettings() @@ -113,9 +118,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - # TODO make these two user configurable ?? - all_probs=True, - compute_logprob=True + output_generation_logits=triton_settings.output_generation_logits ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 76ea171bdc29..e16275b2208d 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -180,6 +180,8 @@ def export( reduce_fusion: bool = True, fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, + gather_context_logits: Optional[bool] = False, + gather_generation_logits: Optional[bool] = False ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -218,6 +220,8 @@ def export( reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type. fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type. + gather_context_logits (Optional[bool]): if True, enables gather_context_logits while building trtllm engine. Default: False + gather_generation_logits (Optional[bool]): if True, enables gather_generation_logits while building trtllm engine. Default: False """ if n_gpus is not None: warnings.warn( @@ -495,6 +499,8 @@ def get_transformer_config(nemo_model_config): multiple_profiles=multiple_profiles, gpt_attention_plugin=gpt_attention_plugin, gemm_plugin=gemm_plugin, + gather_context_logits=gather_context_logits, + gather_generation_logits=gather_generation_logits ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -688,6 +694,7 @@ def forward( prompt_embeddings_checkpoint_path: str = None, streaming: bool = False, output_log_probs: bool = False, + output_generation_logits: bool = False, **sampling_kwargs, ): """ @@ -706,6 +713,7 @@ def forward( task_ids (List(str)): list of the task ids for the prompt tables. prompt_embeddings_table (List(float)): prompt embeddings table. prompt_embeddings_checkpoint_path (str): path for the nemo checkpoint for the prompt embedding table. + output_generation_logits (bool): if True returns generation_logits in the outout of generate method. sampling_kwargs: Additional kwargs to set in the SamplingConfig. 
""" @@ -784,6 +792,7 @@ def forward( no_repeat_ngram_size=no_repeat_ngram_size, output_log_probs=output_log_probs, multiprocessed_env=multiprocessed_env, + output_generation_logits=output_generation_logits, **sampling_kwargs, ) else: @@ -862,20 +871,19 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), - Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), - Tensor(name="all_probs", shape=(-1,), dtype=np.bool_, optional=True) + Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False), ) return inputs @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="log_probs", shape=(-1,), dtype=np.single), + outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="generation_logits", shape=(-1,), dtype=np.single)) return outputs @batch def triton_infer_fn(self, **inputs: np.ndarray): - log_probs = None + output_dict = {} try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} if "max_output_len" in inputs: @@ -902,23 +910,20 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "lora_uids" in inputs: lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8") infer_input["lora_uids"] = lora_uids[0].tolist() - if "compute_logprob" in inputs: - infer_input["output_log_probs"] = inputs.pop("compute_logprob")[0][0] - if "all_probs" in inputs: - infer_input["all_probs"] = inputs.pop("all_probs")[0][0] - - if infer_input["output_log_probs"]: - output_texts, log_probs, generation_logits = self.forward(**infer_input) - log_probs = np.array(log_probs.cpu().numpy()) - generation_logits = np.array(generation_logits.cpu().numpy()) + if "output_generation_logits" in inputs: + infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0] + + if infer_input["output_generation_logits"]: + output_texts, generation_logits = self.forward(**infer_input) + output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy()) else: output_texts = self.forward(**infer_input) - output = cast_output(output_texts, np.bytes_) + output_dict["outputs"] = cast_output(output_texts, np.bytes_) except Exception as error: err_msg = "An error occurred: {0}".format(str(error)) - output = cast_output([err_msg], np.bytes_) + output_dict["outputs"] = cast_output([err_msg], np.bytes_) - return {"outputs": output, "log_probs": log_probs, "generation_logits": generation_logits} + return output_dict @batch def triton_infer_fn_streaming(self, **inputs: np.ndarray): diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 424e4c3f27d9..88767917301e 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -54,6 +54,8 @@ def build_and_save_engine( gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", reduce_fusion: bool = False, + gather_context_logits: bool = False, + gather_generation_logits: bool = False ): architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: @@ -96,8 +98,8 @@ def build_and_save_engine( 'max_num_tokens': max_num_tokens, 'opt_num_tokens': opt_num_tokens, 'max_prompt_embedding_table_size': max_prompt_embedding_table_size, - 'gather_context_logits': False, - 
'gather_generation_logits': True, + 'gather_context_logits': gather_context_logits, + 'gather_generation_logits': gather_generation_logits, 'strongly_typed': False, 'builder_opt': None, 'use_refit': use_refit, diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 7cbb038c5b0f..84c4be7a616f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -649,6 +649,7 @@ def generate( streaming: bool = False, output_log_probs=False, multiprocessed_env=False, + output_generation_logits=False, **sampling_kwargs, ) -> Optional[List[List[str]]]: """Generate the output sequence from the input sequence. @@ -702,16 +703,14 @@ def generate( output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] input_lengths = [t.shape[0] for t in input_tensors] - log_probs = outputs['log_probs'] - generation_logits = outputs['generation_logits'] output_lines_list = [ tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]]) for b in range(output_ids.shape[0]) ] - if output_log_probs: - return output_lines_list, log_probs, generation_logits + if output_generation_logits: + return output_lines_list, outputs['generation_logits'] return output_lines_list From b6bdf90e2d57301df6d47b0d79421ced2396a8a0 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 8 Nov 2024 16:50:21 -0800 Subject: [PATCH 11/21] Add copyright and initialize special_tokens_kwargs in eval_utils.py Signed-off-by: Abhishree --- nemo/collections/llm/evaluation/eval_utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/evaluation/eval_utils.py b/nemo/collections/llm/evaluation/eval_utils.py index 0287d1e3378f..cb35dec698bc 100644 --- a/nemo/collections/llm/evaluation/eval_utils.py +++ b/nemo/collections/llm/evaluation/eval_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import time import requests from requests.exceptions import RequestException @@ -64,10 +78,11 @@ def loglikelihood(self, requests: list[Instance]): Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. """ + special_tokens_kwargs = {} if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": - special_tokens_kwargs = {'add_bos': self.add_bos} + special_tokens_kwargs['add_bos'] = self.add_bos elif self.tokenizer_type(self.tokenizer) == "AutoTokenizer": - special_tokens_kwargs = {'add_special_tokens': self.add_bos} ## Hardcode for now. TODO Infer add_bos from input. 
+ special_tokens_kwargs['add_special_tokens'] = self.add_bos results = [] for request in requests: From 32a9d9add27b3a4b72643ceed5b144b67e4e6b30 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 08:11:48 -0800 Subject: [PATCH 12/21] Add the following chanes 1) Move get_trtllm_deployable and unset_environment_variables to deploy base.py 2) Rename eval_utils.py to base.py 3) REstore scripts/export/convert_nemo2_for_export.py Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 6 +- nemo/collections/llm/deploy/__init__.py | 3 + nemo/collections/llm/deploy/base.py | 102 +++++++++++++++ nemo/collections/llm/evaluation/__init__.py | 4 +- .../llm/evaluation/{eval_utils.py => base.py} | 86 ------------ nemo/deploy/nlp/query_llm.py | 5 +- nemo/lightning/pytorch/callbacks/debugging.py | 1 - scripts/export/convert_nemo2_for_export.py | 123 ++++++++++++++++++ 8 files changed, 235 insertions(+), 95 deletions(-) create mode 100644 nemo/collections/llm/deploy/__init__.py create mode 100644 nemo/collections/llm/deploy/base.py rename nemo/collections/llm/evaluation/{eval_utils.py => base.py} (72%) create mode 100644 scripts/export/convert_nemo2_for_export.py diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 4ffc93b8d3b9..a870b55c9574 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -312,9 +312,9 @@ def deploy( logProb of the output token. Default: True. """ from nemo.deploy import DeployPyTriton - from nemo.collections.llm import evaluation + from nemo.collections.llm import deploy - evaluation.unset_environment_variables() + deploy.unset_environment_variables() if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -326,7 +326,7 @@ def deploy( os.environ['OPENAI_FORMAT_RESPONSE'] = str(openai_format_response) os.environ['OUTPUT_GENERATION_LOGITS'] = str(output_generation_logits) - triton_deployable = evaluation.get_trtllm_deployable( + triton_deployable = deploy.get_trtllm_deployable( nemo_checkpoint, model_type, triton_model_repository, diff --git a/nemo/collections/llm/deploy/__init__.py b/nemo/collections/llm/deploy/__init__.py new file mode 100644 index 000000000000..312cfb93ca1c --- /dev/null +++ b/nemo/collections/llm/deploy/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.deploy.base import unset_environment_variables, get_trtllm_deployable + +__all__ = ["unset_environment_variables", "get_trtllm_deployable"] diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py new file mode 100644 index 000000000000..2ae87c1f3a46 --- /dev/null +++ b/nemo/collections/llm/deploy/base.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo.utils import logging +import subprocess +import os +from pathlib import Path + +def unset_environment_variables(): + """ + SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work + on clusters. This method takes care of unsetting these env variables + # TODO maybe move this to NeMo-Run script ? + """ + logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") + + # Function to unset variables with a specific prefix + def unset_vars_with_prefix(prefix): + cmd = f"env | grep ^{prefix} | cut -d= -f1" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + vars_to_unset = result.stdout.strip().split('\n') + for var in vars_to_unset: + if var: # Check if the variable name is not empty + os.environ.pop(var, None) + + # Unset variables for each prefix + for prefix in ['SLURM_', 'PMI_', 'PMIX_']: + unset_vars_with_prefix(prefix) + + logging.info("Variables unset successfully") + +def get_trtllm_deployable( + nemo_checkpoint, + model_type, + triton_model_repository, + num_gpus, + tensor_parallelism_size, + pipeline_parallelism_size, + max_input_len, + max_output_len, + max_batch_size, + dtype, + output_generation_logits +): + from nemo.export.tensorrt_llm import TensorRTLLM + + if triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = triton_model_repository + + if nemo_checkpoint is None and triton_model_repository is None: + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." + ) + + if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): + raise ValueError( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." + ) + + if nemo_checkpoint is not None and model_type is None: + raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") + + trt_llm_exporter = TensorRTLLM( + model_dir=trt_llm_path, + load_model=(nemo_checkpoint is None), + ) + + if nemo_checkpoint is not None: + try: + logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=nemo_checkpoint, + model_type=model_type, + n_gpus=num_gpus, + tensor_parallelism_size=tensor_parallelism_size, + pipeline_parallelism_size=pipeline_parallelism_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + dtype=dtype, + gather_generation_logits=output_generation_logits + ) + except Exception as error: + raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) + + return trt_llm_exporter \ No newline at end of file diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py index bfe66b3e0ee4..bca7c6251588 100644 --- a/nemo/collections/llm/evaluation/__init__.py +++ b/nemo/collections/llm/evaluation/__init__.py @@ -1,3 +1,3 @@ -from nemo.collections.llm.evaluation.eval_utils import NeMoFWLMEval, unset_environment_variables, get_trtllm_deployable, wait_for_rest_service +from nemo.collections.llm.evaluation.base import NeMoFWLMEval, wait_for_rest_service -__all__ = ["NeMoFWLMEval", "unset_environment_variables", "get_trtllm_deployable", "wait_for_rest_service"] \ No newline at end of file +__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] \ No newline at end of file diff --git a/nemo/collections/llm/evaluation/eval_utils.py b/nemo/collections/llm/evaluation/base.py similarity index 72% rename from nemo/collections/llm/evaluation/eval_utils.py rename to nemo/collections/llm/evaluation/base.py index cb35dec698bc..145d70b5c6fc 100644 --- a/nemo/collections/llm/evaluation/eval_utils.py +++ b/nemo/collections/llm/evaluation/base.py @@ -15,9 +15,6 @@ import time import requests from requests.exceptions import RequestException -import subprocess -import os -from pathlib import Path import torch import torch.nn.functional as F @@ -155,89 +152,6 @@ def generate_until(self, inputs: list[Instance]): return results -def unset_environment_variables(): - """ - SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work - on clusters. This method takes care of unsetting these env variables - # TODO maybe move this to NeMo-Run script ? - """ - logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") - - # Function to unset variables with a specific prefix - def unset_vars_with_prefix(prefix): - cmd = f"env | grep ^{prefix} | cut -d= -f1" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - vars_to_unset = result.stdout.strip().split('\n') - for var in vars_to_unset: - if var: # Check if the variable name is not empty - os.environ.pop(var, None) - - # Unset variables for each prefix - for prefix in ['SLURM_', 'PMI_', 'PMIX_']: - unset_vars_with_prefix(prefix) - - logging.info("Variables unset successfully") - -def get_trtllm_deployable( - nemo_checkpoint, - model_type, - triton_model_repository, - num_gpus, - tensor_parallelism_size, - pipeline_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - dtype, - output_generation_logits -): - from nemo.export.tensorrt_llm import TensorRTLLM - - if triton_model_repository is None: - trt_llm_path = "/tmp/trt_llm_model_dir/" - Path(trt_llm_path).mkdir(parents=True, exist_ok=True) - else: - trt_llm_path = triton_model_repository - - if nemo_checkpoint is None and triton_model_repository is None: - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine." - ) - - if nemo_checkpoint is None and not os.path.isdir(triton_model_repository): - raise ValueError( - "The provided model repository is not a valid TensorRT-LLM model " - "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine." 
- ) - - if nemo_checkpoint is not None and model_type is None: - raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - - trt_llm_exporter = TensorRTLLM( - model_dir=trt_llm_path, - load_model=(nemo_checkpoint is None), - ) - - if nemo_checkpoint is not None: - try: - logging.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") - trt_llm_exporter.export( - nemo_checkpoint_path=nemo_checkpoint, - model_type=model_type, - n_gpus=num_gpus, - tensor_parallelism_size=tensor_parallelism_size, - pipeline_parallelism_size=pipeline_parallelism_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - dtype=dtype, - gather_generation_logits=output_generation_logits - ) - except Exception as error: - raise RuntimeError("An error has occurred during the model export. Error message: " + str(error)) - - return trt_llm_exporter def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): """ diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 62215917733b..4c55cf3b2c15 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -273,9 +273,8 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - "choices": [{"text": str(sentences), - #"generation_logits": result_dict["generation_logits"].tolist() - }] + "choices": [{"text": str(sentences) + }] } # Convert gneration logits to a list to make it json serializable and add it to openai_response dict if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index b4f80ac89608..b5da06dfbf53 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -116,7 +116,6 @@ def _apply_user_funcs(self, trainer: pl.Trainer, pl_module: pl.LightningModule) Iterate over model parameters, find gradient tensor, apply and collect outputs of param_fn and grad_fn, and log outputs in a table. """ - from prettytable import PrettyTable def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: """If using MCore optimizer, search the grad buckets for param's grad tensor.""" if not isinstance(pl_module.optim, MegatronOptimizerModule): diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py new file mode 100644 index 000000000000..f1eea3cfa6b8 --- /dev/null +++ b/scripts/export/convert_nemo2_for_export.py @@ -0,0 +1,123 @@ +opyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert a NeMo 2.0 checkpoint to NeMo 1.0 for TRTLLM export. 
+Example to run this conversion script: +``` + python /opt/NeMo/scripts/scripts/export/convert_nemo2_for_export.py \ + --input_path /path/to/nemo2/ckpt \ + --output_path /path/to/output \ + --tokenizer_type huggingface \ + --tokenizer_name meta-llama/Meta-Llama-3.1-8B \ + --symbolic_link=True +``` +""" + +import os +import shutil +from argparse import ArgumentParser + +from omegaconf import OmegaConf + +from nemo.lightning import io + + +def get_args(): + parser = ArgumentParser() + parser.add_argument( + "--input_path", + type=str, + required=True, + help="Path to nemo 2.0 checkpoint", + ) + parser.add_argument( + "--output_path", + type=str, + required=True, + help="Output path", + ) + parser.add_argument( + "--tokenizer_type", + type=str, + default="huggingface", + help="Type of tokenizer", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default="meta-llama/Meta-Llama-3.1-8B", + help="Name or path of tokenizer", + ) + parser.add_argument( + "--symbolic_link", + type=bool, + default=True, + help="Whether to use symbiloc link for model weights", + ) + + args = parser.parse_args() + return args + + +def main(args): + input_path = args.input_path + output_path = args.output_path + weight_path = os.path.join(output_path, "model_weights") + + if os.path.exists(output_path): + shutil.rmtree(output_path) + print(f"Remove existing {output_path}") + + os.makedirs(output_path, exist_ok=True) + + config = io.load_context(input_path, subpath="model.config") + + config_dict = {} + for k, v in config.__dict__.items(): + if isinstance(v, (float, int, str, bool)): + config_dict[k] = v + elif k == "activation_func": + config_dict["activation"] = v.__name__ + + if config_dict.get("num_moe_experts") is None: + config_dict["num_moe_experts"] = 0 + config_dict["moe_router_topk"] = 0 + if config_dict["activation"] == "silu": + config_dict["activation"] = "fast-swiglu" + + config_dict["mcore_gpt"] = True + config_dict["max_position_embeddings"] = config_dict.get("seq_length") + config_dict["tokenizer"] = { + "library": args.tokenizer_type, + "type": args.tokenizer_name, + "use_fast": True, + } + + yaml_config = OmegaConf.create(config_dict) + OmegaConf.save(config=yaml_config, f=os.path.join(output_path, "model_config.yaml")) + + if args.symbolic_link: + os.symlink(input_path, weight_path) + else: + os.makedirs(weight_path, exist_ok=True) + for file in os.listdir(input_path): + source_path = os.path.join(input_path, file) + target_path = os.path.join(weight_path, file) + shutil.copy(source_path, target_path) + + +if __name__ == "__main__": + args = get_args() + main(args) From 0bdea4b400fb03762741f26c257d46dd09e601f3 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 08:17:29 -0800 Subject: [PATCH 13/21] Fix a minor typo Signed-off-by: Abhishree --- scripts/export/convert_nemo2_for_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py index f1eea3cfa6b8..0703322cd854 100644 --- a/scripts/export/convert_nemo2_for_export.py +++ b/scripts/export/convert_nemo2_for_export.py @@ -1,4 +1,4 @@ -opyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 05428c210102046a0886513858dc98bda3481874 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 11:07:19 -0800 Subject: [PATCH 14/21] Revert output_log_probs and all_probs arg in tensorrt_llm_run.py Signed-off-by: Abhishree --- nemo/export/trt_llm/tensorrt_llm_run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 84c4be7a616f..ef67c918290f 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -279,8 +279,6 @@ def _forward( streaming=streaming, output_sequence_lengths=True, return_dict=True, - output_log_probs=sampling_kwargs.get('output_log_probs', False), - all_probs=sampling_kwargs.get('all_probs', False), ) torch.cuda.synchronize() From 8428a6884adceef80bc91bb842c1c3277fd90413 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 11:42:55 -0800 Subject: [PATCH 15/21] Fix docstrings formatting Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 50 +++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index a870b55c9574..22515c25a559 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -403,30 +403,32 @@ def evaluate( ): """ Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). - nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is - required to tokenize the evaluation input and output prompts. - url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. - Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. - The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. - model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able - to launch evaluation. - eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". - These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, - but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. - num_fewshot (int): number of examples in few-shot context. Default: None. - limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. - If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. - bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. - # inference params - max_tokens_to_generate (int): max tokens to generate. Default: 256. - temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. 
- Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. - top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider - the single most likely token for the next prediction. Default: 0.0. - top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token - for the next prediction. Default: 1 - add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for - CausalLM its set to False. If needed set add_bos to True. + + Args: + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is + required to tokenize the evaluation input and output prompts. + url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. + Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able + to launch evaluation. + eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, + but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. + num_fewshot (int): number of examples in few-shot context. Default: None. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. + If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. + # inference params + max_tokens_to_generate (int): max tokens to generate. Default: 256. + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. + Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider + the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token + for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for + CausalLM its set to False. If needed set add_bos to True. 
""" try: From 85b988549240957a71e38de02b779223c89c1c42 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 15:43:35 -0800 Subject: [PATCH 16/21] Pylint and other minor fixes Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 73 +++++++++++++------------ nemo/collections/llm/deploy/base.py | 21 +++++-- nemo/collections/llm/evaluation/base.py | 22 +++++--- nemo/deploy/service/rest_model_api.py | 2 +- 4 files changed, 72 insertions(+), 46 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 22515c25a559..db8556afa072 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -288,12 +288,13 @@ def deploy( nemo_checkpoint (Path): Path for nemo checkpoint. model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model name - is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. + is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. triton_model_version (Optional[int]): Version for the triton model. Default: 1. triton_port (int): Port for the PyTriton server. Default: 8000. triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". - triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60, - triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engin gets saved in this path specified. Default: None. + triton_request_timeout (int): Timeout in seconds for Triton server. Default: 60. + triton_model_repository (Path): Folder for the trt-llm conversion, trt-llm engine gets saved in this specified + path. If None, saves it in /tmp dir. Default: None. num_gpus (int): Number of GPUs for export to trtllm and deploy. Default: 1. tensor_parallelism_size (int): Tensor parallelism size. Default: 1. pipeline_parallelism_size (int): Pipeline parallelism size. Default: 1. @@ -301,15 +302,14 @@ def deploy( max_input_len (int): Max input length of the model. Default: 256. max_output_len (int): Max output length of the model. Default: 256. max_batch_size (int): Max batch size of the model. Default: 8. - start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. Needs to be True - to be able to run evaluation . Default: True. + start_rest_service (bool): Start rest service that is used to send evaluation requests to the PyTriton server. + Needs to be True to be able to run evaluation. Default: True. rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". - rest_service_port (int): Port for the rest service. Ensure the rest service port is the port fowarded between host machine and docker - when running locally inside a docker container. Default: 8080. - openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be True while running evaluation. - Default: True. - output_generation_logits (bool): If true builds trtllm engine with gather_generation_logits set to True. generation_logits are used to compute the - logProb of the output token. Default: True. + rest_service_port (int): Port for the rest service. Default: 8080. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be + True while running evaluation. 
Default: True. + output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. + generation_logits are used to compute the logProb of the output token. Default: True. """ from nemo.deploy import DeployPyTriton from nemo.collections.llm import deploy @@ -388,7 +388,7 @@ def deploy( def evaluate( nemo_checkpoint_path: Path, - url: str = "http://0.0.0.0:1234/v1", + url: str = "http://0.0.0.0:8080/v1", model_name: str = "triton_model", eval_task: str = "gsm8k", num_fewshot: Optional[int] = None, @@ -402,34 +402,39 @@ def evaluate( add_bos: Optional[bool] = False, ): """ - Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). + Evaluates nemo model deployed on PyTriton server (via trtllm) using lm-evaluation-harness + (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). Args: - nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is - required to tokenize the evaluation input and output prompts. - url (str): rest serice url and port that were used in the deploy method above in the format: http://{rest_service_http}:{rest_service_port}. - Post requests with evaluation input prompts (from lm-eval-harness) are sent to this url which is then passed to the model deployed in PyTriton server. - The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. - model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name passed to the deploy method above to be able - to launch evaluation. + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which + is required to tokenize the evaluation input and output prompts. + url (str): rest serice url and port that were used in the deploy method above in the format: + http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts (from lm-eval-harness) + are sent to this url which is then passed to the model deployed on PyTriton server. The rest service url and port + serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name + passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". - These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, - but only the above mentioned ones are tested. Tasks of type loglikelihood_rolling are not supported yet. + These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from + lm-evaluation-harness can be run, but only the above mentioned ones are tested. Tasks of type + loglikelihood_rolling are not supported yet. num_fewshot (int): number of examples in few-shot context. Default: None. - limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit is a percentage of the total number of examples. - If int say x, then run evaluation only on x number of samples/samples from the eval dataset. Default: None, which means eval is run the entire dataset. 
- bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. Default: 100000. + limit (Union[int, float]): Limit the number of examples per task. If <1 (i.e float val between 0 and 1), limit + is a percentage of the total number of examples. If int say x, then run evaluation only on x number of samples + from the eval dataset. Default: None, which means eval is run the entire dataset. + bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 + for no stderr calculations to be performed. Default: 100000. # inference params max_tokens_to_generate (int): max tokens to generate. Default: 256. - temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Default: 0.000000001. - Temp can't be set to 0.0, due to a bug with TRTLLM (# TODO to be investigated) hence using a very samll value. - top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider - the single most likely token for the next prediction. Default: 0.0. - top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will only consider the single most likely token - for the next prediction. Default: 1 - add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for - CausalLM its set to False. If needed set add_bos to True. - + temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token + with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM(# TODO to be investigated). + Hence using a very samll value as the default. Default: 0.000000001. + top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. + top_p=0 means the model will only consider the single most likely token for the next prediction. Default: 0.0. + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will + only consider the single most likely token for the next prediction. Default: 1 + add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when + encoding a string. Default: False since typically for CausalLM its set to False. If needed set add_bos to True. """ try: # lm-evaluation-harness import diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py index 2ae87c1f3a46..46ce57152f54 100644 --- a/nemo/collections/llm/deploy/base.py +++ b/nemo/collections/llm/deploy/base.py @@ -17,28 +17,38 @@ import os from pathlib import Path -def unset_environment_variables(): +def unset_environment_variables() -> None: """ SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work on clusters. This method takes care of unsetting these env variables - # TODO maybe move this to NeMo-Run script ? 
""" logging.info("Unsetting all SLURM_, PMI_, PMIX_ Variables") # Function to unset variables with a specific prefix def unset_vars_with_prefix(prefix): + unset_vars = [] cmd = f"env | grep ^{prefix} | cut -d= -f1" result = subprocess.run(cmd, shell=True, capture_output=True, text=True) vars_to_unset = result.stdout.strip().split('\n') for var in vars_to_unset: if var: # Check if the variable name is not empty os.environ.pop(var, None) + unset_vars.append(var) + return unset_vars + + # Collect all unset variables across all prefixes + all_unset_vars = [] # Unset variables for each prefix for prefix in ['SLURM_', 'PMI_', 'PMIX_']: - unset_vars_with_prefix(prefix) + unset_vars = unset_vars_with_prefix(prefix) + all_unset_vars.extend(unset_vars) + + if all_unset_vars: + logging.info(f"Unset env variables: {', '.join(all_unset_vars)}") + else: + logging.info("No env variables were unset.") - logging.info("Variables unset successfully") def get_trtllm_deployable( nemo_checkpoint, @@ -53,6 +63,9 @@ def get_trtllm_deployable( dtype, output_generation_logits ): + """ + Exports the nemo checkpoint to trtllm and returns trt_llm_exporter that is used to deploy on PyTriton. + """ from nemo.export.tensorrt_llm import TensorRTLLM if triton_model_repository is None: diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index 145d70b5c6fc..0fdc41cff06b 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -28,7 +28,8 @@ class NeMoFWLMEval(LM): """ - NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with our model deployed on PyTriton server. + NeMoFWLMEval is a wrapper class subclassing lm_eval.api.model.LM class, that defines how lm_eval interfaces with + NeMo model deployed on PyTriton server. Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): @@ -44,7 +45,8 @@ def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, tempe def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): """ - A private method that sends post request to the model on PyTriton server and returns either generated text or logits. + A private method that sends post request to the model on PyTriton server and returns either generated text or + logits. """ # send a post request to /v1/completions/ endpoint with the payload response = requests.post(f"{self.api_url}/v1/completions/", json=payload) @@ -63,17 +65,22 @@ def _generate_tokens_logits(self, payload, return_text: bool = False, return_log return response_data['choices'][0]['generation_logits'] def tokenizer_type(self, tokenizer): + """ + Returns the type of the tokenizer. + """ if isinstance(tokenizer, AutoTokenizer): return "AutoTokenizer" elif isinstance(tokenizer, SentencePieceTokenizer): return "SentencePieceTokenizer" else: - return "Unknown tokenizer type" + raise ValueError("Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " + "how to handle special tokens for this tokenizer") def loglikelihood(self, requests: list[Instance]): """ - Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. 
- Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + Defines the loglikelihood request. Takes input requests of type list[Instance] where Instance is a dataclass + defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here + loglikelihood) and other relevant args like few shot samples. """ special_tokens_kwargs = {} if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": @@ -129,8 +136,9 @@ def loglikelihood_rolling(self, requests: list[Instance]): def generate_until(self, inputs: list[Instance]): """ - Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass defined in lm_eval.api.instance. - Each Instance conists of the input prompt, output prompt, request type(here loglikelihood) and other relevant args like few shot samples. + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass + defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here + loglikelihood) and other relevant args like few shot samples. """ results = [] for instance in inputs: diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 6218cd2ed6f4..21c68f18580d 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -51,7 +51,7 @@ def triton_request_timeout(self): @property def openai_format_response(self): """ - Retuns the response from Triton server in OpenAI compatible formar if set to True. + Retuns the response from Triton server in OpenAI compatible format if set to True. """ return self._openai_format_response From 1a2245821afc41d934fcf27bf8d35c5d9115ada9 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 13 Nov 2024 16:01:39 -0800 Subject: [PATCH 17/21] Fix pylint and typos Signed-off-by: Abhishree --- nemo/collections/llm/api.py | 37 +++++++++++++------------ nemo/collections/llm/evaluation/base.py | 9 ++++-- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index db8556afa072..26bcb4275ba7 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os from copy import deepcopy from pathlib import Path @@ -287,8 +286,9 @@ def deploy( Args: nemo_checkpoint (Path): Path for nemo checkpoint. model_type (str): Type of the model. Choices: gpt, llama, falcon, starcoder. Default: llama. - triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model name - is passed to the evalute method for the model to be accessible while sending evalution requests. Default: 'triton_model'. + triton_model_name (str): Name for the model that gets deployed on PyTriton. Please ensure that the same model + name is passed to the evalute method for the model to be accessible while sending evalution requests. + Default: 'triton_model'. triton_model_version (Optional[int]): Version for the triton model. Default: 1. triton_port (int): Port for the PyTriton server. Default: 8000. triton_http_address (str): HTTP address for the PyTriton server. Default: "0.0.0.0". @@ -306,8 +306,8 @@ def deploy( Needs to be True to be able to run evaluation. Default: True. 
rest_service_http_address (str): HTTP address for the rest service. Default: "0.0.0.0". rest_service_port (int): Port for the rest service. Default: 8080. - openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be - True while running evaluation. Default: True. + openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to + be True while running evaluation. Default: True. output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. generation_logits are used to compute the logProb of the output token. Default: True. """ @@ -406,14 +406,14 @@ def evaluate( (https://github.com/EleutherAI/lm-evaluation-harness/tree/main). Args: - nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which - is required to tokenize the evaluation input and output prompts. - url (str): rest serice url and port that were used in the deploy method above in the format: - http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts (from lm-eval-harness) - are sent to this url which is then passed to the model deployed on PyTriton server. The rest service url and port - serve as the entry point to evaluate model deployed on PyTriton server. - model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as triton_model_name - passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". + nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt + which is required to tokenize the evaluation input and output prompts. + url (str): rest service url and port that were used in the deploy method above in the format: + http://{rest_service_http}:{rest_service_port}. Post requests with evaluation input prompts + (from lm-eval-harness) are sent to this url which is then passed to the model deployed on PyTriton server. + The rest service url and port serve as the entry point to evaluate model deployed on PyTriton server. + model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as + triton_model_name passed to the deploy method above to be able to launch evaluation. Deafult: "triton_model". eval_task (str): task to be evaluated on. For ex: "gsm8k", "gsm8k_cot", "mmlu", "lambada". Default: "gsm8k". These are the tasks that are supported currently. Any other task of type generate_until or loglikelihood from lm-evaluation-harness can be run, but only the above mentioned ones are tested. Tasks of type @@ -427,12 +427,12 @@ def evaluate( # inference params max_tokens_to_generate (int): max tokens to generate. Default: 256. temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token - with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM(# TODO to be investigated). - Hence using a very samll value as the default. Default: 0.000000001. + with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM + (# TODO to be investigated). Hence using a very samll value as the default. Default: 0.000000001. top_p: Optional[float]: float value between 0 and 1. limits to the top tokens within a certain probability. top_p=0 means the model will only consider the single most likely token for the next prediction. Default: 0.0. 
- top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model will - only consider the single most likely token for the next prediction. Default: 1 + top_k: Optional[int]: limits to a certain number (K) of the top tokens to consider. top_k=1 means the model + will only consider the single most likely token for the next prediction. Default: 1 add_bos: Optional[bool]: whether a special token representing the beginning of a sequence should be added when encoding a string. Default: False since typically for CausalLM its set to False. If needed set add_bos to True. """ @@ -440,7 +440,8 @@ def evaluate( # lm-evaluation-harness import from lm_eval import evaluator except ImportError: - raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required to run evaluations") + raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required " + "to run evaluations") from nemo.collections.llm import evaluation diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index 0fdc41cff06b..f9dc3debb298 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -132,13 +132,16 @@ def loglikelihood(self, requests: list[Instance]): return results def loglikelihood_rolling(self, requests: list[Instance]): + """ + Defines the loglikelihood_rolling request type. Yet to be implemented. + """ pass def generate_until(self, inputs: list[Instance]): """ - Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a dataclass - defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request type(here - loglikelihood) and other relevant args like few shot samples. + Defines the generate_until request type. Takes input requests of type list[Instance] where Instance is a + dataclass defined in lm_eval.api.instance. Each Instance conists of the input prompt, output prompt, request + type(here loglikelihood) and other relevant args like few shot samples. """ results = [] for instance in inputs: From f6654c99e2272c6b1860b2bfa0e3a42b6f4712da Mon Sep 17 00:00:00 2001 From: athitten Date: Thu, 14 Nov 2024 00:03:11 +0000 Subject: [PATCH 18/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/collections/llm/api.py | 15 ++++++---- nemo/collections/llm/deploy/__init__.py | 2 +- nemo/collections/llm/deploy/base.py | 12 ++++---- nemo/collections/llm/evaluation/__init__.py | 2 +- nemo/collections/llm/evaluation/base.py | 30 ++++++++++--------- nemo/deploy/nlp/query_llm.py | 8 ++--- nemo/deploy/service/rest_model_api.py | 2 +- nemo/export/tensorrt_llm.py | 10 ++++--- nemo/export/trt_llm/tensorrt_llm_build.py | 2 +- nemo/lightning/pytorch/callbacks/debugging.py | 1 + 10 files changed, 47 insertions(+), 37 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 26bcb4275ba7..07899b2ee484 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -276,7 +276,7 @@ def deploy( rest_service_http_address: str = "0.0.0.0", rest_service_port: int = 8080, openai_format_response: bool = True, - output_generation_logits: bool = True + output_generation_logits: bool = True, ): """ Deploys nemo model on a PyTriton server by converting the nemo ckpt to trtllm. @@ -311,8 +311,8 @@ def deploy( output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. 
generation_logits are used to compute the logProb of the output token. Default: True. """ - from nemo.deploy import DeployPyTriton from nemo.collections.llm import deploy + from nemo.deploy import DeployPyTriton deploy.unset_environment_variables() if start_rest_service: @@ -337,7 +337,7 @@ def deploy( max_output_len, max_batch_size, dtype, - output_generation_logits + output_generation_logits, ) try: @@ -440,8 +440,9 @@ def evaluate( # lm-evaluation-harness import from lm_eval import evaluator except ImportError: - raise ImportError("Please ensure that lm-evaluation-harness is installed in your env as it is required " - "to run evaluations") + raise ImportError( + "Please ensure that lm-evaluation-harness is installed in your env as it is required " "to run evaluations" + ) from nemo.collections.llm import evaluation @@ -450,7 +451,9 @@ def evaluate( # Wait for rest service to be ready before starting evaluation evaluation.wait_for_rest_service(rest_url=f"{url}/v1/health") # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate - model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos) + model = evaluation.NeMoFWLMEval( + model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos + ) results = evaluator.simple_evaluate( model=model, tasks=eval_task, limit=limit, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters ) diff --git a/nemo/collections/llm/deploy/__init__.py b/nemo/collections/llm/deploy/__init__.py index 312cfb93ca1c..24c102bfa0d2 100644 --- a/nemo/collections/llm/deploy/__init__.py +++ b/nemo/collections/llm/deploy/__init__.py @@ -1,3 +1,3 @@ -from nemo.collections.llm.deploy.base import unset_environment_variables, get_trtllm_deployable +from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables __all__ = ["unset_environment_variables", "get_trtllm_deployable"] diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py index 46ce57152f54..e21198f5884b 100644 --- a/nemo/collections/llm/deploy/base.py +++ b/nemo/collections/llm/deploy/base.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.utils import logging -import subprocess import os +import subprocess from pathlib import Path +from nemo.utils import logging + + def unset_environment_variables() -> None: """ SLURM_, PMI_, PMIX_ Variables are needed to be unset for trtllm export to work @@ -61,7 +63,7 @@ def get_trtllm_deployable( max_output_len, max_batch_size, dtype, - output_generation_logits + output_generation_logits, ): """ Exports the nemo checkpoint to trtllm and returns trt_llm_exporter that is used to deploy on PyTriton. @@ -107,9 +109,9 @@ def get_trtllm_deployable( max_output_len=max_output_len, max_batch_size=max_batch_size, dtype=dtype, - gather_generation_logits=output_generation_logits + gather_generation_logits=output_generation_logits, ) except Exception as error: raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) - return trt_llm_exporter \ No newline at end of file + return trt_llm_exporter diff --git a/nemo/collections/llm/evaluation/__init__.py b/nemo/collections/llm/evaluation/__init__.py index bca7c6251588..3012689bb8da 100644 --- a/nemo/collections/llm/evaluation/__init__.py +++ b/nemo/collections/llm/evaluation/__init__.py @@ -1,3 +1,3 @@ from nemo.collections.llm.evaluation.base import NeMoFWLMEval, wait_for_rest_service -__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] \ No newline at end of file +__all__ = ["NeMoFWLMEval", "wait_for_rest_service"] diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index f9dc3debb298..f43e9328cf65 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -13,18 +13,18 @@ # limitations under the License. import time -import requests -from requests.exceptions import RequestException +import requests import torch import torch.nn.functional as F +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM +from requests.exceptions import RequestException from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer from nemo.utils import logging -from lm_eval.api.instance import Instance -from lm_eval.api.model import LM class NeMoFWLMEval(LM): """ @@ -32,6 +32,7 @@ class NeMoFWLMEval(LM): NeMo model deployed on PyTriton server. Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ + def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): self.model_name = model_name self.api_url = api_url @@ -73,8 +74,10 @@ def tokenizer_type(self, tokenizer): elif isinstance(tokenizer, SentencePieceTokenizer): return "SentencePieceTokenizer" else: - raise ValueError("Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " - "how to handle special tokens for this tokenizer") + raise ValueError( + "Tokenizer type is not one of SentencePieceTokenizer or HF's AutoTokenizer. Please check " + "how to handle special tokens for this tokenizer" + ) def loglikelihood(self, requests: list[Instance]): """ @@ -97,11 +100,12 @@ def loglikelihood(self, requests: list[Instance]): # get encoded tokens of continuation continuation_enc = self.tokenizer.tokenizer.encode(continuation, **special_tokens_kwargs) # for SentencePeice consider the encoded tokens from the 2nd token since first encoded token is space. 
- if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": continuation_enc = continuation_enc[1:] + if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + continuation_enc = continuation_enc[1:] num_cont_tokens = len(continuation_enc) # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request self.max_tokens_to_generate = num_cont_tokens - # Create payload to query the model deployed on PyTriton server + # Create payload to query the model deployed on PyTriton server payload = { "model": self.model_name, "prompt": context, @@ -121,9 +125,7 @@ def loglikelihood(self, requests: list[Instance]): # Check if all greedy_tokens match the the actual continuation tokens is_greedy = (greedy_tokens == cont_toks).all() # Get the logits corresponding to the actual continuation tokens - logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze( - -1 - ) + logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # result is tuple of logProb of generating the continuation token and is_greedy result = (float(logits.sum()), bool(is_greedy)) @@ -147,7 +149,7 @@ def generate_until(self, inputs: list[Instance]): for instance in inputs: # Access the 'arguments' attribute of the Instance which contains the input prompt string prompt = instance.arguments[0] - # Create payload to query the model deployed on PyTriton server + # Create payload to query the model deployed on PyTriton server payload = { "model": self.model_name, "prompt": prompt, @@ -170,7 +172,7 @@ def wait_for_rest_service(rest_url, max_retries=60, retry_interval=2): Args: rest_url (str): URL of the REST service's health endpoint - max_retries (int): Maximum number of retry attempts. Defaul: 60. + max_retries (int): Maximum number of retry attempts. Defaul: 60. retry_interval (int): Time to wait between retries in seconds. Default: 2. Returns: @@ -204,4 +206,4 @@ def check_service(url): time.sleep(retry_interval) logging.info("Timeout: REST service did not become ready.") - return False \ No newline at end of file + return False diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 4c55cf3b2c15..e1d21bb54b76 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -174,7 +174,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, - output_generation_logits: bool = False + output_generation_logits: bool = False, ): """ Query the Triton server synchronously and return a list of responses. 
@@ -273,11 +273,11 @@ def query_llm( "object": "text_completion", "created": int(time.time()), "model": self.model_name, - "choices": [{"text": str(sentences) - }] + "choices": [{"text": str(sentences)}], } # Convert gneration logits to a list to make it json serializable and add it to openai_response dict - if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() + if output_generation_logits: + openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"].tolist() return openai_response else: return sentences diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 21c68f18580d..2a65b19c0d50 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -118,7 +118,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - output_generation_logits=triton_settings.output_generation_logits + output_generation_logits=triton_settings.output_generation_logits, ) if triton_settings.openai_format_response: return output diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index e16275b2208d..a1e6cb0e03c4 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -181,7 +181,7 @@ def export( fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None, gather_context_logits: Optional[bool] = False, - gather_generation_logits: Optional[bool] = False + gather_generation_logits: Optional[bool] = False, ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -500,7 +500,7 @@ def get_transformer_config(nemo_model_config): gpt_attention_plugin=gpt_attention_plugin, gemm_plugin=gemm_plugin, gather_context_logits=gather_context_logits, - gather_generation_logits=gather_generation_logits + gather_generation_logits=gather_generation_logits, ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -877,8 +877,10 @@ def get_triton_input(self): @property def get_triton_output(self): - outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes), - Tensor(name="generation_logits", shape=(-1,), dtype=np.single)) + outputs = ( + Tensor(name="outputs", shape=(-1,), dtype=bytes), + Tensor(name="generation_logits", shape=(-1,), dtype=np.single), + ) return outputs @batch diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 88767917301e..38fb80ca3272 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -55,7 +55,7 @@ def build_and_save_engine( gemm_plugin: str = "auto", reduce_fusion: bool = False, gather_context_logits: bool = False, - gather_generation_logits: bool = False + gather_generation_logits: bool = False, ): architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture try: diff --git a/nemo/lightning/pytorch/callbacks/debugging.py b/nemo/lightning/pytorch/callbacks/debugging.py index b5da06dfbf53..5f6e722ef89b 100644 --- a/nemo/lightning/pytorch/callbacks/debugging.py +++ b/nemo/lightning/pytorch/callbacks/debugging.py @@ -116,6 +116,7 @@ def _apply_user_funcs(self, trainer: pl.Trainer, pl_module: pl.LightningModule) Iterate over model parameters, find gradient tensor, apply and collect outputs of param_fn and grad_fn, and log outputs in a table. 
""" + def find_grad_tensor(param: torch.Tensor) -> Optional[torch.Tensor]: """If using MCore optimizer, search the grad buckets for param's grad tensor.""" if not isinstance(pl_module.optim, MegatronOptimizerModule): From 5f03ceef0c9d0f0aab7c1fbbfb2134b9b389d64b Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Thu, 14 Nov 2024 22:41:53 -0800 Subject: [PATCH 19/21] Avoid multiple calls for tokenizer_type Co-authored-by: Ananth Subramaniam Signed-off-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- nemo/collections/llm/evaluation/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index f43e9328cf65..b1734d6f4d43 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -86,9 +86,10 @@ def loglikelihood(self, requests: list[Instance]): loglikelihood) and other relevant args like few shot samples. """ special_tokens_kwargs = {} - if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": + tokenizer_type = self.tokenizer_type(self.tokenizer) + if tokenizer_type == "SentencePieceTokenizer": special_tokens_kwargs['add_bos'] = self.add_bos - elif self.tokenizer_type(self.tokenizer) == "AutoTokenizer": + elif tokenizer_type == "AutoTokenizer": special_tokens_kwargs['add_special_tokens'] = self.add_bos results = [] From 88842a18dfd02722d4928d365140cdb602f06cfc Mon Sep 17 00:00:00 2001 From: Abhishree Date: Thu, 14 Nov 2024 23:05:21 -0800 Subject: [PATCH 20/21] Replace print statements with logging statements Signed-off-by: Abhishree --- nemo/deploy/service/rest_model_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index 2a65b19c0d50..c2f5b394fb3b 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -17,7 +17,7 @@ from pydantic_settings import BaseSettings from nemo.deploy.nlp import NemoQueryLLM - +from nemo.utils import logging class TritonSettings(BaseSettings): _triton_service_port: int @@ -33,7 +33,7 @@ def __init__(self): self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' self._output_generation_logits = os.environ.get('OUTPUT_GENERATION_LOGITS', 'False').lower() == 'true' except Exception as error: - print("An exception occurred:", error) + logging.error("An exception occurred trying to retrieve set args in TritonSettings class. 
Error:", error) return @property @@ -93,7 +93,7 @@ async def check_triton_health(): triton_url = ( f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" ) - print(f"Attempting to connect to Triton server at: {triton_url}") + logging.info(f"Attempting to connect to Triton server at: {triton_url}") try: response = requests.get(triton_url, timeout=5) if response.status_code == 200: @@ -118,7 +118,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - output_generation_logits=triton_settings.output_generation_logits, + output_generation_logits=triton_settings.output_generation_logits ) if triton_settings.openai_format_response: return output @@ -127,5 +127,5 @@ def completions_v1(request: CompletionRequest): "output": output[0][0], } except Exception as error: - print("An exception occurred:", error) + logging.error("An exception occurred with the post request to /v1/completions/ endpoint:", error) return {"error": "An exception occurred"} From a3adb69009eaaf2145ecf248317a880ff276d50d Mon Sep 17 00:00:00 2001 From: athitten Date: Fri, 15 Nov 2024 07:06:54 +0000 Subject: [PATCH 21/21] Apply isort and black reformatting Signed-off-by: athitten --- nemo/deploy/service/rest_model_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py index c2f5b394fb3b..64afea167295 100644 --- a/nemo/deploy/service/rest_model_api.py +++ b/nemo/deploy/service/rest_model_api.py @@ -19,6 +19,7 @@ from nemo.deploy.nlp import NemoQueryLLM from nemo.utils import logging + class TritonSettings(BaseSettings): _triton_service_port: int _triton_service_ip: str @@ -118,7 +119,7 @@ def completions_v1(request: CompletionRequest): temperature=request.temperature, init_timeout=triton_settings.triton_request_timeout, openai_format_response=triton_settings.openai_format_response, - output_generation_logits=triton_settings.output_generation_logits + output_generation_logits=triton_settings.output_generation_logits, ) if triton_settings.openai_format_response: return output
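Taken together, the patches in this series are intended to enable a deploy-then-evaluate workflow through `nemo/collections/llm/api.py`. The sketch below illustrates that flow; the checkpoint path, the evaluation limit, and running the two calls in separate processes are assumptions made for illustration rather than requirements stated by the patches:

```python
# Illustrative sketch of the deploy-then-evaluate workflow; values are placeholders.
from nemo.collections.llm.api import deploy, evaluate

# Process 1: export the NeMo 2.0 checkpoint to TensorRT-LLM, serve it on PyTriton,
# and start the REST service on port 8080 (this call keeps serving until stopped).
deploy(
    nemo_checkpoint="/path/to/nemo2/ckpt",
    model_type="llama",
    triton_model_name="triton_model",
    rest_service_port=8080,
    openai_format_response=True,
    output_generation_logits=True,
)

# Process 2: run lm-evaluation-harness against the deployed model via the REST endpoint.
evaluate(
    nemo_checkpoint_path="/path/to/nemo2/ckpt",  # used to load the tokenizer
    url="http://0.0.0.0:8080/v1",
    model_name="triton_model",  # must match triton_model_name passed to deploy()
    eval_task="gsm8k",
    limit=100,  # evaluate on 100 samples only
)
```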