From aa65ae991f8e7d38c919f14ab844d12b2f7057c1 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 28 Mar 2023 20:55:30 +0800 Subject: [PATCH 1/4] [coati] fix inference requirements --- applications/Chat/inference/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/applications/Chat/inference/requirements.txt b/applications/Chat/inference/requirements.txt index 67a9874e569a..1cb70968ea70 100644 --- a/applications/Chat/inference/requirements.txt +++ b/applications/Chat/inference/requirements.txt @@ -1,5 +1,5 @@ fastapi -locustio +locust numpy pydantic safetensors @@ -8,3 +8,4 @@ sse_starlette torch uvicorn git+https://github.com/huggingface/transformers +accelerate From d4a46fc5d0e84eb2ef3befb172fcd1d9f0b59180 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 28 Mar 2023 20:56:01 +0800 Subject: [PATCH 2/4] [coati] add output postprocess --- applications/Chat/inference/server.py | 4 ++-- applications/Chat/inference/utils.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/applications/Chat/inference/server.py b/applications/Chat/inference/server.py index 46a8b9a0568a..bfcd89264296 100644 --- a/applications/Chat/inference/server.py +++ b/applications/Chat/inference/server.py @@ -17,7 +17,7 @@ from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.' -MAX_LEN = 2048 +MAX_LEN = 512 running_lock = Lock() @@ -116,7 +116,7 @@ def generate_no_stream(data: GenerationTaskReq, request: Request): prompt_len = inputs['input_ids'].size(1) response = output[0, prompt_len:] out_string = tokenizer.decode(response, skip_special_tokens=True) - return out_string.lstrip() + return prompt_processor.postprocess_output(out_string) if __name__ == '__main__': diff --git a/applications/Chat/inference/utils.py b/applications/Chat/inference/utils.py index 3d04aa57d553..a01983de35d3 100644 --- a/applications/Chat/inference/utils.py +++ b/applications/Chat/inference/utils.py @@ -1,3 +1,4 @@ +import re from threading import Lock from typing import Any, Callable, Generator, List, Optional @@ -118,6 +119,9 @@ def _format_dialogue(instruction: str, response: str = ''): return f'\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}' +STOP_PAT = re.compile(r'(###|instruction:).*', flags=(re.I | re.S)) + + class ChatPromptProcessor: def __init__(self, tokenizer, context: str, max_len: int = 2048): @@ -164,6 +168,10 @@ def preprocess_prompt(self, history: List[Dialogue], max_new_tokens: int) -> str prompt += ''.join(rows) + _format_dialogue(last_dialogue.instruction) return prompt + def postprocess_output(self, output: str) -> str: + output = STOP_PAT.sub('', output) + return output.strip() + class LockedIterator: From 3c49dda43e3996f635710b20c3f1db172cbf205f Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 28 Mar 2023 20:57:09 +0800 Subject: [PATCH 3/4] [coati] update inference readme --- applications/Chat/inference/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/applications/Chat/inference/README.md b/applications/Chat/inference/README.md index 3fb330748a13..6c23bc73cd60 100644 --- a/applications/Chat/inference/README.md +++ b/applications/Chat/inference/README.md @@ -36,6 +36,12 @@ Tha data is from [LLaMA Int8 4bit ChatBot Guide v2](https://rentry.org/llama-tar | LLaMA-30B | 15.8GB | 20GB | 64GB | RTX 3080 20GB, A4500, A5000, 3090, 4090, 6000, Tesla V100 | | LLaMA-65B | 31.2GB | 40GB | 128GB | A100 40GB, 2x3090, 2x4090, A40, RTX A6000, 8000, Titan Ada | +## General setup + +```shell +pip install -r requirements.txt +``` + ## 8-bit setup 8-bit quantization is originally supported by the latest [transformers](https://github.com/huggingface/transformers). Please install it from source. From c7af8050d8ca18e8fc7ff9fdc4a328b30f8b4314 Mon Sep 17 00:00:00 2001 From: ver217 Date: Tue, 28 Mar 2023 20:59:07 +0800 Subject: [PATCH 4/4] [coati] fix inference requirements --- applications/Chat/inference/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/applications/Chat/inference/requirements.txt b/applications/Chat/inference/requirements.txt index 1cb70968ea70..7b0ac18a3b36 100644 --- a/applications/Chat/inference/requirements.txt +++ b/applications/Chat/inference/requirements.txt @@ -9,3 +9,4 @@ torch uvicorn git+https://github.com/huggingface/transformers accelerate +bitsandbytes