diff --git a/README.md b/README.md index f88faf6e1..c772e98ff 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ Here is an example to finetune a GPT-2 base model. ```sh cd data && ./download.sh alpaca && cd - -./scripts/run_finetune.sh \ +bash ./scripts/run_finetune.sh \ --model_name_or_path gpt2 \ --dataset_path data/alpaca/train_conversation \ --output_model_path output_models/finetuned_gpt2 @@ -141,7 +141,7 @@ cd data && ./download.sh alpaca && cd - >```bash >cd data && ./download.sh alpaca && cd - > ->./scripts/run_finetune.sh \ +>bash ./scripts/run_finetune.sh \ > --model_name_or_path meta-llama/Meta-Llama-3-8B \ > --dataset_path data/alpaca/train_conversation \ > --conversation_template llama3 \ @@ -155,7 +155,7 @@ cd data && ./download.sh alpaca && cd - ```sh cd data && ./download.sh alpaca && cd - -./scripts/run_finetune_with_lisa.sh \ +bash ./scripts/run_finetune_with_lisa.sh \ --model_name_or_path meta-llama/Llama-2-7b-hf \ --dataset_path data/alpaca/train_conversation \ --output_model_path output_models/finetuned_llama2_7b \ @@ -169,7 +169,7 @@ cd data && ./download.sh alpaca && cd - >```bash >cd data && ./download.sh alpaca && cd - > ->./scripts/run_finetune_with_lisa.sh \ +>bash ./scripts/run_finetune_with_lisa.sh \ > --model_name_or_path meta-llama/Llama-2-7b-hf \ > --dataset_path data/alpaca/train_conversation \ > --conversation_template llama2 \ @@ -185,7 +185,7 @@ LoRA is a parameter-efficient finetuning algorithm and is more efficient than fu ```sh cd data && ./download.sh alpaca && cd - -./scripts/run_finetune_with_lora.sh \ +bash ./scripts/run_finetune_with_lora.sh \ --model_name_or_path facebook/galactica-1.3b \ --dataset_path data/alpaca/train_conversation \ --output_lora_path output_models/finetuned_galactica_lora @@ -197,7 +197,7 @@ cd data && ./download.sh alpaca && cd - >```bash >cd data && ./download.sh alpaca && cd - > ->./scripts/run_finetune_with_lora.sh \ +>bash ./scripts/run_finetune_with_lora.sh \ > --model_name_or_path meta-llama/Llama-2-7b-hf \ > --dataset_path data/alpaca/train_conversation \ > --conversation_template llama2 \ @@ -209,7 +209,7 @@ cd data && ./download.sh alpaca && cd - > >Merge LoRA weight and the base model into one using: >```sh ->./scripts/run_merge_lora.sh \ +>bash ./scripts/run_merge_lora.sh \ > --model_name_or_path Qwen/Qwen1.5-1.8B \ > --lora_model_path output_models/lora \ > --output_model_path output_models/lora_merged \ @@ -219,9 +219,22 @@ cd data && ./download.sh alpaca && cd - ### Inference After finetuning, you can run the following command to chat with the model. ```sh -./scripts/run_chatbot.sh output_models/finetuned_gpt2 +bash ./scripts/run_chatbot.sh output_models/finetuned_gpt2 ``` +> [!TIP] +> We recommend using vLLM for faster inference. +> +>
+> <details><summary>Faster inference using vLLM</summary>
+>
+>```bash
+>bash ./scripts/run_vllm_inference.sh \
+>    --model_name_or_path Qwen/Qwen2-0.5B \
+>    --dataset_path data/alpaca/test_conversation \
+>    --output_dir data/inference_results
+>```
+> </details>
+ ### Deployment If you want to deploy your own model locally, we provide a gradio-based UI for building chatbots. Running the following command will launch the demo for robin-7b: @@ -240,7 +253,7 @@ You can directly run the LMFlow benchmark evaluation to obtain the results to pa [LLM comparision](https://docs.google.com/spreadsheets/d/1JYh4_pxNzmNA9I0YM2epgRA7VXBIeIGS64gPJBg5NHA/edit?usp=sharing). For example, to run GPT2 XL, one may execute ```sh -./scripts/run_benchmark.sh --model_name_or_path gpt2-xl +bash ./scripts/run_benchmark.sh --model_name_or_path gpt2-xl ``` `--model_name_or_path` is required, you may fill in huggingface model name or local model path here. @@ -288,6 +301,10 @@ To check the evaluation results, you may check `benchmark.log` in `./output_dir/ LMFlow supports both FlashAttention-1 and the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. +* vLLM + + Try vLLM for fast and easy-to-use LLM inference and serving. Thanks for the [great work](https://github.com/vllm-project/vllm)! +
Long Context diff --git a/examples/vllm_inference.py b/examples/vllm_inference.py new file mode 100644 index 000000000..83f89b008 --- /dev/null +++ b/examples/vllm_inference.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. +import logging +import os +import sys + +from transformers import ( + HfArgumentParser +) + +from lmflow.datasets import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.pipeline.auto_pipeline import AutoPipeline +from lmflow.args import ( + ModelArguments, + DatasetArguments, + AutoArguments, +) + + +logger = logging.getLogger(__name__) + + +def main(): + # Parses arguments + pipeline_name = "vllm_inferencer" + PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) + + parser = HfArgumentParser(( + ModelArguments, + DatasetArguments, + PipelineArguments + )) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() + + dataset = Dataset(data_args) + model = HFDecoderModel(model_args) + inferencer = AutoPipeline.get_pipeline( + pipeline_name=pipeline_name, + model_args=model_args, + data_args=data_args, + pipeline_args=pipeline_args + ) + + res = inferencer.inference( + model, + dataset, + release_gpu=False, + enable_decode_inference_result=pipeline_args.enable_decode_inference_result, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/run_vllm_inference.sh b/scripts/run_vllm_inference.sh new file mode 100644 index 000000000..681d2d5ec --- /dev/null +++ b/scripts/run_vllm_inference.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
+ +# Parses arguments +run_name=vllm_inference +model_name_or_path='Qwen/Qwen2-0.5B' +dataset_path=data/alpaca/test_conversation +output_dir=data/inference_results +output_file_name=results.json +apply_chat_template=True + +# Safety related arguments +trust_remote_code=0 + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -r|--run_name) + run_name="$2" + shift + ;; + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + --output_dir) + output_dir="$2" + shift + ;; + --output_file_name) + output_file_name="$2" + shift + ;; + --apply_chat_template) + apply_chat_template="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +# inference +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${run_name} +output_file_path=${output_dir}/${run_name}/${output_file_name} +mkdir -p ${output_dir}/${run_name} ${log_dir} + +python examples/vllm_inference.py \ + --use_vllm True \ + --trust_remote_code ${trust_remote_code} \ + --model_name_or_path ${model_name_or_path} \ + --dataset_path ${dataset_path} \ + --preprocessing_num_workers 16 \ + --random_seed 42 \ + --apply_chat_template ${apply_chat_template} \ + --num_output_sequences 2 \ + --use_beam_search False \ + --temperature 1.0 \ + --top_p 0.9 \ + --max_new_tokens 1024 \ + --save_results True \ + --results_path ${output_file_path} \ + --enable_decode_inference_result False \ + --vllm_gpu_memory_utilization 0.95 \ + --vllm_tensor_parallel_size 2 \ + 2>&1 | tee ${log_dir}/vllm_inference.log \ No newline at end of file diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 7935a89b5..56d8d43e3 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -856,16 +856,12 @@ class InferencerArguments: Whether to save inference results, By default False. results_path : Optional[str] The **json file** path of inference results, By default None. - memory_safe_vllm_inference_detokenize : Optional[bool] - Whether to detokenize the memory safe vllm inference results. + enable_decode_inference_result : Optional[bool] + Whether to detokenize the inference results. NOTE: For iterative align pipelines, whether to detokenize depends on the homogeneity of the policy model and the reward model - (i.e., if they have the same tokenizer). - The reason why `detokenize` for memory safe vllm inference is - included in args is due to the its implementation (i.e., subprocess - rather than within the python codes, thus have to communicate through - command line arguments). + (i.e., if they have the same tokenizer). use_vllm: bool, optional Whether to use VLLM for inference, By default False. 
vllm_tensor_parallel_size: int, optional @@ -964,9 +960,9 @@ class InferencerArguments: default=True, metadata={"help": "whether to apply chat template"}, ) - memory_safe_vllm_inference_detokenize: Optional[bool] = field( + enable_decode_inference_result: Optional[bool] = field( default=False, - metadata={"help": "Whether to detokenize the memory safe vllm inference results."}, + metadata={"help": "Whether to decode the inference results."}, ) # vllm inference args @@ -1254,6 +1250,7 @@ class IterativeAlignerArguments(InferencerArguments): "finetuner": FinetunerArguments, "evaluator": EvaluatorArguments, "inferencer": InferencerArguments, + "vllm_inferencer": InferencerArguments, "raft_aligner": RaftAlignerArguments, "dpo_aligner": DPOAlignerArguments, "rm_tuner": RewardModelingArguments, diff --git a/src/lmflow/models/hf_model_mixin.py b/src/lmflow/models/hf_model_mixin.py index 2721c1947..c01e916da 100644 --- a/src/lmflow/models/hf_model_mixin.py +++ b/src/lmflow/models/hf_model_mixin.py @@ -449,7 +449,7 @@ def __prepare_model_for_vllm_inference( self.backend_model_for_inference = LLM( model=model_args.model_name_or_path, tokenizer=model_args.model_name_or_path, - dtype=model_args.torch_dtype, + dtype=model_args.torch_dtype if model_args.torch_dtype else "auto", load_format="auto", gpu_memory_utilization=vllm_gpu_memory_utilization, tensor_parallel_size=vllm_tensor_parallel_size, diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py index 9d5dd007f..a4c053cf6 100644 --- a/src/lmflow/pipeline/auto_pipeline.py +++ b/src/lmflow/pipeline/auto_pipeline.py @@ -17,12 +17,14 @@ def is_package_version_at_least(package_name, min_version): from lmflow.pipeline.evaluator import Evaluator from lmflow.pipeline.finetuner import Finetuner from lmflow.pipeline.inferencer import Inferencer +from lmflow.pipeline.vllm_inferencer import VLLMInferencer from lmflow.pipeline.dpo_aligner import DPOAligner from lmflow.pipeline.rm_tuner import RewardModelingTuner PIPELINE_MAPPING = { "evaluator": Evaluator, "finetuner": Finetuner, "inferencer": Inferencer, + "vllm_inferencer": VLLMInferencer, "dpo_aligner": DPOAligner, "rm_tuner": RewardModelingTuner, } diff --git a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py index 74c3e7fc2..3502d13e2 100644 --- a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py +++ b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py @@ -31,7 +31,7 @@ def main(): # Parses arguments - pipeline_name = "inferencer" + pipeline_name = "vllm_inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) parser = HfArgumentParser(( @@ -48,13 +48,13 @@ def main(): dataset = Dataset(data_args) model = HFDecoderModel(model_args) - inferencer = VLLMInferencer(model_args, pipeline_args) + inferencer = VLLMInferencer(model_args, data_args, pipeline_args) res = inferencer.inference( model, dataset, release_gpu=False, - detokenize=pipeline_args.memory_safe_vllm_inference_detokenize, + enable_decode_inference_result=pipeline_args.enable_decode_inference_result, ) # use this as a flag, stdout will be captured by the pipeline diff --git a/src/lmflow/pipeline/vllm_inferencer.py b/src/lmflow/pipeline/vllm_inferencer.py index 6d4520a70..a109aef91 100644 --- a/src/lmflow/pipeline/vllm_inferencer.py +++ b/src/lmflow/pipeline/vllm_inferencer.py @@ -24,7 +24,7 @@ DatasetArguments, ) from lmflow.utils.common import make_shell_args_from_dataclass -from lmflow.utils.constants import 
MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG +from lmflow.utils.constants import RETURN_CODE_ERROR_BUFFER logger = logging.getLogger(__name__) @@ -34,9 +34,11 @@ class InferencerWithOffloading(BasePipeline): def __init__( self, model_args: ModelArguments, + data_args: DatasetArguments, inferencer_args: InferencerArguments, ): self.model_args = model_args + self.data_args = data_args self.inferencer_args = inferencer_args self.eos_token_id = AutoTokenizer.from_pretrained(model_args.model_name_or_path).eos_token_id @@ -54,10 +56,11 @@ class VLLMInferencer(InferencerWithOffloading): def __init__( self, model_args: ModelArguments, + data_args: DatasetArguments, inferencer_args: InferencerArguments, ): assert inferencer_args.use_vllm, "The inferencer_args.use_vllm must be True." - super().__init__(model_args, inferencer_args) + super().__init__(model_args, data_args, inferencer_args) self.sampling_params = self.parse_to_sampling_params(inferencer_args) @@ -81,7 +84,7 @@ def inference( self, model: HFDecoderModel, dataset: Dataset, - detokenize: bool = True, + enable_decode_inference_result: bool = True, release_gpu: bool = False, inference_args: Optional[InferencerArguments] = None, ) -> Union[List[List[str]], List[List[List[int]]]]: @@ -96,25 +99,22 @@ def inference( LMFlow Dataset object apply_chat_template : bool, optional Whether to apply chat template to the input, by default True. - detokenize : bool, optional + enable_decode_inference_result : bool, optional Whether to decode after generation, by default False. release_gpu : bool, optional Whether to release gpu resources, by default False. - NOTE: The reason why `release_gpu` and `detokenize` are not in `inference_args` is that - Inferencer may be used by other pipeline, and the pipeline may want to control these - two behaviors dynamically. inference_args : InferencerArguments, optional by default None Returns ------- Union[List[List[str]], List[List[List[int]]]] - When `detokenize = True`, return a list of list of strings. Inner list + When `enable_decode_inference_result = True`, return a list of list of strings. Inner list contains inference_args.num_output_sequences samples for a single prompt (i.e., `len(res[i]) = inference_args.num_output_sequences`). Outer list contains the results for all prompts (i.e., `len(res) = len(dataset)`). - When `detokenize = False`, return a list of list of list of ints + When `enable_decode_inference_result = False`, return a list of list of list of ints (token ids, no decoding after generation). """ if inference_args: @@ -125,7 +125,7 @@ def inference( else: sampling_params = self.sampling_params - sampling_params.detokenize = detokenize + sampling_params.detokenize = enable_decode_inference_result model_input = model.prepare_inputs_for_inference( dataset=dataset, @@ -177,8 +177,7 @@ def __init__( inferencer_args: InferencerArguments, ): assert inferencer_args.save_results, "For MemorySafeVLLMInferencer, `save_results` must be True." 
-        super().__init__(model_args, inferencer_args)
-        self.data_args = data_args
+        super().__init__(model_args, data_args, inferencer_args)
         self.inferencer_file_path = pkg_resources.files("lmflow.pipeline.utils") / "memory_safe_vllm_inference.py"
@@ -200,16 +199,20 @@ def inference(self):
             shell=True,
             preexec_fn=os.setsid
         )
-        # wait for the subprocess to finish (kill cleanly, otherwise may leads to:
-        # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
-        # > at interpreter shutdown, possibly due to daemon threads
-        time.sleep(30)
         logger.info(f"MemorySafeVLLMInference subprocess run finished, info at finish: {cli_res}")
-        if cli_res.returncode != 0:
-            raise RuntimeError(f"Error during MemorySafeVLLMInference.")
+        if cli_res.returncode in RETURN_CODE_ERROR_BUFFER:
+            # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
+            # > at interpreter shutdown, possibly due to daemon threads
+            logger.warning(
+                "^^^^^^^^^^ Please ignore the above error, as it comes from the subprocess. "
+                "This may be due to a kill signal with unfinished stdout/stderr writing in the subprocess. "
+            )
         else:
-            outputs = self.load_inference_results(self.inferencer_args.results_path)
-            logger.info("MemorySafeVLLMInference result captured.")
-
-        return outputs
\ No newline at end of file
+            if cli_res.returncode != 0:
+                raise RuntimeError(f"Error during MemorySafeVLLMInference: {cli_res}")
+
+        outputs = self.load_inference_results(self.inferencer_args.results_path)
+        logger.info("MemorySafeVLLMInference result captured.")
+
+        return outputs
\ No newline at end of file
diff --git a/src/lmflow/utils/constants.py b/src/lmflow/utils/constants.py
index 5506eb55a..bb17899dc 100644
--- a/src/lmflow/utils/constants.py
+++ b/src/lmflow/utils/constants.py
@@ -316,4 +316,12 @@
 }
 
 # vllm inference
-MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
\ No newline at end of file
+MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
+RETURN_CODE_ERROR_BUFFER = [
+    134
+]
+# return code 134:
+# > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
+# > at interpreter shutdown, possibly due to daemon threads
+# The above error, by our observation, is due to a kill signal with unfinished
+# stdout/stderr writing in the subprocess.
\ No newline at end of file
diff --git a/tests/pipeline/test_memory_safe_vllm_inferencer.py b/tests/pipeline/test_memory_safe_vllm_inferencer.py
index b2c2b2803..19c9a6a1e 100644
--- a/tests/pipeline/test_memory_safe_vllm_inferencer.py
+++ b/tests/pipeline/test_memory_safe_vllm_inferencer.py
@@ -29,7 +29,7 @@
     save_results=True,
     results_path='./data/mem_safe_vllm_res.json',
     use_vllm=True,
-    memory_safe_vllm_inference_detokenize=False,
+    enable_decode_inference_result=False,
     vllm_gpu_memory_utilization=0.95,
     vllm_tensor_parallel_size=2,
 )
@@ -58,7 +58,7 @@ def test_inference(self):
         logger.warning(f"test_inference: {test_res}")
 
     def test_inference_detokenize(self):
-        inferencer_args.memory_safe_vllm_inference_detokenize = True
+        inferencer_args.enable_decode_inference_result = True
         self.inferencer = MemorySafeVLLMInferencer(
             model_args=model_args,
             data_args=data_args,
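For reference, here is a minimal sketch of how the pieces introduced by this patch fit together when the new pipeline is used programmatically. It mirrors `examples/vllm_inference.py` and the updated `VLLMInferencer` constructor and `inference()` signature; the model name, dataset path, and results path are placeholder values, not requirements of the API.

```python
# Illustrative sketch only (not part of the patch); paths and model name are placeholders.
from lmflow.args import AutoArguments, DatasetArguments, ModelArguments
from lmflow.datasets import Dataset
from lmflow.models.hf_decoder_model import HFDecoderModel
from lmflow.pipeline.auto_pipeline import AutoPipeline

# "vllm_inferencer" is the pipeline key newly registered in PIPELINE_MAPPING and in the
# pipeline-arguments mapping in lmflow.args; its argument class is InferencerArguments.
pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)

model_args = ModelArguments(model_name_or_path="Qwen/Qwen2-0.5B")
data_args = DatasetArguments(dataset_path="data/alpaca/test_conversation")
pipeline_args = PipelineArguments(
    use_vllm=True,       # required: VLLMInferencer asserts inferencer_args.use_vllm
    save_results=True,
    results_path="data/inference_results/results.json",
)

dataset = Dataset(data_args)
model = HFDecoderModel(model_args)

# VLLMInferencer (and MemorySafeVLLMInferencer) now also receive data_args.
inferencer = AutoPipeline.get_pipeline(
    pipeline_name=pipeline_name,
    model_args=model_args,
    data_args=data_args,
    pipeline_args=pipeline_args,
)

# enable_decode_inference_result replaces the old memory_safe_vllm_inference_detokenize
# flag: True returns decoded strings, False returns token ids.
results = inferencer.inference(
    model,
    dataset,
    release_gpu=False,
    enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)
```

The memory-safe variant drives the same entry point through `lmflow/pipeline/utils/memory_safe_vllm_inference.py` in a subprocess, which is why return code 134 is tolerated via `RETURN_CODE_ERROR_BUFFER` instead of being raised as an error.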