diff --git a/README.md b/README.md
index f88faf6e1..c772e98ff 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ Here is an example to finetune a GPT-2 base model.
```sh
cd data && ./download.sh alpaca && cd -
-./scripts/run_finetune.sh \
+bash ./scripts/run_finetune.sh \
--model_name_or_path gpt2 \
--dataset_path data/alpaca/train_conversation \
--output_model_path output_models/finetuned_gpt2
@@ -141,7 +141,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
->./scripts/run_finetune.sh \
+>bash ./scripts/run_finetune.sh \
> --model_name_or_path meta-llama/Meta-Llama-3-8B \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama3 \
@@ -155,7 +155,7 @@ cd data && ./download.sh alpaca && cd -
```sh
cd data && ./download.sh alpaca && cd -
-./scripts/run_finetune_with_lisa.sh \
+bash ./scripts/run_finetune_with_lisa.sh \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--dataset_path data/alpaca/train_conversation \
--output_model_path output_models/finetuned_llama2_7b \
@@ -169,7 +169,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
->./scripts/run_finetune_with_lisa.sh \
+>bash ./scripts/run_finetune_with_lisa.sh \
> --model_name_or_path meta-llama/Llama-2-7b-hf \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama2 \
@@ -185,7 +185,7 @@ LoRA is a parameter-efficient finetuning algorithm and is more efficient than fu
```sh
cd data && ./download.sh alpaca && cd -
-./scripts/run_finetune_with_lora.sh \
+bash ./scripts/run_finetune_with_lora.sh \
--model_name_or_path facebook/galactica-1.3b \
--dataset_path data/alpaca/train_conversation \
--output_lora_path output_models/finetuned_galactica_lora
@@ -197,7 +197,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
->./scripts/run_finetune_with_lora.sh \
+>bash ./scripts/run_finetune_with_lora.sh \
> --model_name_or_path meta-llama/Llama-2-7b-hf \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama2 \
@@ -209,7 +209,7 @@ cd data && ./download.sh alpaca && cd -
>
>Merge LoRA weight and the base model into one using:
>```sh
->./scripts/run_merge_lora.sh \
+>bash ./scripts/run_merge_lora.sh \
> --model_name_or_path Qwen/Qwen1.5-1.8B \
> --lora_model_path output_models/lora \
> --output_model_path output_models/lora_merged \
@@ -219,9 +219,22 @@ cd data && ./download.sh alpaca && cd -
### Inference
After finetuning, you can run the following command to chat with the model.
```sh
-./scripts/run_chatbot.sh output_models/finetuned_gpt2
+bash ./scripts/run_chatbot.sh output_models/finetuned_gpt2
```
+> [!TIP]
+> We recommend using vLLM for faster inference:
+>
+>```bash
+>bash ./scripts/run_vllm_inference.sh \
+> --model_name_or_path Qwen/Qwen2-0.5B \
+> --dataset_path data/alpaca/test_conversation \
+> --output_dir data/inference_results
+>```
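+> By default, results are saved to `${output_dir}/${run_name}/results.json` (for the command above, `data/inference_results/vllm_inference/results.json`).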
+>
+
### Deployment
If you want to deploy your own model locally, we provide a gradio-based UI for building chatbots.
Running the following command will launch the demo for robin-7b:
@@ -240,7 +253,7 @@ You can directly run the LMFlow benchmark evaluation to obtain the results to pa
[LLM comparision](https://docs.google.com/spreadsheets/d/1JYh4_pxNzmNA9I0YM2epgRA7VXBIeIGS64gPJBg5NHA/edit?usp=sharing).
For example, to run GPT2 XL, one may execute
```sh
-./scripts/run_benchmark.sh --model_name_or_path gpt2-xl
+bash ./scripts/run_benchmark.sh --model_name_or_path gpt2-xl
```
`--model_name_or_path` is required, you may fill in huggingface model name or local model path here.
@@ -288,6 +301,10 @@ To check the evaluation results, you may check `benchmark.log` in `./output_dir/
LMFlow supports both FlashAttention-1 and the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details.
+* vLLM
+
+  Try vLLM for fast and easy-to-use LLM inference and serving. Thanks to the vLLM team for the [great work](https://github.com/vllm-project/vllm)!
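+
+  As a minimal sketch (assuming the alpaca data has already been downloaded with `cd data && ./download.sh alpaca && cd -`), the new script can be run with its defaults:
+
+  ```bash
+  bash ./scripts/run_vllm_inference.sh --model_name_or_path Qwen/Qwen2-0.5B
+  ```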
+
Long Context
diff --git a/examples/vllm_inference.py b/examples/vllm_inference.py
new file mode 100644
index 000000000..83f89b008
--- /dev/null
+++ b/examples/vllm_inference.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
+import logging
+import os
+import sys
+
+from transformers import (
+ HfArgumentParser
+)
+
+from lmflow.datasets import Dataset
+from lmflow.models.hf_decoder_model import HFDecoderModel
+from lmflow.pipeline.auto_pipeline import AutoPipeline
+from lmflow.args import (
+ ModelArguments,
+ DatasetArguments,
+ AutoArguments,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+ # Parses arguments
+ pipeline_name = "vllm_inferencer"
+ PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
+
+ parser = HfArgumentParser((
+ ModelArguments,
+ DatasetArguments,
+ PipelineArguments
+ ))
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+ # If we pass only one argument to the script and it's the path to a json file,
+ # let's parse it to get our arguments.
+ model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+ else:
+ model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
+
+ dataset = Dataset(data_args)
+ model = HFDecoderModel(model_args)
+ inferencer = AutoPipeline.get_pipeline(
+ pipeline_name=pipeline_name,
+ model_args=model_args,
+ data_args=data_args,
+ pipeline_args=pipeline_args
+ )
+
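+    # Run generation with vLLM; depending on `enable_decode_inference_result`, the
+    # results are decoded strings or raw token ids (see VLLMInferencer.inference).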
+ res = inferencer.inference(
+ model,
+ dataset,
+ release_gpu=False,
+ enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
+ )
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/scripts/run_vllm_inference.sh b/scripts/run_vllm_inference.sh
new file mode 100644
index 000000000..681d2d5ec
--- /dev/null
+++ b/scripts/run_vllm_inference.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
+
+# Parses arguments
+run_name=vllm_inference
+model_name_or_path='Qwen/Qwen2-0.5B'
+dataset_path=data/alpaca/test_conversation
+output_dir=data/inference_results
+output_file_name=results.json
+apply_chat_template=True
+
+# Safety related arguments
+trust_remote_code=0
+
+while [[ $# -ge 1 ]]; do
+ key="$1"
+ case ${key} in
+ -r|--run_name)
+ run_name="$2"
+ shift
+ ;;
+ -m|--model_name_or_path)
+ model_name_or_path="$2"
+ shift
+ ;;
+ -d|--dataset_path)
+ dataset_path="$2"
+ shift
+ ;;
+ --output_dir)
+ output_dir="$2"
+ shift
+ ;;
+ --output_file_name)
+ output_file_name="$2"
+ shift
+ ;;
+ --apply_chat_template)
+ apply_chat_template="$2"
+ shift
+ ;;
+ --trust_remote_code)
+ trust_remote_code="$2"
+ shift
+ ;;
+ *)
+ echo "error: unknown option \"${key}\"" 1>&2
+ exit 1
+ esac
+ shift
+done
+
+# inference
+project_dir=$(cd "$(dirname "$0")"/..; pwd)
+log_dir=${project_dir}/log/${run_name}
+output_file_path=${output_dir}/${run_name}/${output_file_name}
+mkdir -p ${output_dir}/${run_name} ${log_dir}
+
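+# NOTE: --vllm_tensor_parallel_size 2 below assumes at least two visible GPUs;
+# set it to 1 when running on a single GPU.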
+python examples/vllm_inference.py \
+ --use_vllm True \
+ --trust_remote_code ${trust_remote_code} \
+ --model_name_or_path ${model_name_or_path} \
+ --dataset_path ${dataset_path} \
+ --preprocessing_num_workers 16 \
+ --random_seed 42 \
+ --apply_chat_template ${apply_chat_template} \
+ --num_output_sequences 2 \
+ --use_beam_search False \
+ --temperature 1.0 \
+ --top_p 0.9 \
+ --max_new_tokens 1024 \
+ --save_results True \
+ --results_path ${output_file_path} \
+ --enable_decode_inference_result False \
+ --vllm_gpu_memory_utilization 0.95 \
+ --vllm_tensor_parallel_size 2 \
+ 2>&1 | tee ${log_dir}/vllm_inference.log
\ No newline at end of file
diff --git a/src/lmflow/args.py b/src/lmflow/args.py
index 7935a89b5..56d8d43e3 100644
--- a/src/lmflow/args.py
+++ b/src/lmflow/args.py
@@ -856,16 +856,12 @@ class InferencerArguments:
Whether to save inference results, By default False.
results_path : Optional[str]
The **json file** path of inference results, By default None.
- memory_safe_vllm_inference_detokenize : Optional[bool]
- Whether to detokenize the memory safe vllm inference results.
+ enable_decode_inference_result : Optional[bool]
+        Whether to decode (detokenize) the inference results.
NOTE: For iterative align pipelines, whether to detokenize depends on
the homogeneity of the policy model and the reward model
- (i.e., if they have the same tokenizer).
- The reason why `detokenize` for memory safe vllm inference is
- included in args is due to the its implementation (i.e., subprocess
- rather than within the python codes, thus have to communicate through
- command line arguments).
+ (i.e., if they have the same tokenizer).
use_vllm: bool, optional
Whether to use VLLM for inference, By default False.
vllm_tensor_parallel_size: int, optional
@@ -964,9 +960,9 @@ class InferencerArguments:
default=True,
metadata={"help": "whether to apply chat template"},
)
- memory_safe_vllm_inference_detokenize: Optional[bool] = field(
+ enable_decode_inference_result: Optional[bool] = field(
default=False,
- metadata={"help": "Whether to detokenize the memory safe vllm inference results."},
+ metadata={"help": "Whether to decode the inference results."},
)
# vllm inference args
@@ -1254,6 +1250,7 @@ class IterativeAlignerArguments(InferencerArguments):
"finetuner": FinetunerArguments,
"evaluator": EvaluatorArguments,
"inferencer": InferencerArguments,
+ "vllm_inferencer": InferencerArguments,
"raft_aligner": RaftAlignerArguments,
"dpo_aligner": DPOAlignerArguments,
"rm_tuner": RewardModelingArguments,
diff --git a/src/lmflow/models/hf_model_mixin.py b/src/lmflow/models/hf_model_mixin.py
index 2721c1947..c01e916da 100644
--- a/src/lmflow/models/hf_model_mixin.py
+++ b/src/lmflow/models/hf_model_mixin.py
@@ -449,7 +449,7 @@ def __prepare_model_for_vllm_inference(
self.backend_model_for_inference = LLM(
model=model_args.model_name_or_path,
tokenizer=model_args.model_name_or_path,
- dtype=model_args.torch_dtype,
+ dtype=model_args.torch_dtype if model_args.torch_dtype else "auto",
load_format="auto",
gpu_memory_utilization=vllm_gpu_memory_utilization,
tensor_parallel_size=vllm_tensor_parallel_size,
diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py
index 9d5dd007f..a4c053cf6 100644
--- a/src/lmflow/pipeline/auto_pipeline.py
+++ b/src/lmflow/pipeline/auto_pipeline.py
@@ -17,12 +17,14 @@ def is_package_version_at_least(package_name, min_version):
from lmflow.pipeline.evaluator import Evaluator
from lmflow.pipeline.finetuner import Finetuner
from lmflow.pipeline.inferencer import Inferencer
+from lmflow.pipeline.vllm_inferencer import VLLMInferencer
from lmflow.pipeline.dpo_aligner import DPOAligner
from lmflow.pipeline.rm_tuner import RewardModelingTuner
PIPELINE_MAPPING = {
"evaluator": Evaluator,
"finetuner": Finetuner,
"inferencer": Inferencer,
+ "vllm_inferencer": VLLMInferencer,
"dpo_aligner": DPOAligner,
"rm_tuner": RewardModelingTuner,
}
diff --git a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py
index 74c3e7fc2..3502d13e2 100644
--- a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py
+++ b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py
@@ -31,7 +31,7 @@
def main():
# Parses arguments
- pipeline_name = "inferencer"
+ pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
parser = HfArgumentParser((
@@ -48,13 +48,13 @@ def main():
dataset = Dataset(data_args)
model = HFDecoderModel(model_args)
- inferencer = VLLMInferencer(model_args, pipeline_args)
+ inferencer = VLLMInferencer(model_args, data_args, pipeline_args)
res = inferencer.inference(
model,
dataset,
release_gpu=False,
- detokenize=pipeline_args.memory_safe_vllm_inference_detokenize,
+ enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)
# use this as a flag, stdout will be captured by the pipeline
diff --git a/src/lmflow/pipeline/vllm_inferencer.py b/src/lmflow/pipeline/vllm_inferencer.py
index 6d4520a70..a109aef91 100644
--- a/src/lmflow/pipeline/vllm_inferencer.py
+++ b/src/lmflow/pipeline/vllm_inferencer.py
@@ -24,7 +24,7 @@
DatasetArguments,
)
from lmflow.utils.common import make_shell_args_from_dataclass
-from lmflow.utils.constants import MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG
+from lmflow.utils.constants import RETURN_CODE_ERROR_BUFFER
logger = logging.getLogger(__name__)
@@ -34,9 +34,11 @@ class InferencerWithOffloading(BasePipeline):
def __init__(
self,
model_args: ModelArguments,
+ data_args: DatasetArguments,
inferencer_args: InferencerArguments,
):
self.model_args = model_args
+ self.data_args = data_args
self.inferencer_args = inferencer_args
self.eos_token_id = AutoTokenizer.from_pretrained(model_args.model_name_or_path).eos_token_id
@@ -54,10 +56,11 @@ class VLLMInferencer(InferencerWithOffloading):
def __init__(
self,
model_args: ModelArguments,
+ data_args: DatasetArguments,
inferencer_args: InferencerArguments,
):
assert inferencer_args.use_vllm, "The inferencer_args.use_vllm must be True."
- super().__init__(model_args, inferencer_args)
+ super().__init__(model_args, data_args, inferencer_args)
self.sampling_params = self.parse_to_sampling_params(inferencer_args)
@@ -81,7 +84,7 @@ def inference(
self,
model: HFDecoderModel,
dataset: Dataset,
- detokenize: bool = True,
+ enable_decode_inference_result: bool = True,
release_gpu: bool = False,
inference_args: Optional[InferencerArguments] = None,
) -> Union[List[List[str]], List[List[List[int]]]]:
@@ -96,25 +99,22 @@ def inference(
LMFlow Dataset object
apply_chat_template : bool, optional
Whether to apply chat template to the input, by default True.
- detokenize : bool, optional
+ enable_decode_inference_result : bool, optional
Whether to decode after generation, by default False.
release_gpu : bool, optional
Whether to release gpu resources, by default False.
- NOTE: The reason why `release_gpu` and `detokenize` are not in `inference_args` is that
- Inferencer may be used by other pipeline, and the pipeline may want to control these
- two behaviors dynamically.
inference_args : InferencerArguments, optional
by default None
Returns
-------
Union[List[List[str]], List[List[List[int]]]]
- When `detokenize = True`, return a list of list of strings. Inner list
+ When `enable_decode_inference_result = True`, return a list of list of strings. Inner list
contains inference_args.num_output_sequences samples for a single prompt
(i.e., `len(res[i]) = inference_args.num_output_sequences`). Outer list
contains the results for all prompts (i.e., `len(res) = len(dataset)`).
- When `detokenize = False`, return a list of list of list of ints
+ When `enable_decode_inference_result = False`, return a list of list of list of ints
(token ids, no decoding after generation).
"""
if inference_args:
@@ -125,7 +125,7 @@ def inference(
else:
sampling_params = self.sampling_params
- sampling_params.detokenize = detokenize
+ sampling_params.detokenize = enable_decode_inference_result
model_input = model.prepare_inputs_for_inference(
dataset=dataset,
@@ -177,8 +177,7 @@ def __init__(
inferencer_args: InferencerArguments,
):
assert inferencer_args.save_results, "For MemorySafeVLLMInferencer, `save_results` must be True."
- super().__init__(model_args, inferencer_args)
- self.data_args = data_args
+ super().__init__(model_args, data_args, inferencer_args)
self.inferencer_file_path = pkg_resources.files("lmflow.pipeline.utils") / "memory_safe_vllm_inference.py"
@@ -200,16 +199,20 @@ def inference(self):
shell=True,
preexec_fn=os.setsid
)
- # wait for the subprocess to finish (kill cleanly, otherwise may leads to:
- # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
- # > at interpreter shutdown, possibly due to daemon threads
- time.sleep(30)
logger.info(f"MemorySafeVLLMInference subprocess run finished, info at finish: {cli_res}")
- if cli_res.returncode != 0:
- raise RuntimeError(f"Error during MemorySafeVLLMInference.")
+ if cli_res.returncode in RETURN_CODE_ERROR_BUFFER:
+ # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
+ # > at interpreter shutdown, possibly due to daemon threads
+ logger.warning(
+ "^^^^^^^^^^ Please ignore the above error, as it comes from the subprocess. "
+                    "This may be due to a kill signal arriving while the subprocess still had unfinished stdout/stderr writes."
+ )
else:
- outputs = self.load_inference_results(self.inferencer_args.results_path)
- logger.info("MemorySafeVLLMInference result captured.")
-
- return outputs
\ No newline at end of file
+ if cli_res.returncode != 0:
+ raise RuntimeError(f"Error during MemorySafeVLLMInference: {cli_res}")
+
+ outputs = self.load_inference_results(self.inferencer_args.results_path)
+ logger.info("MemorySafeVLLMInference result captured.")
+
+ return outputs
\ No newline at end of file
diff --git a/src/lmflow/utils/constants.py b/src/lmflow/utils/constants.py
index 5506eb55a..bb17899dc 100644
--- a/src/lmflow/utils/constants.py
+++ b/src/lmflow/utils/constants.py
@@ -316,4 +316,12 @@
}
# vllm inference
-MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
\ No newline at end of file
+MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
+RETURN_CODE_ERROR_BUFFER = [
+ 134
+]
+# return code 134:
+# > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
+# > at interpreter shutdown, possibly due to daemon threads
+# By our observation, the above error is caused by a kill signal arriving while the
+# subprocess still has unfinished stdout/stderr writes.
\ No newline at end of file
diff --git a/tests/pipeline/test_memory_safe_vllm_inferencer.py b/tests/pipeline/test_memory_safe_vllm_inferencer.py
index b2c2b2803..19c9a6a1e 100644
--- a/tests/pipeline/test_memory_safe_vllm_inferencer.py
+++ b/tests/pipeline/test_memory_safe_vllm_inferencer.py
@@ -29,7 +29,7 @@
save_results=True,
results_path='./data/mem_safe_vllm_res.json',
use_vllm=True,
- memory_safe_vllm_inference_detokenize=False,
+ enable_decode_inference_result=False,
vllm_gpu_memory_utilization=0.95,
vllm_tensor_parallel_size=2,
)
@@ -58,7 +58,7 @@ def test_inference(self):
logger.warning(f"test_inference: {test_res}")
def test_inference_detokenize(self):
- inferencer_args.memory_safe_vllm_inference_detokenize = True
+ inferencer_args.enable_decode_inference_result = True
self.inferencer = MemorySafeVLLMInferencer(
model_args=model_args,
data_args=data_args,