diff --git a/README.md b/README.md index f88faf6e1..c772e98ff 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ Here is an example to finetune a GPT-2 base model. ```sh cd data && ./download.sh alpaca && cd - -./scripts/run_finetune.sh \ +bash ./scripts/run_finetune.sh \ --model_name_or_path gpt2 \ --dataset_path data/alpaca/train_conversation \ --output_model_path output_models/finetuned_gpt2 @@ -141,7 +141,7 @@ cd data && ./download.sh alpaca && cd - >```bash >cd data && ./download.sh alpaca && cd - > ->./scripts/run_finetune.sh \ +>bash ./scripts/run_finetune.sh \ > --model_name_or_path meta-llama/Meta-Llama-3-8B \ > --dataset_path data/alpaca/train_conversation \ > --conversation_template llama3 \ @@ -155,7 +155,7 @@ cd data && ./download.sh alpaca && cd - ```sh cd data && ./download.sh alpaca && cd - -./scripts/run_finetune_with_lisa.sh \ +bash ./scripts/run_finetune_with_lisa.sh \ --model_name_or_path meta-llama/Llama-2-7b-hf \ --dataset_path data/alpaca/train_conversation \ --output_model_path output_models/finetuned_llama2_7b \ @@ -169,7 +169,7 @@ cd data && ./download.sh alpaca && cd - >```bash >cd data && ./download.sh alpaca && cd - > ->./scripts/run_finetune_with_lisa.sh \ +>bash ./scripts/run_finetune_with_lisa.sh \ > --model_name_or_path meta-llama/Llama-2-7b-hf \ > --dataset_path data/alpaca/train_conversation \ > --conversation_template llama2 \ @@ -185,7 +185,7 @@ LoRA is a parameter-efficient finetuning algorithm and is more efficient than fu ```sh cd data && ./download.sh alpaca && cd - -./scripts/run_finetune_with_lora.sh \ +bash ./scripts/run_finetune_with_lora.sh \ --model_name_or_path facebook/galactica-1.3b \ --dataset_path data/alpaca/train_conversation \ --output_lora_path output_models/finetuned_galactica_lora @@ -197,7 +197,7 @@ cd data && ./download.sh alpaca && cd - >```bash >cd data && ./download.sh alpaca && cd - > ->./scripts/run_finetune_with_lora.sh \ +>bash ./scripts/run_finetune_with_lora.sh \ > --model_name_or_path meta-llama/Llama-2-7b-hf \ > --dataset_path data/alpaca/train_conversation \ > --conversation_template llama2 \ @@ -209,7 +209,7 @@ cd data && ./download.sh alpaca && cd - > >Merge LoRA weight and the base model into one using: >```sh ->./scripts/run_merge_lora.sh \ +>bash ./scripts/run_merge_lora.sh \ > --model_name_or_path Qwen/Qwen1.5-1.8B \ > --lora_model_path output_models/lora \ > --output_model_path output_models/lora_merged \ @@ -219,9 +219,22 @@ cd data && ./download.sh alpaca && cd - ### Inference After finetuning, you can run the following command to chat with the model. ```sh -./scripts/run_chatbot.sh output_models/finetuned_gpt2 +bash ./scripts/run_chatbot.sh output_models/finetuned_gpt2 ``` +> [!TIP] +> We recommend using vLLM for faster inference. +> +>
+> <details><summary>Faster inference using vLLM</summary>
+>
+>```bash
+>bash ./scripts/run_vllm_inference.sh \
+>    --model_name_or_path Qwen/Qwen2-0.5B \
+>    --dataset_path data/alpaca/test_conversation \
+>    --output_dir data/inference_results
+>```
+> </details>
+ ### Deployment If you want to deploy your own model locally, we provide a gradio-based UI for building chatbots. Running the following command will launch the demo for robin-7b: @@ -240,7 +253,7 @@ You can directly run the LMFlow benchmark evaluation to obtain the results to pa [LLM comparision](https://docs.google.com/spreadsheets/d/1JYh4_pxNzmNA9I0YM2epgRA7VXBIeIGS64gPJBg5NHA/edit?usp=sharing). For example, to run GPT2 XL, one may execute ```sh -./scripts/run_benchmark.sh --model_name_or_path gpt2-xl +bash ./scripts/run_benchmark.sh --model_name_or_path gpt2-xl ``` `--model_name_or_path` is required, you may fill in huggingface model name or local model path here. @@ -288,6 +301,10 @@ To check the evaluation results, you may check `benchmark.log` in `./output_dir/ LMFlow supports both FlashAttention-1 and the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. +* vLLM + + Try vLLM for fast and easy-to-use LLM inference and serving. Thanks for the [great work](https://github.com/vllm-project/vllm)! +
Long Context diff --git a/examples/vllm_inference.py b/examples/vllm_inference.py new file mode 100644 index 000000000..83f89b008 --- /dev/null +++ b/examples/vllm_inference.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. +import logging +import os +import sys + +from transformers import ( + HfArgumentParser +) + +from lmflow.datasets import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.pipeline.auto_pipeline import AutoPipeline +from lmflow.args import ( + ModelArguments, + DatasetArguments, + AutoArguments, +) + + +logger = logging.getLogger(__name__) + + +def main(): + # Parses arguments + pipeline_name = "vllm_inferencer" + PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) + + parser = HfArgumentParser(( + ModelArguments, + DatasetArguments, + PipelineArguments + )) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() + + dataset = Dataset(data_args) + model = HFDecoderModel(model_args) + inferencer = AutoPipeline.get_pipeline( + pipeline_name=pipeline_name, + model_args=model_args, + data_args=data_args, + pipeline_args=pipeline_args + ) + + res = inferencer.inference( + model, + dataset, + release_gpu=False, + enable_decode_inference_result=pipeline_args.enable_decode_inference_result, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/run_vllm_inference.sh b/scripts/run_vllm_inference.sh new file mode 100644 index 000000000..681d2d5ec --- /dev/null +++ b/scripts/run_vllm_inference.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
+ +# Parses arguments +run_name=vllm_inference +model_name_or_path='Qwen/Qwen2-0.5B' +dataset_path=data/alpaca/test_conversation +output_dir=data/inference_results +output_file_name=results.json +apply_chat_template=True + +# Safety related arguments +trust_remote_code=0 + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -r|--run_name) + run_name="$2" + shift + ;; + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + --output_dir) + output_dir="$2" + shift + ;; + --output_file_name) + output_file_name="$2" + shift + ;; + --apply_chat_template) + apply_chat_template="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +# inference +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${run_name} +output_file_path=${output_dir}/${run_name}/${output_file_name} +mkdir -p ${output_dir}/${run_name} ${log_dir} + +python examples/vllm_inference.py \ + --use_vllm True \ + --trust_remote_code ${trust_remote_code} \ + --model_name_or_path ${model_name_or_path} \ + --dataset_path ${dataset_path} \ + --preprocessing_num_workers 16 \ + --random_seed 42 \ + --apply_chat_template ${apply_chat_template} \ + --num_output_sequences 2 \ + --use_beam_search False \ + --temperature 1.0 \ + --top_p 0.9 \ + --max_new_tokens 1024 \ + --save_results True \ + --results_path ${output_file_path} \ + --enable_decode_inference_result False \ + --vllm_gpu_memory_utilization 0.95 \ + --vllm_tensor_parallel_size 2 \ + 2>&1 | tee ${log_dir}/vllm_inference.log \ No newline at end of file diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 7935a89b5..56d8d43e3 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -856,16 +856,12 @@ class InferencerArguments: Whether to save inference results, By default False. results_path : Optional[str] The **json file** path of inference results, By default None. - memory_safe_vllm_inference_detokenize : Optional[bool] - Whether to detokenize the memory safe vllm inference results. + enable_decode_inference_result : Optional[bool] + Whether to detokenize the inference results. NOTE: For iterative align pipelines, whether to detokenize depends on the homogeneity of the policy model and the reward model - (i.e., if they have the same tokenizer). - The reason why `detokenize` for memory safe vllm inference is - included in args is due to the its implementation (i.e., subprocess - rather than within the python codes, thus have to communicate through - command line arguments). + (i.e., if they have the same tokenizer). use_vllm: bool, optional Whether to use VLLM for inference, By default False. 
vllm_tensor_parallel_size: int, optional @@ -964,9 +960,9 @@ class InferencerArguments: default=True, metadata={"help": "whether to apply chat template"}, ) - memory_safe_vllm_inference_detokenize: Optional[bool] = field( + enable_decode_inference_result: Optional[bool] = field( default=False, - metadata={"help": "Whether to detokenize the memory safe vllm inference results."}, + metadata={"help": "Whether to decode the inference results."}, ) # vllm inference args @@ -1254,6 +1250,7 @@ class IterativeAlignerArguments(InferencerArguments): "finetuner": FinetunerArguments, "evaluator": EvaluatorArguments, "inferencer": InferencerArguments, + "vllm_inferencer": InferencerArguments, "raft_aligner": RaftAlignerArguments, "dpo_aligner": DPOAlignerArguments, "rm_tuner": RewardModelingArguments, diff --git a/src/lmflow/models/hf_model_mixin.py b/src/lmflow/models/hf_model_mixin.py index 2721c1947..c01e916da 100644 --- a/src/lmflow/models/hf_model_mixin.py +++ b/src/lmflow/models/hf_model_mixin.py @@ -449,7 +449,7 @@ def __prepare_model_for_vllm_inference( self.backend_model_for_inference = LLM( model=model_args.model_name_or_path, tokenizer=model_args.model_name_or_path, - dtype=model_args.torch_dtype, + dtype=model_args.torch_dtype if model_args.torch_dtype else "auto", load_format="auto", gpu_memory_utilization=vllm_gpu_memory_utilization, tensor_parallel_size=vllm_tensor_parallel_size, diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py index 9d5dd007f..a4c053cf6 100644 --- a/src/lmflow/pipeline/auto_pipeline.py +++ b/src/lmflow/pipeline/auto_pipeline.py @@ -17,12 +17,14 @@ def is_package_version_at_least(package_name, min_version): from lmflow.pipeline.evaluator import Evaluator from lmflow.pipeline.finetuner import Finetuner from lmflow.pipeline.inferencer import Inferencer +from lmflow.pipeline.vllm_inferencer import VLLMInferencer from lmflow.pipeline.dpo_aligner import DPOAligner from lmflow.pipeline.rm_tuner import RewardModelingTuner PIPELINE_MAPPING = { "evaluator": Evaluator, "finetuner": Finetuner, "inferencer": Inferencer, + "vllm_inferencer": VLLMInferencer, "dpo_aligner": DPOAligner, "rm_tuner": RewardModelingTuner, } diff --git a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py index 74c3e7fc2..3502d13e2 100644 --- a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py +++ b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py @@ -31,7 +31,7 @@ def main(): # Parses arguments - pipeline_name = "inferencer" + pipeline_name = "vllm_inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) parser = HfArgumentParser(( @@ -48,13 +48,13 @@ def main(): dataset = Dataset(data_args) model = HFDecoderModel(model_args) - inferencer = VLLMInferencer(model_args, pipeline_args) + inferencer = VLLMInferencer(model_args, data_args, pipeline_args) res = inferencer.inference( model, dataset, release_gpu=False, - detokenize=pipeline_args.memory_safe_vllm_inference_detokenize, + enable_decode_inference_result=pipeline_args.enable_decode_inference_result, ) # use this as a flag, stdout will be captured by the pipeline diff --git a/src/lmflow/pipeline/vllm_inferencer.py b/src/lmflow/pipeline/vllm_inferencer.py index 6d4520a70..a109aef91 100644 --- a/src/lmflow/pipeline/vllm_inferencer.py +++ b/src/lmflow/pipeline/vllm_inferencer.py @@ -24,7 +24,7 @@ DatasetArguments, ) from lmflow.utils.common import make_shell_args_from_dataclass -from lmflow.utils.constants import 
MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG +from lmflow.utils.constants import RETURN_CODE_ERROR_BUFFER logger = logging.getLogger(__name__) @@ -34,9 +34,11 @@ class InferencerWithOffloading(BasePipeline): def __init__( self, model_args: ModelArguments, + data_args: DatasetArguments, inferencer_args: InferencerArguments, ): self.model_args = model_args + self.data_args = data_args self.inferencer_args = inferencer_args self.eos_token_id = AutoTokenizer.from_pretrained(model_args.model_name_or_path).eos_token_id @@ -54,10 +56,11 @@ class VLLMInferencer(InferencerWithOffloading): def __init__( self, model_args: ModelArguments, + data_args: DatasetArguments, inferencer_args: InferencerArguments, ): assert inferencer_args.use_vllm, "The inferencer_args.use_vllm must be True." - super().__init__(model_args, inferencer_args) + super().__init__(model_args, data_args, inferencer_args) self.sampling_params = self.parse_to_sampling_params(inferencer_args) @@ -81,7 +84,7 @@ def inference( self, model: HFDecoderModel, dataset: Dataset, - detokenize: bool = True, + enable_decode_inference_result: bool = True, release_gpu: bool = False, inference_args: Optional[InferencerArguments] = None, ) -> Union[List[List[str]], List[List[List[int]]]]: @@ -96,25 +99,22 @@ def inference( LMFlow Dataset object apply_chat_template : bool, optional Whether to apply chat template to the input, by default True. - detokenize : bool, optional + enable_decode_inference_result : bool, optional Whether to decode after generation, by default False. release_gpu : bool, optional Whether to release gpu resources, by default False. - NOTE: The reason why `release_gpu` and `detokenize` are not in `inference_args` is that - Inferencer may be used by other pipeline, and the pipeline may want to control these - two behaviors dynamically. inference_args : InferencerArguments, optional by default None Returns ------- Union[List[List[str]], List[List[List[int]]]] - When `detokenize = True`, return a list of list of strings. Inner list + When `enable_decode_inference_result = True`, return a list of list of strings. Inner list contains inference_args.num_output_sequences samples for a single prompt (i.e., `len(res[i]) = inference_args.num_output_sequences`). Outer list contains the results for all prompts (i.e., `len(res) = len(dataset)`). - When `detokenize = False`, return a list of list of list of ints + When `enable_decode_inference_result = False`, return a list of list of list of ints (token ids, no decoding after generation). """ if inference_args: @@ -125,7 +125,7 @@ def inference( else: sampling_params = self.sampling_params - sampling_params.detokenize = detokenize + sampling_params.detokenize = enable_decode_inference_result model_input = model.prepare_inputs_for_inference( dataset=dataset, @@ -177,8 +177,7 @@ def __init__( inferencer_args: InferencerArguments, ): assert inferencer_args.save_results, "For MemorySafeVLLMInferencer, `save_results` must be True." 
-        super().__init__(model_args, inferencer_args)
-        self.data_args = data_args
+        super().__init__(model_args, data_args, inferencer_args)
         self.inferencer_file_path = pkg_resources.files("lmflow.pipeline.utils") / "memory_safe_vllm_inference.py"
@@ -200,16 +199,20 @@ def inference(self):
             shell=True,
             preexec_fn=os.setsid
         )
-        # wait for the subprocess to finish (kill cleanly, otherwise may leads to:
-        # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
-        # > at interpreter shutdown, possibly due to daemon threads
-        time.sleep(30)
         logger.info(f"MemorySafeVLLMInference subprocess run finished, info at finish: {cli_res}")
-        if cli_res.returncode != 0:
-            raise RuntimeError(f"Error during MemorySafeVLLMInference.")
+        if cli_res.returncode in RETURN_CODE_ERROR_BUFFER:
+            # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
+            # > at interpreter shutdown, possibly due to daemon threads
+            logger.warning(
+                "^^^^^^^^^^ Please ignore the above error, as it comes from the subprocess. "
+                "This may be due to a kill signal with unfinished stdout/stderr writing in the subprocess. "
+            )
         else:
-            outputs = self.load_inference_results(self.inferencer_args.results_path)
-            logger.info("MemorySafeVLLMInference result captured.")
-
-        return outputs
\ No newline at end of file
+            if cli_res.returncode != 0:
+                raise RuntimeError(f"Error during MemorySafeVLLMInference: {cli_res}")
+
+        outputs = self.load_inference_results(self.inferencer_args.results_path)
+        logger.info("MemorySafeVLLMInference result captured.")
+
+        return outputs
\ No newline at end of file
diff --git a/src/lmflow/utils/constants.py b/src/lmflow/utils/constants.py
index 5506eb55a..bb17899dc 100644
--- a/src/lmflow/utils/constants.py
+++ b/src/lmflow/utils/constants.py
@@ -316,4 +316,12 @@
 }
 
 # vllm inference
-MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
\ No newline at end of file
+MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE"
+RETURN_CODE_ERROR_BUFFER = [
+    134
+]
+# return code 134:
+# > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''>
+# > at interpreter shutdown, possibly due to daemon threads
+# The above error, by our observation, is due to a kill signal with unfinished
+# stdout/stderr writing in the subprocess.
\ No newline at end of file
diff --git a/tests/pipeline/test_memory_safe_vllm_inferencer.py b/tests/pipeline/test_memory_safe_vllm_inferencer.py
index b2c2b2803..19c9a6a1e 100644
--- a/tests/pipeline/test_memory_safe_vllm_inferencer.py
+++ b/tests/pipeline/test_memory_safe_vllm_inferencer.py
@@ -29,7 +29,7 @@
     save_results=True,
     results_path='./data/mem_safe_vllm_res.json',
     use_vllm=True,
-    memory_safe_vllm_inference_detokenize=False,
+    enable_decode_inference_result=False,
     vllm_gpu_memory_utilization=0.95,
     vllm_tensor_parallel_size=2,
 )
@@ -58,7 +58,7 @@ def test_inference(self):
         logger.warning(f"test_inference: {test_res}")
 
     def test_inference_detokenize(self):
-        inferencer_args.memory_safe_vllm_inference_detokenize = True
+        inferencer_args.enable_decode_inference_result = True
         self.inferencer = MemorySafeVLLMInferencer(
             model_args=model_args,
             data_args=data_args,
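For reference, here is a minimal sketch of how the pieces introduced by this patch fit together when the new pipeline is used programmatically. It mirrors `examples/vllm_inference.py` and the updated `VLLMInferencer` constructor and `inference()` signature; the model name, dataset path, and results path are placeholder values, not requirements of the API.

```python
# Illustrative sketch only (not part of the patch); paths and model name are placeholders.
from lmflow.args import AutoArguments, DatasetArguments, ModelArguments
from lmflow.datasets import Dataset
from lmflow.models.hf_decoder_model import HFDecoderModel
from lmflow.pipeline.auto_pipeline import AutoPipeline

# "vllm_inferencer" is the pipeline key newly registered in PIPELINE_MAPPING and in the
# pipeline-arguments mapping in lmflow.args; its argument class is InferencerArguments.
pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)

model_args = ModelArguments(model_name_or_path="Qwen/Qwen2-0.5B")
data_args = DatasetArguments(dataset_path="data/alpaca/test_conversation")
pipeline_args = PipelineArguments(
    use_vllm=True,       # required: VLLMInferencer asserts inferencer_args.use_vllm
    save_results=True,
    results_path="data/inference_results/results.json",
)

dataset = Dataset(data_args)
model = HFDecoderModel(model_args)

# VLLMInferencer (and MemorySafeVLLMInferencer) now also receive data_args.
inferencer = AutoPipeline.get_pipeline(
    pipeline_name=pipeline_name,
    model_args=model_args,
    data_args=data_args,
    pipeline_args=pipeline_args,
)

# enable_decode_inference_result replaces the old memory_safe_vllm_inference_detokenize
# flag: True returns decoded strings, False returns token ids.
results = inferencer.inference(
    model,
    dataset,
    release_gpu=False,
    enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)
```

The memory-safe variant drives the same entry point through `lmflow/pipeline/utils/memory_safe_vllm_inference.py` in a subprocess, which is why return code 134 is tolerated via `RETURN_CODE_ERROR_BUFFER` instead of being raised as an error.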