From b7ed014b33d10d51a91302c7f8771d5944cea36e Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Tue, 1 Apr 2025 19:31:00 -0700 Subject: [PATCH 01/32] Update qwen2.md --- docs/source/en/model_doc/qwen2.md | 233 ++++++++++++++++++++++-------- 1 file changed, 175 insertions(+), 58 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index dc6201d0de5e..4465ed3d091e 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -14,85 +14,202 @@ rendered properly in your Markdown viewer. --> -# Qwen2 - -
-PyTorch -FlashAttention -SDPA +
+
+ PyTorch + FlashAttention + SDPA +
-## Overview - -Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc. - -### Model Details +# Qwen2 -Qwen2 is a language model series including decoder language models of different model sizes. For each size, we release the base language model and the aligned chat model. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention, etc. Additionally, we have an improved tokenizer adaptive to multiple natural languages and codes. +[Qwen2](https://huggingface.co/collections/qwen/qwen2-6659360b3352f8ffa74171a3) is a family of large language models (LLMs) developed by the Qwen team, Alibaba Cloud. Available in sizes from 0.5B to 72B parameters, including a Mixture-of-Experts model, Qwen2 offers both pretrained base models and instruction-tuned chat models. The models are built on the Transformer architecture featuring enhancements like SwiGLU activation, group query attention (GQA), and a mix of sliding window and full attention. Qwen2 models utilize an improved tokenizer optimized for multiple languages and code, and support context lengths up to 131,072 tokens (via YARN scaling for larger models). They demonstrate state-of-the-art performance across various benchmarks. +You can find all the official Qwen2 checkpoints under the [Qwen2 Collection](https://huggingface.co/collections/qwen/qwen2-6659360b3352f8ffa74171a3). -## Usage tips +> [!TIP] +> Click on the Qwen2 models in the right sidebar for more examples of how to apply Qwen2 to different language tasks. -`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line using the instruction-tuned models. -In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. + + ```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer ->>> device = "cuda" # the device to load the model onto - ->>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto") ->>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") - ->>> prompt = "Give me a short introduction to large language model." 
- ->>> messages = [{"role": "user", "content": prompt}] - ->>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - ->>> model_inputs = tokenizer([text], return_tensors="pt").to(device) - ->>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) - ->>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] - ->>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -``` - -## Qwen2Config - +import torch +from transformers import pipeline + +# Use a pipeline as a high-level helper +pipe = pipeline( + task="text-generation", + model="Qwen/Qwen2-7B-Instruct", + torch_dtype=torch.bfloat16, + device_map="auto" # or device=0 for single GPU +) + +messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me about the Qwen2 model family."}, +] +# We use the tokenizer's chat template to format the messages +# But the pipeline handles this automatically for chat models +outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) +print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's response + + + + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +device = "cuda" # or "cpu" or "mps" +model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen2-7B-Instruct", + torch_dtype="auto", # use torch.bfloat16 for optimal performance + device_map="auto", + attn_implementation="flash_attention_2" # use "sdpa" or None for CPU compatibility +) +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") + +prompt = "Give me a short introduction to large language models." +messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} +] +text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True +) +model_inputs = tokenizer([text], return_tensors="pt").to(device) + +generated_ids = model.generate( + model_inputs.input_ids, + max_new_tokens=512, + do_sample=True, # Enable sampling + temperature=0.7, # Control randomness + top_k=50, # Control diversity + top_p=0.95 # Control diversity +) +# Slice the output to remove the input tokens +generated_ids = [ + output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) +] + +response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(response) + + + + +# Make sure you are logged in (`huggingface-cli login`) +# Requires transformers>=4.37.0 +transformers-cli chat --model Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device auto +# You can then type your messages in the terminal + + + + +Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the Quantization overview for more available quantization backends. + +The example below uses bitsandbytes to quantize the weights to 4-bits. Note that quantization usually works best with base models. 
+ +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig + +# Configure quantization +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, # Recommended compute dtype + bnb_4bit_quant_type="nf4", # Use NF4 quantization + bnb_4bit_use_double_quant=True, # Enable double quantization +) + +# Load tokenizer and quantized model +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") # Use base model +model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen2-7B", # Use base model + torch_dtype=torch.bfloat16, # Load weights in bfloat16 before quantizing + device_map="auto", + quantization_config=quantization_config, + attn_implementation="flash_attention_2" # Recommended for speed +) + +# Prepare input and generate +inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") # Adjust device if needed +outputs = model.generate(**inputs, max_new_tokens=100) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + +Processing Long Contexts +Larger Qwen2 models (like Qwen2-72B) support context lengths up to 131,072 tokens using YARN scaling. + +For deployment, using vLLM is recommended. To enable long-context capabilities: + +1. Install vLLM: + +pip install "vllm>=0.4.3" +# Or install from source: https://github.com/vllm-project/vllm/ + +2. Configure Model Settings: Modify the model's config.json file by adding the rope_scaling attribute: + + { + "architectures": [ + "Qwen2ForCausalLM" + ], + // ... other config ... + "vocab_size": 152064, // Example vocab size, check your model's config + + // --- Add this section --- + "rope_scaling": { + "factor": 4.0, // Adjust factor based on model/desired length if needed + "original_max_position_embeddings": 32768, // Base context length before scaling + "type": "yarn" + } + // --- End section --- + } + +3. Deploy with vLLM: Start a vLLM server pointing to the modified model weights directory: + +python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2-72B-Instruct --model /path/to/your/modified/qwen2/weights + +You can then interact with the API (e.g., via curl) using the extended context length. + +[!NOTE] +vLLM currently supports static YARN, meaning the scaling factor is fixed. This might slightly impact performance on very short texts compared to not using YARN scaling. Consider enabling rope_scaling only when long context processing is essential. + +Notes +Qwen2 requires transformers>=4.37.0 for full support. Ensure your transformers library is up-to-date. 
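
A minimal sketch of how that version requirement can be checked at runtime (the `packaging` helper is assumed to be available, as it ships as a dependency of `transformers`):

```python
# Hypothetical sanity check: confirm the installed Transformers release supports Qwen2.
from packaging import version
import transformers

if version.parse(transformers.__version__) < version.parse("4.37.0"):
    raise RuntimeError(
        f"Qwen2 requires transformers>=4.37.0, found {transformers.__version__}"
    )
```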
+ +Qwen2Config [[autodoc]] Qwen2Config -## Qwen2Tokenizer - +Qwen2Tokenizer [[autodoc]] Qwen2Tokenizer - - save_vocabulary - -## Qwen2TokenizerFast +- save_vocabulary +Qwen2TokenizerFast [[autodoc]] Qwen2TokenizerFast -## Qwen2Model - +Qwen2Model [[autodoc]] Qwen2Model - - forward - -## Qwen2ForCausalLM +- forward +Qwen2ForCausalLM [[autodoc]] Qwen2ForCausalLM - - forward - -## Qwen2ForSequenceClassification +- forward +Qwen2ForSequenceClassification [[autodoc]] Qwen2ForSequenceClassification - - forward - -## Qwen2ForTokenClassification +- forward +Qwen2ForTokenClassification [[autodoc]] Qwen2ForTokenClassification - - forward - -## Qwen2ForQuestionAnswering +- forward +Qwen2ForQuestionAnswering [[autodoc]] Qwen2ForQuestionAnswering - - forward +- forward + + + From f575b89ec0f6aff5de2dadeba3c904339f381ed5 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Tue, 1 Apr 2025 19:32:04 -0700 Subject: [PATCH 02/32] Update qwen2.md --- docs/source/en/model_doc/qwen2.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 4465ed3d091e..e13481544e2d 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -56,6 +56,7 @@ messages = [ # But the pipeline handles this automatically for chat models outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's response +``` From fbd92fbb2ae74544598ea402023035a8faa8c164 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Tue, 1 Apr 2025 19:33:58 -0700 Subject: [PATCH 03/32] Update qwen2.md --- docs/source/en/model_doc/qwen2.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index e13481544e2d..a31aa554d105 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -61,6 +61,7 @@ print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's respo +``` import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -100,14 +101,17 @@ generated_ids = [ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] print(response) +``` +``` # Make sure you are logged in (`huggingface-cli login`) # Requires transformers>=4.37.0 transformers-cli chat --model Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device auto # You can then type your messages in the terminal +``` @@ -116,6 +120,7 @@ Quantization reduces the memory burden of large models by representing the weigh The example below uses bitsandbytes to quantize the weights to 4-bits. Note that quantization usually works best with base models. +``` import torch from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig @@ -141,6 +146,7 @@ model = AutoModelForCausalLM.from_pretrained( inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") # Adjust device if needed outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` Processing Long Contexts Larger Qwen2 models (like Qwen2-72B) support context lengths up to 131,072 tokens using YARN scaling. @@ -149,11 +155,14 @@ For deployment, using vLLM is recommended. To enable long-context capabilities: 1. Install vLLM: +``` pip install "vllm>=0.4.3" # Or install from source: https://github.com/vllm-project/vllm/ +``` 2. 
Configure Model Settings: Modify the model's config.json file by adding the rope_scaling attribute: +``` { "architectures": [ "Qwen2ForCausalLM" @@ -169,10 +178,13 @@ pip install "vllm>=0.4.3" } // --- End section --- } +``` 3. Deploy with vLLM: Start a vLLM server pointing to the modified model weights directory: +``` python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2-72B-Instruct --model /path/to/your/modified/qwen2/weights +``` You can then interact with the API (e.g., via curl) using the extended context length. From 671239df70591a6a1d87d2b8f4a698388330af8b Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Tue, 1 Apr 2025 19:37:36 -0700 Subject: [PATCH 04/32] Update qwen2.md --- docs/source/en/model_doc/qwen2.md | 119 +++++++++++++++--------------- 1 file changed, 61 insertions(+), 58 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index a31aa554d105..983c0564cec6 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -61,7 +61,7 @@ print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's respo -``` +```python import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -106,7 +106,7 @@ print(response) -``` +```bash # Make sure you are logged in (`huggingface-cli login`) # Requires transformers>=4.37.0 transformers-cli chat --model Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device auto @@ -116,11 +116,11 @@ transformers-cli chat --model Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_i -Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the Quantization overview for more available quantization backends. +Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. -The example below uses bitsandbytes to quantize the weights to 4-bits. Note that quantization usually works best with base models. +The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits. Note that quantization usually works best with base models. -``` +```python import torch from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig @@ -148,81 +148,84 @@ outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` -Processing Long Contexts -Larger Qwen2 models (like Qwen2-72B) support context lengths up to 131,072 tokens using YARN scaling. - -For deployment, using vLLM is recommended. To enable long-context capabilities: - -1. Install vLLM: +## Processing Long Contexts -``` -pip install "vllm>=0.4.3" -# Or install from source: https://github.com/vllm-project/vllm/ -``` +Larger Qwen2 models (like Qwen2-72B) support context lengths up to 131,072 tokens using [YARN](https://arxiv.org/abs/2309.00071) scaling. -2. Configure Model Settings: Modify the model's config.json file by adding the rope_scaling attribute: +For deployment, using vLLM is recommended. To enable long-context capabilities: -``` - { - "architectures": [ - "Qwen2ForCausalLM" - ], - // ... other config ... 
- "vocab_size": 152064, // Example vocab size, check your model's config - - // --- Add this section --- - "rope_scaling": { - "factor": 4.0, // Adjust factor based on model/desired length if needed - "original_max_position_embeddings": 32768, // Base context length before scaling - "type": "yarn" +1. **Install vLLM**: + ```bash + pip install "vllm>=0.4.3" + # Or install from source: https://github.com/vllm-project/vllm/ + ``` + +2. **Configure Model Settings**: Modify the model's `config.json` file by adding the `rope_scaling` attribute: + ```json + { + "architectures": [ + "Qwen2ForCausalLM" + ], + // ... other config ... + "vocab_size": 152064, // Example vocab size, check your model's config + + // --- Add this section --- + "rope_scaling": { + "factor": 4.0, // Adjust factor based on model/desired length if needed + "original_max_position_embeddings": 32768, // Base context length before scaling + "type": "yarn" + } + // --- End section --- } - // --- End section --- - } -``` + ``` -3. Deploy with vLLM: Start a vLLM server pointing to the modified model weights directory: +3. **Deploy with vLLM**: Start a vLLM server pointing to the modified model weights directory: + ```bash + python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2-72B-Instruct --model /path/to/your/modified/qwen2/weights + ``` + You can then interact with the API (e.g., via `curl`) using the extended context length. -``` -python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2-72B-Instruct --model /path/to/your/modified/qwen2/weights -``` +> [!NOTE] +> vLLM currently supports static YARN, meaning the scaling factor is fixed. This might slightly impact performance on very short texts compared to not using YARN scaling. Consider enabling `rope_scaling` only when long context processing is essential. -You can then interact with the API (e.g., via curl) using the extended context length. +## Notes -[!NOTE] -vLLM currently supports static YARN, meaning the scaling factor is fixed. This might slightly impact performance on very short texts compared to not using YARN scaling. Consider enabling rope_scaling only when long context processing is essential. +- Qwen2 requires `transformers>=4.37.0` for full support. Ensure your `transformers` library is up-to-date. -Notes -Qwen2 requires transformers>=4.37.0 for full support. Ensure your transformers library is up-to-date. 
+## Qwen2Config -Qwen2Config [[autodoc]] Qwen2Config -Qwen2Tokenizer +## Qwen2Tokenizer + [[autodoc]] Qwen2Tokenizer -- save_vocabulary + - save_vocabulary + +## Qwen2TokenizerFast -Qwen2TokenizerFast [[autodoc]] Qwen2TokenizerFast -Qwen2Model +## Qwen2Model + [[autodoc]] Qwen2Model -- forward + - forward + +## Qwen2ForCausalLM -Qwen2ForCausalLM [[autodoc]] Qwen2ForCausalLM -- forward + - forward + +## Qwen2ForSequenceClassification -Qwen2ForSequenceClassification [[autodoc]] Qwen2ForSequenceClassification -- forward + - forward -Qwen2ForTokenClassification -[[autodoc]] Qwen2ForTokenClassification -- forward +## Qwen2ForTokenClassification -Qwen2ForQuestionAnswering -[[autodoc]] Qwen2ForQuestionAnswering -- forward +[[autodoc]] Qwen2ForTokenClassification + - forward +## Qwen2ForQuestionAnswering - +[[autodoc]] Qwen2ForQuestionAnswering + - forward From 9ecb39378a7d4dc181981bac69fb447b02ec3e37 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:20 -0700 Subject: [PATCH 05/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 983c0564cec6..5b702e31e2bb 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -24,7 +24,7 @@ rendered properly in your Markdown viewer. # Qwen2 -[Qwen2](https://huggingface.co/collections/qwen/qwen2-6659360b3352f8ffa74171a3) is a family of large language models (LLMs) developed by the Qwen team, Alibaba Cloud. Available in sizes from 0.5B to 72B parameters, including a Mixture-of-Experts model, Qwen2 offers both pretrained base models and instruction-tuned chat models. The models are built on the Transformer architecture featuring enhancements like SwiGLU activation, group query attention (GQA), and a mix of sliding window and full attention. Qwen2 models utilize an improved tokenizer optimized for multiple languages and code, and support context lengths up to 131,072 tokens (via YARN scaling for larger models). They demonstrate state-of-the-art performance across various benchmarks. +[Qwen2](https://huggingface.co/papers/2407.10671) is a family of large language models (pretrained, instruction-tuned and mixture-of-experts) available in sizes from 0.5B to 72B parameters. The models are built on the Transformer architecture featuring enhancements like group query attention (GQA), rotary positional embeddings (RoPE), a mix of sliding window and full attention, and dual chunk attention with YARN for training stability. Qwen2 models support multiple languages and context lengths up to 131,072 tokens. You can find all the official Qwen2 checkpoints under the [Qwen2 Collection](https://huggingface.co/collections/qwen/qwen2-6659360b3352f8ffa74171a3). From 3c2c2d01b445e77f63f5035a2fb2802b66aabd87 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:26 -0700 Subject: [PATCH 06/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 5b702e31e2bb..c5a24c0656e0 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -26,7 +26,7 @@ rendered properly in your Markdown viewer. 
[Qwen2](https://huggingface.co/papers/2407.10671) is a family of large language models (pretrained, instruction-tuned and mixture-of-experts) available in sizes from 0.5B to 72B parameters. The models are built on the Transformer architecture featuring enhancements like group query attention (GQA), rotary positional embeddings (RoPE), a mix of sliding window and full attention, and dual chunk attention with YARN for training stability. Qwen2 models support multiple languages and context lengths up to 131,072 tokens. -You can find all the official Qwen2 checkpoints under the [Qwen2 Collection](https://huggingface.co/collections/qwen/qwen2-6659360b3352f8ffa74171a3). +You can find all the official Qwen2 checkpoints under the [Qwen2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) collection. > [!TIP] > Click on the Qwen2 models in the right sidebar for more examples of how to apply Qwen2 to different language tasks. From f8757c20f5f245c8df5942ff310db24ea3e1a750 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:32 -0700 Subject: [PATCH 07/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index c5a24c0656e0..ce6b51427074 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -45,7 +45,7 @@ pipe = pipeline( task="text-generation", model="Qwen/Qwen2-7B-Instruct", torch_dtype=torch.bfloat16, - device_map="auto" # or device=0 for single GPU + device_map=0 ) messages = [ From 687c61428872b096b4aad5b1e5c52020571251ee Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:37 -0700 Subject: [PATCH 08/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index ce6b51427074..fb902a806138 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -109,7 +109,7 @@ print(response) ```bash # Make sure you are logged in (`huggingface-cli login`) # Requires transformers>=4.37.0 -transformers-cli chat --model Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device auto +transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0 # You can then type your messages in the terminal ``` From a9e82c08931cb180878a0643e1d199a619e8bd23 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:43 -0700 Subject: [PATCH 09/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index fb902a806138..f85d94e1333e 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -107,8 +107,7 @@ print(response) ```bash -# Make sure you are logged in (`huggingface-cli login`) -# Requires transformers>=4.37.0 +# pip install -U flash-attn --no-build-isolation transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto 
--attn_implementation flash_attention_2 --device 0 # You can then type your messages in the terminal ``` From d5118230e22984f980603a5bbb3d062efe6e49db Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:52 -0700 Subject: [PATCH 10/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index f85d94e1333e..fabecfaa9818 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -43,7 +43,7 @@ from transformers import pipeline # Use a pipeline as a high-level helper pipe = pipeline( task="text-generation", - model="Qwen/Qwen2-7B-Instruct", + model="Qwen/Qwen2-1.5B-Instruct", torch_dtype=torch.bfloat16, device_map=0 ) From dc3c8fe2f817167e455a0295b3514ec4bcb015b5 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:37:57 -0700 Subject: [PATCH 11/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index fabecfaa9818..cbb2847a2b33 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -68,7 +68,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer device = "cuda" # or "cpu" or "mps" model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-7B-Instruct", - torch_dtype="auto", # use torch.bfloat16 for optimal performance + torch_dtype=torch.bfloat16, # use torch.bfloat16 for optimal performance device_map="auto", attn_implementation="flash_attention_2" # use "sdpa" or None for CPU compatibility ) From 11696de442c127bd318e353254180e36df59875c Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:38:02 -0700 Subject: [PATCH 12/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index cbb2847a2b33..c8b78ff2eff6 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -88,6 +88,7 @@ model_inputs = tokenizer([text], return_tensors="pt").to(device) generated_ids = model.generate( model_inputs.input_ids, + cache_implementation="static", max_new_tokens=512, do_sample=True, # Enable sampling temperature=0.7, # Control randomness From 285509107d3844da22b91c7fd6195720005a5caf Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:38:17 -0700 Subject: [PATCH 13/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index c8b78ff2eff6..5bcd7eaa06d9 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -67,7 +67,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer device = "cuda" # or "cpu" or "mps" model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen2-7B-Instruct", + "Qwen/Qwen2-1.5B-Instruct", torch_dtype=torch.bfloat16, # use torch.bfloat16 for optimal performance 
device_map="auto", attn_implementation="flash_attention_2" # use "sdpa" or None for CPU compatibility From d805987cfc45ca7a8d1f365a26606f7c6d56e592 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:48:23 -0700 Subject: [PATCH 14/32] Update qwen2.md --- docs/source/en/model_doc/qwen2.md | 63 +++++++------------------------ 1 file changed, 13 insertions(+), 50 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 5bcd7eaa06d9..a4924a247c1c 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -65,10 +65,10 @@ print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's respo import torch from transformers import AutoModelForCausalLM, AutoTokenizer -device = "cuda" # or "cpu" or "mps" +device = "cuda" model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-1.5B-Instruct", - torch_dtype=torch.bfloat16, # use torch.bfloat16 for optimal performance + torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="flash_attention_2" # use "sdpa" or None for CPU compatibility ) @@ -90,10 +90,10 @@ generated_ids = model.generate( model_inputs.input_ids, cache_implementation="static", max_new_tokens=512, - do_sample=True, # Enable sampling - temperature=0.7, # Control randomness - top_k=50, # Control diversity - top_p=0.95 # Control diversity + do_sample=True, + temperature=0.7, + top_k=50, + top_p=0.95 ) # Slice the output to remove the input tokens generated_ids = [ @@ -127,64 +127,27 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig # Configure quantization quantization_config = BitsAndBytesConfig( load_in_4bit=True, - bnb_4bit_compute_dtype=torch.bfloat16, # Recommended compute dtype - bnb_4bit_quant_type="nf4", # Use NF4 quantization - bnb_4bit_use_double_quant=True, # Enable double quantization + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, ) # Load tokenizer and quantized model -tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") # Use base model +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-7B", # Use base model - torch_dtype=torch.bfloat16, # Load weights in bfloat16 before quantizing + torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config, - attn_implementation="flash_attention_2" # Recommended for speed + attn_implementation="flash_attention_2" ) # Prepare input and generate -inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") # Adjust device if needed +inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` -## Processing Long Contexts - -Larger Qwen2 models (like Qwen2-72B) support context lengths up to 131,072 tokens using [YARN](https://arxiv.org/abs/2309.00071) scaling. - -For deployment, using vLLM is recommended. To enable long-context capabilities: - -1. **Install vLLM**: - ```bash - pip install "vllm>=0.4.3" - # Or install from source: https://github.com/vllm-project/vllm/ - ``` - -2. **Configure Model Settings**: Modify the model's `config.json` file by adding the `rope_scaling` attribute: - ```json - { - "architectures": [ - "Qwen2ForCausalLM" - ], - // ... other config ... 
- "vocab_size": 152064, // Example vocab size, check your model's config - - // --- Add this section --- - "rope_scaling": { - "factor": 4.0, // Adjust factor based on model/desired length if needed - "original_max_position_embeddings": 32768, // Base context length before scaling - "type": "yarn" - } - // --- End section --- - } - ``` - -3. **Deploy with vLLM**: Start a vLLM server pointing to the modified model weights directory: - ```bash - python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2-72B-Instruct --model /path/to/your/modified/qwen2/weights - ``` - You can then interact with the API (e.g., via `curl`) using the extended context length. - > [!NOTE] > vLLM currently supports static YARN, meaning the scaling factor is fixed. This might slightly impact performance on very short texts compared to not using YARN scaling. Consider enabling `rope_scaling` only when long context processing is essential. From 7d5cf1fdd70ad95f028464a3f31ac23ac2b09b9c Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:51:01 -0700 Subject: [PATCH 15/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index a4924a247c1c..b54bc169aad7 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -70,7 +70,7 @@ model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-1.5B-Instruct", torch_dtype=torch.bfloat16, device_map="auto", - attn_implementation="flash_attention_2" # use "sdpa" or None for CPU compatibility + attn_implementation="sdpa" # use "sdpa" or None for CPU compatibility ) tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") From ebebcf3849ecbe513fc1a6c7ed4459c48ecc6f6c Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 12:51:12 -0700 Subject: [PATCH 16/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index b54bc169aad7..6c7edf7b6f3c 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -118,7 +118,7 @@ transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends. -The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits. Note that quantization usually works best with base models. +The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits. 
```python import torch From c2dd36f2cddf86f94edd422e6acefe670a0d8c92 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:02 -0700 Subject: [PATCH 17/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 6c7edf7b6f3c..2e1b088a73a7 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -40,7 +40,6 @@ The example below demonstrates how to generate text with [`Pipeline`], [`AutoMod import torch from transformers import pipeline -# Use a pipeline as a high-level helper pipe = pipeline( task="text-generation", model="Qwen/Qwen2-1.5B-Instruct", From c38d33a5e33d9c3ca521e9acb70e757d1aeb36e7 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:12 -0700 Subject: [PATCH 18/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 2e1b088a73a7..757966904f85 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -51,8 +51,6 @@ messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me about the Qwen2 model family."}, ] -# We use the tokenizer's chat template to format the messages -# But the pipeline handles this automatically for chat models outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's response ``` From 3fecacb90df6d85f66c2f4bef8ecc71443e12116 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:21 -0700 Subject: [PATCH 19/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 757966904f85..363c48e78617 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -67,7 +67,7 @@ model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-1.5B-Instruct", torch_dtype=torch.bfloat16, device_map="auto", - attn_implementation="sdpa" # use "sdpa" or None for CPU compatibility + attn_implementation="sdpa" ) tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") From 33de8efadd010a8c31806d7779f68be96b03f1eb Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:34 -0700 Subject: [PATCH 20/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 363c48e78617..4616c3fb9a79 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -92,7 +92,6 @@ generated_ids = model.generate( top_k=50, top_p=0.95 ) -# Slice the output to remove the input tokens generated_ids = [ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) ] From 9438cb3094f50c5b602cd4cdac5c05a64809edff Mon Sep 17 
00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:42 -0700 Subject: [PATCH 21/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 4616c3fb9a79..287d0d6aa063 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -106,7 +106,6 @@ print(response) ```bash # pip install -U flash-attn --no-build-isolation transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0 -# You can then type your messages in the terminal ``` From b894f4b3d729a197cca9e89f9ea9a342b60dc800 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:52 -0700 Subject: [PATCH 22/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 287d0d6aa063..423d3b29fe1f 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -119,7 +119,6 @@ The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize import torch from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -# Configure quantization quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, From c94b34e4f0d16d09dabcdef718c2eb1743538e6c Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:51:58 -0700 Subject: [PATCH 23/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 423d3b29fe1f..fc820d0afcef 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -126,7 +126,6 @@ quantization_config = BitsAndBytesConfig( bnb_4bit_use_double_quant=True, ) -# Load tokenizer and quantized model tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-7B", # Use base model From c5c463e215f9cd53b526942599ff0b062778b871 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:52:05 -0700 Subject: [PATCH 24/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index fc820d0afcef..afb58d0f681e 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -128,7 +128,7 @@ quantization_config = BitsAndBytesConfig( tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen2-7B", # Use base model + "Qwen/Qwen2-7B", torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config, From c75e622af1b125ad53da66d133119a5e0a0c4fc1 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:52:10 -0700 Subject: [PATCH 25/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu 
<59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index afb58d0f681e..9d1dd6653b66 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -135,7 +135,6 @@ model = AutoModelForCausalLM.from_pretrained( attn_implementation="flash_attention_2" ) -# Prepare input and generate inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) From fa1109725c816d3054b8e7b2fcdf0b935db9ae62 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:52:16 -0700 Subject: [PATCH 26/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 9d1dd6653b66..d92a279247be 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -140,8 +140,6 @@ outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ``` -> [!NOTE] -> vLLM currently supports static YARN, meaning the scaling factor is fixed. This might slightly impact performance on very short texts compared to not using YARN scaling. Consider enabling `rope_scaling` only when long context processing is essential. ## Notes From ef96105586e25eaa8a7e17e0feefdef2bdad315f Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 14:52:22 -0700 Subject: [PATCH 27/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index d92a279247be..47c9170d6b72 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -143,7 +143,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) ## Notes -- Qwen2 requires `transformers>=4.37.0` for full support. Ensure your `transformers` library is up-to-date. +- Ensure your Transformers library version is up-to-date. Qwen2 requires Transformers>=4.37.0 for full support. ## Qwen2Config From f9ae4f8be962896bf54c3d73f50bf81f06966c90 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 16:51:54 -0700 Subject: [PATCH 28/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 47c9170d6b72..856b33dc67e1 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -69,7 +69,7 @@ model = AutoModelForCausalLM.from_pretrained( device_map="auto", attn_implementation="sdpa" ) -tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct") prompt = "Give me a short introduction to large language models." 
messages = [ From 2ebbfc3439a8f7ec39174e142556f7bde189cc1b Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 16:52:08 -0700 Subject: [PATCH 29/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 856b33dc67e1..30d59b4555eb 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -52,7 +52,7 @@ messages = [ {"role": "user", "content": "Tell me about the Qwen2 model family."}, ] outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95) -print(outputs[0]["generated_text"][-1]['content']) # Print the assistant's response +print(outputs[0]["generated_text"][-1]['content']) ``` From 1ddbd9ef6f04cb1b252e7677fc7cdec06c8fa2cb Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 16:52:14 -0700 Subject: [PATCH 30/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 30d59b4555eb..1d6884475966 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -81,7 +81,7 @@ text = tokenizer.apply_chat_template( tokenize=False, add_generation_prompt=True ) -model_inputs = tokenizer([text], return_tensors="pt").to(device) +model_inputs = tokenizer([text], return_tensors="pt").to("cuda") generated_ids = model.generate( model_inputs.input_ids, From e182841ab4380fa3c50b2971542c714021a14ad7 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 16:52:25 -0700 Subject: [PATCH 31/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 1d6884475966..d72dac01b04a 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -62,7 +62,6 @@ print(outputs[0]["generated_text"][-1]['content']) import torch from transformers import AutoModelForCausalLM, AutoTokenizer -device = "cuda" model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-1.5B-Instruct", torch_dtype=torch.bfloat16, From 80941765d002757e95763b1d41d1246432250c01 Mon Sep 17 00:00:00 2001 From: ARAVINDHAN T Date: Wed, 2 Apr 2025 16:52:30 -0700 Subject: [PATCH 32/32] Update docs/source/en/model_doc/qwen2.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/qwen2.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index d72dac01b04a..1c5e3880938d 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -115,6 +115,7 @@ Quantization reduces the memory burden of large models by representing the weigh The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits. ```python +# pip install -U flash-attn --no-build-isolation import torch from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig