🐛 Describe the bug
The error occurs when I use Qwen2-VL together with the Qwen2-VL Liger kernel to generate text.
Running the code below produces the error. If I apply the Qwen2 Liger kernel instead of the Qwen2-VL one, the same code runs correctly. I then added cache_position to the lce_forward function, and it produced a different error (see the sketch after the repro code for what I mean).
In the code, the commented-out apply_liger_kernel_to_qwen2() call is how I used the Qwen2 Liger kernel, and the apply_liger_kernel_to_qwen2_vl(...) call is how I used the Qwen2-VL Liger kernel.

Reproduce
# Reference: https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import torch
from PIL import Image, ImageDraw, ImageFont
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration
from mmdoc.data.xdataset_base import PageRecord
from mmdoc.data.xdataset_entity import normalize_bbox
from mmdoc.data.xdataset_gpt_entity import GPTXDatasetBuilder
from mmdoc.paths import DATA_DIR
from mmdoc.trainers.eval.gpt_evaluation import get_content_dict, get_value_with_grounding, load_prediction
from peft import LoraConfig, get_peft_model
from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
from liger_kernel.transformers import apply_liger_kernel_to_qwen2
from liger_kernel.transformers import monkey_patch
import transformers
# The working case ("qwen2 liger kernel"): comment out the Qwen2-VL patch
# below and apply the Qwen2 kernel instead.
# apply_liger_kernel_to_qwen2()
print("Applying Liger Kernel to Qwen2-VL model")
# Equivalent way to apply the Qwen2-VL patch through the monkey_patch module:
# monkey_patch.apply_liger_kernel_to_qwen2_vl(
#     # These args can be used to override the default Liger settings
#     # cross_entropy=True,
#     # fused_linear_cross_entropy=False,
# )
apply_liger_kernel_to_qwen2_vl(
    rope=True,
    cross_entropy=False,
    fused_linear_cross_entropy=True,
    rms_norm=True,
    layer_norm=True,
    swiglu=True,
)
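# Note: with fused_linear_cross_entropy=True, apply_liger_kernel_to_qwen2_vl
# replaces Qwen2VLForConditionalGeneration.forward with Liger's lce_forward.
# My suspicion: in liger_kernel 0.5.2 that lce_forward does not accept the
# cache_position kwarg that transformers 4.47 passes during generate(), which
# would explain why the Qwen2 patch works but the Qwen2-VL patch fails.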
def get_data(
    dataset_name,
    doc_id,
    page_id,
    header_only: bool = False,
    line_only: bool = False,
    filter_labels: Optional[List[str]] = None,
    end_to_end: bool = False,
    num_pages: Optional[int] = None,
    text_grounding: bool = False,  # Add text grounding
    require_grounding: bool = False,  # Ignore typed texts without bounding box
):
dataset_dir = f"{DATA_DIR}/{dataset_name}"
iocr_dir = f"{dataset_dir}/iocr_json"
field_data_dir = f"{dataset_dir}/field_data"
page = PageRecord.load_from_disk(doc_id, page_id, iocr_dir, field_data_dir, load_json=True)
    add_page_promt = num_pages is not None  # spelling matches the GPTXDatasetBuilder kwarg
builder = GPTXDatasetBuilder(
name=dataset_name,
data_dir=f"{DATA_DIR}",
header_only=header_only,
line_only=line_only,
filter_labels=filter_labels,
add_page_promt=add_page_promt,
text_grounding=text_grounding,
end_to_end=end_to_end,
require_grounding=require_grounding,
)
doc_id_pg_str = str(doc_id) + "_" + str(page_id)
data_json_dir = f"{dataset_dir}/dataset.json"
with Path(data_json_dir).open(encoding="utf-8") as f:
dataset_info = json.load(f)
field_configs = builder._get_field_configs(dataset_info)
annos = builder._get_annotations(
page=page,
field_configs=field_configs,
page_idx=page_id,
num_pages=num_pages,
)
prompt = annos["prompt"]
json_string = annos["response_json"]
image_path = f"{dataset_dir}/images/{doc_id_pg_str}.jpg"
return prompt, json_string, image_path
def draw_box(
    pil_img, draw: ImageDraw.ImageDraw, bbox, text: str, color: str, font: ImageFont.FreeTypeFont, text_above: bool = True, y_shift: int = 10
):
x0, y0, x1, y1 = normalize_bbox(bbox=bbox, from_size=(1000, 1000), to_size=pil_img.size)
draw.rectangle(((x0, y0), (x1, y1)), outline=color, width=2)
if text_above: # Draw text above
draw.text((x0 + 10, y0 - y_shift), text=text, font=font, fill=color)
else: # Draw text below
draw.text((x0 + 10, y0 + y_shift), text=text, font=font, fill=color)
def _draw_lines(
    lines: List[Dict],
    color: str,
    pil_img,
    draw: ImageDraw.ImageDraw,
    font: ImageFont.FreeTypeFont,
    text_above: bool = True,
    y_shift: int = 10,
    delta: int = 0,
):
for _row in lines:
cnt = 0
for field_name, _value in _row.items():
_txt_value, _coord = get_value_with_grounding(_value)
if _coord:
txt_value = field_name + ": " + _txt_value
y_shift_final = y_shift + cnt * delta # Shift different field differently
draw_box(pil_img, draw, _coord, txt_value, color, font, text_above, y_shift_final)
cnt += 1
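# (draw_box and _draw_lines are visualization helpers for grounded fields;
# they are not called in this minimal repro.)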
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.05,
r=8,
bias="none",
target_modules=["q_proj", "v_proj"],
task_type="CAUSAL_LM",
)
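# Rank-8 LoRA adapters on the attention q/v projections, matching my training
# setup; I have not checked whether PEFT is required to trigger the error.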
# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
# Otherwise, you need to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = "Qwen/Qwen2-VL-2B-Instruct"
# device_map = split_model('InternVL2-1B')
model_path = ( # Header only
"/home/paperspace/mmdoc/outputs/internvl/Mainclass40GlobalShop_SPpromptTGPr/checkpoint-14800"
)
header_only = True
end_to_end = False
filter_labels = None # "".split(",")
num_pages = 1
text_grounding = True
doc_id = 366023271
page_id = 1
max_seq_length = 1024
require_grounding = True
prompt, json_string, image_path = get_data(
"GlobalShopWF976934_USER_50USER_20241002",
doc_id,
page_id,
header_only=header_only,
filter_labels=filter_labels,
end_to_end=end_to_end,
num_pages=num_pages,
text_grounding=text_grounding,
require_grounding=require_grounding,
)
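# get_data returns the text prompt, the gold JSON answer string, and the page image path.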
# model_cls = InternVLChatModel
# model = model_cls.from_pretrained(
# # model_path,
# path,
# torch_dtype=torch.bfloat16,
# low_cpu_mem_usage=True,
# trust_remote_code=True,
# ).eval()
model = Qwen2VLForConditionalGeneration.from_pretrained(
path,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto",
)
model = get_peft_model(model, peft_config)
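# Note: get_peft_model attaches freshly initialized adapters here; the
# fine-tuned checkpoint in model_path above is not loaded in this repro.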
model.to("cuda")
processor = transformers.AutoProcessor.from_pretrained(path)
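# Use EOS as the padding token so that processor(..., padding=True) has a pad id.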
processor.tokenizer.pad_token = processor.tokenizer.eos_token
# processor = AutoProcessor.from_pretrained(path)
# tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
start_time = time.time()
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image_path,
},
{"type": "text", "text": prompt},
],
},
{"role": "assistant", "content": [{"type": "text", "text": json_string}]},
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
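# For Qwen2-VL the processor returns input_ids and attention_mask plus the
# vision tensors (pixel_values and image_grid_thw).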
inputs = inputs.to("cuda")
# Inference: Generation of the output
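# The error is raised by this generate() call: generation forwards
# cache_position into model.forward, which the Liger-patched forward
# apparently does not define.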
generated_ids = model.generate(**inputs.data, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
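For reference, this is roughly what I mean by "adding cache_position to the lce_forward function". Rather than reproducing my edit to the installed liger source, here is a standalone sketch that wraps whatever forward the Liger patch installed so the extra kwarg is accepted. It simply drops cache_position, so it only illustrates the signature mismatch; the name _forward_with_cache_position is mine, and whether discarding cache_position is safe for Qwen2-VL's position handling is part of the question.

```python
# Hypothetical sketch (not the liger source): accept and drop the
# cache_position kwarg that transformers 4.47 passes during generation.
# Run this AFTER apply_liger_kernel_to_qwen2_vl(...) so it wraps the
# patched lce_forward.
import functools

from transformers import Qwen2VLForConditionalGeneration

_patched_forward = Qwen2VLForConditionalGeneration.forward

@functools.wraps(_patched_forward)
def _forward_with_cache_position(self, *args, cache_position=None, **kwargs):
    # liger_kernel 0.5.2's lce_forward does not define cache_position,
    # so strip it before delegating to the patched forward.
    return _patched_forward(self, *args, **kwargs)

Qwen2VLForConditionalGeneration.forward = _forward_with_cache_position
```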
Versions
transformers==4.47.1
liger_kernel==0.5.2