Skip to content

IndexError: The shape of the mask [7387] at index 0 does not match the shape of the indexed tensor [1] at index 0 #515

@14H034160212

Description

@14H034160212

🐛 Describe the bug

The error occurs when I try to generate text with Qwen2-VL after applying the Qwen2-VL Liger kernel.

The code below raises the following error. However, if I apply the Qwen2 Liger kernel instead of the Qwen2-VL one, the same code runs correctly.

image
I then added the cache_position argument to the lce_forward function, and got the following error.
image

Here is how I applied the Qwen2 Liger kernel:

from liger_kernel.transformers import apply_liger_kernel_to_qwen2
apply_liger_kernel_to_qwen2()

Here is how I applied the Qwen2-VL Liger kernel:

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
apply_liger_kernel_to_qwen2_vl()

Reproduce

# Reference: https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html

import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import torch
from PIL import Image, ImageDraw, ImageFont
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration

from mmdoc.data.xdataset_base import PageRecord
from mmdoc.data.xdataset_entity import normalize_bbox
from mmdoc.data.xdataset_gpt_entity import GPTXDatasetBuilder
from mmdoc.paths import DATA_DIR
from mmdoc.trainers.eval.gpt_evaluation import get_content_dict, get_value_with_grounding, load_prediction
from peft import PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from peft import LoraConfig, PeftModel

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
from liger_kernel.transformers import apply_liger_kernel_to_qwen2
from liger_kernel.transformers import monkey_patch
import transformers


# To compare against the Qwen2 patch (which the report says works), replace
# the call below with `apply_liger_kernel_to_qwen2()`. The same Qwen2-VL patch
# can also be invoked as `monkey_patch.apply_liger_kernel_to_qwen2_vl(...)`.

print("Applying Liger Kernel to Qwen2-VL model")

# Options are spelled out explicitly rather than left to the library defaults:
# the fused linear cross-entropy path is on, the standalone cross-entropy
# kernel is off.
liger_options = {
    "rope": True,
    "cross_entropy": False,
    "fused_linear_cross_entropy": True,
    "rms_norm": True,
    "layer_norm": True,
    "swiglu": True,
}
apply_liger_kernel_to_qwen2_vl(**liger_options)


def get_data(
    dataset_name,
    doc_id,
    page_id,
    header_only: bool = False,
    line_only: bool = False,
    filter_labels: List[str] = None,
    end_to_end: bool = False,
    num_pages: int = None,
    text_grounding: bool = False,  # Add text grounding
    require_grounding: bool = False,  # Ignore typed texts without bounding box
):
    """Build the prompt, target JSON string and image path for one page.

    Loads the page record from the dataset's ``iocr_json`` and ``field_data``
    directories, reads ``dataset.json`` to obtain the field configuration, and
    asks a ``GPTXDatasetBuilder`` for the page's annotations.

    Args:
        dataset_name: Name of the dataset directory under ``DATA_DIR``.
        doc_id: Document identifier; joined with ``page_id`` to name the image.
        page_id: Page identifier; also passed to the builder as ``page_idx``.
        header_only: Forwarded unchanged to ``GPTXDatasetBuilder``.
        line_only: Forwarded unchanged to ``GPTXDatasetBuilder``.
        filter_labels: Forwarded unchanged to ``GPTXDatasetBuilder``.
        end_to_end: Forwarded unchanged to ``GPTXDatasetBuilder``.
        num_pages: Forwarded to the annotation call; when it is not ``None``
            the builder is created with ``add_page_promt=True``.
        text_grounding: Forwarded unchanged to ``GPTXDatasetBuilder``.
        require_grounding: Forwarded unchanged to ``GPTXDatasetBuilder``.

    Returns:
        A ``(prompt, response_json, image_path)`` tuple, where the first two
        come from the builder's annotations and the last is the path of
        ``images/<doc_id>_<page_id>.jpg`` inside the dataset directory.
    """
    dataset_root = f"{DATA_DIR}/{dataset_name}"

    record = PageRecord.load_from_disk(
        doc_id,
        page_id,
        f"{dataset_root}/iocr_json",
        f"{dataset_root}/field_data",
        load_json=True,
    )
    dataset_builder = GPTXDatasetBuilder(
        name=dataset_name,
        data_dir=f"{DATA_DIR}",
        header_only=header_only,
        line_only=line_only,
        filter_labels=filter_labels,
        # The page prompt is only requested when a page count was supplied.
        add_page_promt=num_pages is not None,
        text_grounding=text_grounding,
        end_to_end=end_to_end,
        require_grounding=require_grounding,
    )
    page_key = "_".join((str(doc_id), str(page_id)))

    with Path(f"{dataset_root}/dataset.json").open(encoding="utf-8") as handle:
        dataset_info = json.load(handle)

    annotations = dataset_builder._get_annotations(
        page=record,
        field_configs=dataset_builder._get_field_configs(dataset_info),
        page_idx=page_id,
        num_pages=num_pages,
    )
    prompt = annotations["prompt"]
    json_string = annotations["response_json"]
    return prompt, json_string, f"{dataset_root}/images/{page_key}.jpg"


def draw_box(
    pil_img, draw: ImageDraw, bbox, text: str, color: str, font: ImageFont, text_above: bool = True, y_shift: int = 10
):
    """Outline ``bbox`` on the image and write ``text`` next to it.

    ``bbox`` is rescaled from a 1000x1000 frame to ``pil_img.size`` with
    ``normalize_bbox`` before drawing. The label starts 10 units to the right
    of the box's left edge, and ``y_shift`` units above its top edge when
    ``text_above`` is true, or ``y_shift`` units below that edge otherwise.
    """
    left, top, right, bottom = normalize_bbox(bbox=bbox, from_size=(1000, 1000), to_size=pil_img.size)
    draw.rectangle(((left, top), (right, bottom)), outline=color, width=2)

    # Only the vertical offset depends on the requested label placement.
    label_y = top - y_shift if text_above else top + y_shift
    draw.text((left + 10, label_y), text=text, font=font, fill=color)


def _draw_lines(
    lines: List[Dict],
    color: str,
    pil_img,
    draw: ImageDraw,
    font: ImageFont,
    text_above: bool = True,
    y_shift: int = 10,
    delta: int = 0,
):
    """Draw a labelled box for every grounded field in each row of ``lines``.

    Each row maps field names to values. A value is split into text and
    coordinates by ``get_value_with_grounding``; fields without coordinates
    are skipped. Within one row, the n-th drawn field (counting from zero) has
    its label shifted by ``y_shift + n * delta``.
    """
    for record in lines:
        drawn = 0  # Number of fields already drawn for this row
        for label, raw_value in record.items():
            value_text, box = get_value_with_grounding(raw_value)
            if not box:
                continue
            caption = label + ": " + value_text
            # Shift different field differently
            offset = y_shift + drawn * delta
            draw_box(pil_img, draw, box, caption, color, font, text_above, offset)
            drawn += 1


# LoRA settings for the adapter wrapped around the model: rank 8, applied to
# the modules named "q_proj" and "v_proj".
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Model identifier used when loading the model and the processor.
path = "Qwen/Qwen2-VL-2B-Instruct"
# Header-only checkpoint path; not referenced by the active code shown here.
model_path = "/home/paperspace/mmdoc/outputs/internvl/Mainclass40GlobalShop_SPpromptTGPr/checkpoint-14800"
max_seq_length = 1024

# Page to reproduce the problem on.
doc_id = 366023271
page_id = 1
num_pages = 1

# Prompt-construction switches forwarded to get_data().
header_only = True
end_to_end = False
text_grounding = True
require_grounding = True
filter_labels = None  # "".split(",")

prompt, json_string, image_path = get_data(
    dataset_name="GlobalShopWF976934_USER_50USER_20241002",
    doc_id=doc_id,
    page_id=page_id,
    header_only=header_only,
    filter_labels=filter_labels,
    end_to_end=end_to_end,
    num_pages=num_pages,
    text_grounding=text_grounding,
    require_grounding=require_grounding,
)

# Load Qwen2-VL in bfloat16 with FlashAttention-2 and wrap it with the LoRA
# adapter described by `peft_config`.
model = get_peft_model(
    Qwen2VLForConditionalGeneration.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    ),
    peft_config,
)
# NOTE(review): the model is loaded with device_map="auto" and then moved
# again here; confirm the explicit move to "cuda" is intended.
model.to("cuda")

# Reuse the EOS token for padding on the processor's tokenizer.
processor = AutoProcessor.from_pretrained(path)
processor.tokenizer.pad_token = processor.tokenizer.eos_token

start_time = time.time()

# Conversation fed to the chat template: one user turn (page image + prompt)
# followed by an assistant turn holding the reference JSON.
# NOTE(review): the assistant turn is present while add_generation_prompt=True
# is also requested below; confirm this is the intended input for generation.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": prompt},
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": json_string}],
    },
]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

# Inference: generate, then drop the prompt tokens from the front of each
# returned sequence so only the newly generated ids are decoded.
generated_ids = model.generate(**inputs.data, max_new_tokens=128)
generated_ids_trimmed = [
    full_ids[len(prompt_ids) :] for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)

Versions

transformers=4.47.1
liger_kernel=0.5.2

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working), good first issue (Good for newcomers)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions