IndexError: The shape of the mask [7387] at index 0 does not match the shape of the indexed tensor [1] at index 0

### 🐛 Describe the bug

The error exists when I try to use the qwen2-vl with qwen2-vl liger kernel to generate text.

The following code got the following error. But the same code if I change the liger kernel to qwen2 instead of qwen2-vl. It will correct.

![image](https://github.com/user-attachments/assets/a9128ef8-505c-41c5-9178-648dbfff5e91)
Then I add the cache_position to the lce_forward function and it got the following error.
![image](https://github.com/user-attachments/assets/81ba4e89-f6ec-48f0-99d1-f9498ce88888)

Here is the way I used qwen2 liger kernel.
```
from liger_kernel.transformers import apply_liger_kernel_to_qwen2
apply_liger_kernel_to_qwen2()
```
here is the way I used qwen2-vl liger kernel.
```
from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
apply_liger_kernel_to_qwen2_vl()
```






### Reproduce

```
# Reference: https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html

import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import torch
from PIL import Image, ImageDraw, ImageFont
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration

from mmdoc.data.xdataset_base import PageRecord
from mmdoc.data.xdataset_entity import normalize_bbox
from mmdoc.data.xdataset_gpt_entity import GPTXDatasetBuilder
from mmdoc.paths import DATA_DIR
from mmdoc.trainers.eval.gpt_evaluation import get_content_dict, get_value_with_grounding, load_prediction
from peft import PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from peft import LoraConfig, PeftModel

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
from liger_kernel.transformers import apply_liger_kernel_to_qwen2
from liger_kernel.transformers import monkey_patch
import transformers


# apply_liger_kernel_to_qwen2()

print("Applying Liger Kernel to Qwen2-VL model")
# monkey_patch.apply_liger_kernel_to_qwen2_vl(
#     # These args can be used to override the default Liger settings
#     # cross_entropy=True,
#     # fused_linear_cross_entropy=False,
# )
apply_liger_kernel_to_qwen2_vl(
    rope= True,
    cross_entropy = False,
    fused_linear_cross_entropy = True,
    rms_norm = True,
    layer_norm = True,
    swiglu = True,
)
def get_data(
    dataset_name,
    doc_id,
    page_id,
    header_only: bool = False,
    line_only: bool = False,
    filter_labels: List[str] = None,
    end_to_end: bool = False,
    num_pages: int = None,
    text_grounding: bool = False,  # Add text grounding
    require_grounding: bool = False,  # Ignore typed texts without bounding box
):
    dataset_dir = f"{DATA_DIR}/{dataset_name}"
    iocr_dir = f"{dataset_dir}/iocr_json"
    field_data_dir = f"{dataset_dir}/field_data"

    page = PageRecord.load_from_disk(doc_id, page_id, iocr_dir, field_data_dir, load_json=True)
    add_page_promt = False if num_pages is None else True
    builder = GPTXDatasetBuilder(
        name=dataset_name,
        data_dir=f"{DATA_DIR}",
        header_only=header_only,
        line_only=line_only,
        filter_labels=filter_labels,
        add_page_promt=add_page_promt,
        text_grounding=text_grounding,
        end_to_end=end_to_end,
        require_grounding=require_grounding,
    )
    doc_id_pg_str = str(doc_id) + "_" + str(page_id)
    data_json_dir = f"{dataset_dir}/dataset.json"
    with Path(data_json_dir).open(encoding="utf-8") as f:
        dataset_info = json.load(f)
        field_configs = builder._get_field_configs(dataset_info)
        annos = builder._get_annotations(
            page=page,
            field_configs=field_configs,
            page_idx=page_id,
            num_pages=num_pages,
        )
        prompt = annos["prompt"]
        json_string = annos["response_json"]
        image_path = f"{dataset_dir}/images/{doc_id_pg_str}.jpg"
        return prompt, json_string, image_path


def draw_box(
    pil_img, draw: ImageDraw, bbox, text: str, color: str, font: ImageFont, text_above: bool = True, y_shift: int = 10
):
    x0, y0, x1, y1 = normalize_bbox(bbox=bbox, from_size=(1000, 1000), to_size=pil_img.size)
    draw.rectangle(((x0, y0), (x1, y1)), outline=color, width=2)
    if text_above:  # Draw text above
        draw.text((x0 + 10, y0 - y_shift), text=text, font=font, fill=color)
    else:  # Draw text below
        draw.text((x0 + 10, y0 + y_shift), text=text, font=font, fill=color)


def _draw_lines(
    lines: List[Dict],
    color: str,
    pil_img,
    draw: ImageDraw,
    font: ImageFont,
    text_above: bool = True,
    y_shift: int = 10,
    delta: int = 0,
):
    for _row in lines:
        cnt = 0
        for field_name, _value in _row.items():
            _txt_value, _coord = get_value_with_grounding(_value)
            if _coord:
                txt_value = field_name + ": " + _txt_value
                y_shift_final = y_shift + cnt * delta  # Shift different field differently
                draw_box(pil_img, draw, _coord, txt_value, color, font, text_above, y_shift_final)
                cnt += 1


peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM",
    )

# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
# Otherwise, you need to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = "Qwen/Qwen2-VL-2B-Instruct"
# device_map = split_model('InternVL2-1B')
model_path = (  # Header only
    "/home/paperspace/mmdoc/outputs/internvl/Mainclass40GlobalShop_SPpromptTGPr/checkpoint-14800"
)
header_only = True
end_to_end = False
filter_labels = None  # "".split(",")
num_pages = 1
text_grounding = True
doc_id = 366023271
page_id = 1
max_seq_length = 1024
require_grounding = True

prompt, json_string, image_path = get_data(
    "GlobalShopWF976934_USER_50USER_20241002",
    doc_id,
    page_id,
    header_only=header_only,
    filter_labels=filter_labels,
    end_to_end=end_to_end,
    num_pages=num_pages,
    text_grounding=text_grounding,
    require_grounding=require_grounding,
)

# model_cls = InternVLChatModel
# model = model_cls.from_pretrained(
#     # model_path,
#     path,
#     torch_dtype=torch.bfloat16,
#     low_cpu_mem_usage=True,
#     trust_remote_code=True,
# ).eval()


model = Qwen2VLForConditionalGeneration.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
model = get_peft_model(model, peft_config)
model.to("cuda")
processor = transformers.AutoProcessor.from_pretrained(path)
processor.tokenizer.pad_token = processor.tokenizer.eos_token
# processor = AutoProcessor.from_pretrained(path)
# tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

start_time = time.time()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image_path,
            },
            {"type": "text", "text": prompt},
        ],
    },
    {"role": "assistant", "content": [{"type": "text", "text": json_string}]},
]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs.data, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```


### Versions

transformers=4.47.1
liger_kernel=0.5.2

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

IndexError: The shape of the mask [7387] at index 0 does not match the shape of the indexed tensor [1] at index 0 #515

🐛 Describe the bug

Reproduce

Versions

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

IndexError: The shape of the mask [7387] at index 0 does not match the shape of the indexed tensor [1] at index 0 #515

Description

🐛 Describe the bug

Reproduce

Versions

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions