From 08db66c865fdcd2485d41ba063aef818d97fa4c0 Mon Sep 17 00:00:00 2001
From: zhushaohao <shaohao9.zhu@gmail.com>
Date: Tue, 16 Dec 2025 15:52:41 +0800
Subject: [PATCH 1/9] add VL-LN Bench training code

---
 internnav/dataset/vlln_lerobot_dataset.py     | 769 ++++++++++++++++++
 internnav/trainer/internvla_n1_argument.py    |   1 +
 internnav/trainer/internvla_vlln_trainer.py   | 239 ++++++
 .../train/qwenvl_train/train_system2_vlln.sh  |  81 ++
 traj_data                                     |   1 +
 5 files changed, 1091 insertions(+)
 create mode 100644 internnav/dataset/vlln_lerobot_dataset.py
 create mode 100644 internnav/trainer/internvla_vlln_trainer.py
 create mode 100644 scripts/train/qwenvl_train/train_system2_vlln.sh
 create mode 120000 traj_data
diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
new file mode 100644
index 00000000..b0e78843
--- /dev/null
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -0,0 +1,769 @@
+import copy
+import itertools
+import json
+import os
+import random
+import re
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import torch
+import transformers
+from decord import VideoReader
+from PIL import Image
+from torch.utils.data import Dataset
+from torchcodec.decoders import VideoDecoder
+from transformers.image_utils import to_numpy_array
+from bisect import bisect_left
+
+from .rope2d import get_rope_index_2, get_rope_index_25
+
+# Dataset configurations; data_list() resolves them by name through data_dict.
+IION_split1 = {
+    "data_path": "traj_data/mp3d_split1",  # root directory containing one sub-directory per scene
+    "height": 125,  # rendered as "{height}cm" in pose/goal column names and image folder names
+    "pitch_1": 0,  # rendered as "{pitch_1}deg"; picks the image folder used for history/current frames
+    "pitch_2": 30,  # rendered as "{pitch_2}deg"; picks the pose/goal columns and the look-down images
+}
+
+IION_split2 = {  # same keys as IION_split1; only data_path differs
+    "data_path": "traj_data/mp3d_split2",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+IION_split3 = {  # same keys as IION_split1; only data_path differs
+    "data_path": "traj_data/mp3d_split3",
+    "height": 125,
+    "pitch_1": 0,
+    "pitch_2": 30,
+}
+
+data_dict = {  # name accepted in data_args.vln_dataset_use -> configuration
+    "iion_split1": IION_split1,
+    "iion_split2": IION_split2,
+    "iion_split3": IION_split3,
+}
+
+
+def parse_sampling_rate(dataset_name):
+    """Return the trailing ``%NN`` suffix of the name as a fraction, or 1.0 if absent."""
+    suffix = re.search(r"%(\d+)$", dataset_name)
+    # No percent suffix means the dataset is used in full.
+    return 1.0 if suffix is None else int(suffix.group(1)) / 100.0
+
+
+def data_list(dataset_names):
+    """Resolve dataset names (optionally suffixed ``%NN``) to config dicts.
+
+    Raises ``ValueError`` for a name that is missing from ``data_dict``.
+    """
+    resolved = []
+    for raw_name in dataset_names:
+        base_name = re.sub(r"%(\d+)$", "", raw_name)
+        if base_name not in data_dict:
+            raise ValueError(f"do not find {base_name}")
+        resolved.append({**data_dict[base_name], "sampling_rate": parse_sampling_rate(raw_name)})
+    return resolved
+
+
+IGNORE_INDEX = -100  # label written for masked positions (system/user tokens and label padding)
+IMAGE_TOKEN_INDEX = 151655  # NOTE(review): presumably the tokenizer id of <|image_pad|>; not referenced in the visible code
+VIDEO_TOKEN_INDEX = 151656  # NOTE(review): presumably the tokenizer id of <|video_pad|>; not referenced in the visible code
+TRAJ_TOKEN_INDEX = 151667  # NOTE(review): presumably a trajectory token id; not referenced in the visible code
+DEFAULT_IMAGE_TOKEN = "<image>"  # the visible code spells this literal out instead of using the constant
+DEFAULT_VIDEO_TOKEN = "<video>"  # the visible code spells this literal out instead of using the constant
+_ORACLE_BLOCK = re.compile(r'<\|oracle\|>.*?<\|dialog_end\|>', re.DOTALL)  # shortest span from <|oracle|> to <|dialog_end|>; may cross newlines
+
+local_rank = None  # read by rank0_print; not assigned anywhere else in the visible code
+
+
+def rank0_print(*args):  # print helper gated on the module-level ``local_rank``
+    if local_rank == 0:  # NOTE(review): local_rank starts as None and is not assigned in the visible code
+        print(*args)
+
+
+def read_jsonl(path):  # parse a JSON Lines file into a list, one object per non-blank line
+    with open(path, "r", encoding="utf-8") as f:  # explicit encoding instead of the locale default
+        return [json.loads(line) for line in f if line.strip()]  # blank/trailing empty lines are skipped
+
+
+def _expand_visual_placeholders(content, placeholder, pad_token, pad_counts, next_index):
+    """Replace every ``placeholder`` in ``content`` with a padded vision span.
+
+    Each occurrence becomes ``<|vision_start|>``, then ``pad_token`` repeated
+    ``pad_counts[next_index]`` times, then ``<|vision_end|>``; ``next_index`` grows
+    by one per occurrence. Returns ``(new_content, next_index)``.
+    """
+    pieces = content.split(placeholder)
+    expanded = [pieces[0]]
+    for piece in pieces[1:]:
+        expanded.append("<|vision_start|>" + pad_token * pad_counts[next_index] + "<|vision_end|>")
+        expanded.append(piece)
+        next_index += 1
+    return "".join(expanded), next_index
+
+
+def preprocess_qwen_2_visual(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    grid_thw_image: List = None,
+    grid_thw_video: List = None,
+) -> Dict:
+    """Tokenize conversations turn by turn and build loss-masked labels.
+
+    Args:
+        sources: list of conversations; each turn is a dict keyed either
+            ``role``/``content`` or ``from``/``value``.
+        tokenizer: must provide ``apply_chat_template``; it is deep-copied so the
+            caller's ``chat_template`` is not overwritten.
+        grid_thw_image: number of ``<|image_pad|>`` tokens for each ``<image>``,
+            in order of appearance; ``None`` means there are no images.
+        grid_thw_video: the same for ``<video>`` and ``<|video_pad|>``.
+
+    Returns:
+        Dict with ``input_ids`` and ``labels`` (``torch.long``). Labels are
+        ``IGNORE_INDEX`` for system and user turns and for the first three
+        tokens of every other turn.
+    """
+    # None replaces the shared mutable ``[]`` default; callers also pass None explicitly.
+    grid_thw_image = [] if grid_thw_image is None else grid_thw_image
+    grid_thw_video = [] if grid_thw_video is None else grid_thw_video
+    roles = {"human": "user", "gpt": "assistant"}
+    system_message = "You are a helpful assistant."
+
+    # Work on a copy so the caller's tokenizer keeps its own chat template.
+    tokenizer = copy.deepcopy(tokenizer)
+    chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+    tokenizer.chat_template = chat_template
+
+    image_index = 0
+    video_index = 0
+    input_ids, targets = [], []
+
+    for source in sources:
+        # Drop a leading "gpt" turn so that the conversation starts with the user.
+        if source and source[0].get("from") == "gpt":
+            source = source[1:]
+
+        input_id = list(tokenizer.apply_chat_template([{"role": "system", "content": system_message}]))
+        target = [IGNORE_INDEX] * len(input_id)
+
+        for conv in source:
+            try:
+                role, content = conv["role"], conv["content"]
+            except KeyError:  # this turn uses the from/value schema
+                role, content = conv["from"], conv["value"]
+
+            role = roles.get(role, role)
+            if role == "user":
+                content, image_index = _expand_visual_placeholders(
+                    content, "<image>", "<|image_pad|>", grid_thw_image, image_index
+                )
+                content, video_index = _expand_visual_placeholders(
+                    content, "<video>", "<|video_pad|>", grid_thw_video, video_index
+                )
+
+            encode_id = tokenizer.apply_chat_template([{"role": role, "content": content}])
+            input_id += encode_id
+            if role in ["user", "system"]:
+                target += [IGNORE_INDEX] * len(encode_id)
+            else:
+                # The first three tokens of the turn are excluded from the labels.
+                target_mask = encode_id.copy()
+                target_mask[:3] = [IGNORE_INDEX] * 3
+                target += target_mask
+
+        assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
+        input_ids.append(input_id)
+        targets.append(target)
+
+    return dict(
+        input_ids=torch.tensor(input_ids, dtype=torch.long),
+        labels=torch.tensor(targets, dtype=torch.long),
+    )
+
+def clip_or_pad(arr, fixed_len):
+    """Return ``arr`` cut or zero-padded along axis 0 to ``fixed_len`` rows."""
+    rows, cols = arr.shape
+    if rows >= fixed_len:
+        return arr[:fixed_len]
+    filler = np.zeros((fixed_len - rows, cols), dtype=arr.dtype)
+    return np.concatenate([arr, filler], axis=0)
+
+
+def get_annotations_from_lerobot_data(data_path, pitch_1, pitch_2, height):
+    """Collect per-episode annotations from every scene directory under ``data_path``.
+
+    For each scene, ``meta/episodes.jsonl`` and one parquet file per episode are read.
+    ``height``/``pitch_2`` select the ``pose.*``, ``goal.*`` and ``relative_goal_frame_id.*``
+    columns; ``height``/``pitch_1`` select a second ``pose.*`` column. Returns a dict with
+    ``axis_align_matrix`` (4x4 identity) and ``episodes``, ordered by sorted scene name.
+    A scene that raises while loading is reported with ``print`` and skipped.
+    """
+    import pyarrow.parquet as pq
+    from concurrent.futures import ThreadPoolExecutor
+    setting = f'{height}cm_{pitch_2}deg'
+    # Built directly: str.replace on the pitch digits could also rewrite digits of the height.
+    setting_horizon = f'{height}cm_{pitch_1}deg'
+    annotations = {
+        "axis_align_matrix": [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]],
+        "episodes": []
+    }
+    # Sorted because os.listdir order is arbitrary; results are gathered in this order below.
+    scene_ids = sorted(d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d)))
+
+    def process_scene(scene_id):
+        scene_path = os.path.join(data_path, scene_id)
+        episodes = read_jsonl(os.path.join(scene_path, "meta", "episodes.jsonl"))
+        scene_annotations = []
+
+        for ep in episodes:
+            ep_id = ep["episode_index"]
+            ep_len = ep["length"]
+            parquet_path = os.path.join(scene_path, "data", f"chunk-{ep_id // 1000:03d}", f"episode_{ep_id:06d}.parquet")
+            df = pq.read_table(parquet_path).to_pandas()
+            ep_actions = df["action"].tolist()
+            pose_key = f"pose.{setting}"
+            goal_key = f"goal.{setting}"
+            relative_goal_frame_id_key = f"relative_goal_frame_id.{setting}"
+            ep_poses_horizon = df[f"pose.{setting_horizon}"].apply(lambda x: x.tolist()).tolist()
+            if pose_key in df.columns and goal_key in df.columns and relative_goal_frame_id_key in df.columns:
+                ep_poses = df[pose_key].apply(lambda x: x.tolist()).tolist()
+                ep_pixel_goals = [
+                    [df[relative_goal_frame_id_key][idx].tolist(), df[goal_key][idx].tolist()]
+                    for idx in range(len(df))
+                ]
+            else:
+                # These names used to stay unbound (or stale) here; -1 is the "no goal" marker read by the dataset.
+                print(f"Warning: Missing data for setting {setting} in episode {ep_id}, filling with defaults.")
+                ep_poses = []
+                ep_pixel_goals = [[-1, None] for _ in range(len(df))]
+
+            assert len(ep_actions) == ep_len, f"Action length mismatch in episode {ep_id}"
+            scene_annotations.append({
+                "id": ep_id,
+                "video": f"{data_path}/{scene_id}/videos/chunk-{ep_id // 1000:03d}",
+                "instructions": ep["tasks"][0].split(";")[0],
+                "actions": ep_actions,
+                "length": ep_len,
+                f"poses_{setting}": ep_poses,
+                f"poses_{setting_horizon}": ep_poses_horizon,
+                "pixel_goals": ep_pixel_goals,
+                "dialogs": ep["dialogs"],
+            })
+        return scene_annotations
+
+    with ThreadPoolExecutor(max_workers=4) as executor:
+        futures = [executor.submit(process_scene, scene_id) for scene_id in scene_ids]
+        for scene_id, future in zip(scene_ids, futures):  # submission order, not completion order
+            try:
+                annotations["episodes"].extend(future.result())
+            except Exception as e:
+                print(f"Error processing scene {scene_id}: {e}")
+    return annotations
+
+def get_turn_actions(actions, start_frame_id, num_future_steps):
+    """Return the leading run of equal actions within ``num_future_steps`` frames from ``start_frame_id``."""
+    if not (0 <= start_frame_id < len(actions)):
+        return []  # start frame outside the episode
+    window = actions[start_frame_id : start_frame_id + num_future_steps]
+    run_len = next((k for k, a in enumerate(window) if a != window[0]), len(window))  # window[0] is read lazily, so an empty window gives []
+    return window[:run_len]
+
+def sort_dialogs_by_true_idx(dialogs):
+    """Order consecutive dialog pairs by frame index.
+
+    ``dialogs`` is cut into chunks of two entries. Each chunk is keyed by the
+    largest ``true_idx`` among its entries (``inf`` for an entry without one)
+    and the chunks are sorted by that key, ties keeping their input order.
+
+    Returns:
+        A tuple ``(sorted_dialogs, unique_true_idx)``: the re-flattened entries
+        and the distinct chunk keys in sorted order.
+    """
+    chunks = [dialogs[start:start + 2] for start in range(0, len(dialogs), 2)]
+    chunk_keys = [
+        max(entry.get("true_idx", float("inf")) for entry in chunk)
+        for chunk in chunks
+    ]
+    # sorted() is stable, so equal keys keep their original relative order.
+    order = sorted(range(len(chunks)), key=chunk_keys.__getitem__)
+
+    sorted_dialogs = [entry for pos in order for entry in chunks[pos]]
+    # dict.fromkeys de-duplicates while preserving first-seen order.
+    unique_true_idx = list(dict.fromkeys(chunk_keys[pos] for pos in order))
+    return sorted_dialogs, unique_true_idx
+
+def get_history_dialogs(start_frame_id, dialogs, dia_idx):
+    """Return the first ``2 * k`` entries of ``dialogs``, ``k`` being how many ``dia_idx`` values precede ``start_frame_id``."""
+    # bisect_left gives the number of entries of the sorted ``dia_idx`` that are
+    # strictly smaller than ``start_frame_id``.
+    earlier = bisect_left(dia_idx, start_frame_id)
+    return dialogs[:2 * earlier] if earlier else []
+
+def build_dialog_history(history_id, dialog_id, dialogs):
+    """Build the history text: one ``<image>`` line per history frame, plus dialog blocks.
+    A frame listed in ``dialog_id`` gets the messages whose ``true_idx`` equals it, wrapped
+    in ``<|dialog_start|>``/``<|dialog_end|>``, right after its image line.
+    """
+    slots = [''] * (len(history_id) + 1)
+    for frame in dialog_id:
+        slot = history_id.index(frame) + 1  # slot k+1 follows the k-th image line
+        spoken = "".join(f"<|{d['role']}|>{d['message']}" for d in dialogs if d['true_idx'] == frame)
+        slots[slot] = "<|dialog_start|>" + spoken + "<|dialog_end|>"
+    return '<image>\n'.join(slots)
+
+def enforce_simple_limit(conv, limit,
+    sorry_msg: str = "Sorry, you have reached the question limit. No further answers are available."):
+    """Keep at most ``limit`` answers in a conversation, replacing the rest.
+
+    Answers are numbered in this order: every ``<|oracle|>...<|dialog_end|>`` span
+    in the first message's ``value``, then every ``human`` message that directly
+    follows a ``gpt`` message whose ``value`` starts with ``<talk>``. An answer
+    numbered ``limit`` or higher has its text replaced by ``sorry_msg``.
+
+    Args:
+        conv: one-element list holding the list of message dicts; the dicts are
+            shallow-copied, so the caller's messages are not modified.
+        limit: how many answers keep their original text.
+        sorry_msg: text substituted for the remaining answers.
+
+    Returns:
+        A one-element list holding the rewritten message dicts.
+    """
+    messages = [dict(m) for m in conv[0]]
+
+    first_val = messages[0].get('value', '') if messages else ''
+    oracle_count = len(_ORACLE_BLOCK.findall(first_val))
+
+    if oracle_count:
+        numbering = itertools.count()
+
+        def _repl(m):
+            # Spans are visited left to right, so the counter is the answer number.
+            if next(numbering) >= limit:
+                return '<|oracle|>' + sorry_msg + '<|dialog_end|>'
+            return m.group(0)
+
+        messages[0]['value'] = _ORACLE_BLOCK.sub(_repl, first_val)
+
+    # Follow-up answers continue the numbering after the oracle spans.
+    answer_no = oracle_count
+    for k in range(len(messages) - 1):
+        asked = messages[k].get('from', '') == 'gpt' and messages[k].get('value', '').lstrip().startswith('<talk>')
+        if asked and messages[k + 1].get('from', '') == 'human':
+            if answer_no >= limit:
+                messages[k + 1]['value'] = sorry_msg
+            answer_no += 1
+
+    return [messages]
+
+class VLLN_Dataset(Dataset):
+    """Navigation and dialog training samples built from the configured datasets.
+
+    ``list_data_dict`` holds tuples ``(ep_id, data_path, video, height, pitch_1, pitch_2,
+    instruction, start_frame_id, action, pose, history_dialogs, current_dialog)``;
+    ``__getitem__`` turns one tuple into token ids, labels, position ids and image tensors.
+    """
+
+    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
+        super(VLLN_Dataset, self).__init__()
+        dataset = data_args.vln_dataset_use.split(",")
+        dataset_list = data_list(dataset)
+        rank0_print(f"Loading datasets: {dataset_list}")
+        self.video_max_total_pixels = getattr(data_args, "video_max_total_pixels", 1664 * 28 * 28)
+        self.video_min_total_pixels = getattr(data_args, "video_min_total_pixels", 256 * 28 * 28)
+        self.model_type = data_args.model_type
+        if data_args.model_type == "qwen2.5vl":
+            self.get_rope_index = get_rope_index_25
+        else:
+            self.get_rope_index = get_rope_index_2
+
+        self.sample_step = data_args.sample_step
+        self.predict_step_num = data_args.predict_step_num
+        self.pixel_goal_only = data_args.pixel_goal_only
+        self.num_future_steps = data_args.num_future_steps
+        self.max_dialog_turns = data_args.max_dialog_turns
+
+        self.list_data_dict = []
+
+        for data in dataset_list:
+            sampling_rate = data.get("sampling_rate", 1.0)
+            height = data.get("height", None)
+            pitch_1 = data.get("pitch_1", None)
+            pitch_2 = data.get("pitch_2", None)
+            data_path = data['data_path']
+
+            annotations = get_annotations_from_lerobot_data(data_path, pitch_1, pitch_2, height)
+
+            # One list per sample category; they are merged after the episode loop.
+            pixel_goal_list = []
+            turn_list = []
+            stop_list = []
+            dialog_list = []
+
+            for item in annotations['episodes']:
+                ep_id = item['id']
+                instruction = item['instructions']
+                video = item['video']
+                dialogs = item['dialogs']
+                dialogs, dia_idx = sort_dialogs_by_true_idx(dialogs)
+                # Drop the first action and append a final 0 (rendered as STOP).
+                actions = item['actions'][1:] + [0]
+                pixel_goals = item['pixel_goals']
+                poses = item[f'poses_{height}cm_{pitch_1}deg']
+
+                actions_len = len(actions)
+                if actions_len < 4:
+                    continue
+
+                # Waypoint and turn samples, taken every ``sample_step`` frames.
+                num_rounds = actions_len // self.sample_step
+                for n in range(num_rounds+1):
+                    if n * self.sample_step == actions_len or n * self.sample_step == actions_len - 1:
+                        continue
+                    start_frame_id = n * self.sample_step
+                    action_flag = actions[start_frame_id]
+                    pixel_goal = pixel_goals[start_frame_id]
+                    history_dialogs = get_history_dialogs(start_frame_id, dialogs, dia_idx)
+                    if pixel_goal[0]==-1:
+                        if action_flag == 1:
+                            continue
+                        else:
+                            turn_actions = get_turn_actions(actions, start_frame_id, self.num_future_steps)
+                            turn_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, turn_actions, None, history_dialogs, None))
+                    else:
+                        goal_len = pixel_goal[0]
+                        action = pixel_goal[1]
+                        pose = poses[start_frame_id:start_frame_id+goal_len]
+                        pixel_goal_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, action, pose, history_dialogs, None))
+                # One STOP sample at the last frame of the episode.
+                stop_frame = actions_len - 1
+                stop_history = get_history_dialogs(stop_frame, dialogs, dia_idx)
+                stop_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, actions_len-1, 0, None, stop_history, None))
+                # One sample per dialog frame, carrying the dialog that takes place there.
+                for n in range(len(dia_idx)):
+                    start_frame_id = dia_idx[n]
+                    action = actions[start_frame_id : start_frame_id + self.num_future_steps]
+                    history_dialogs = get_history_dialogs(start_frame_id, dialogs, dia_idx)
+                    current_dialog = [sentence for sentence in dialogs if sentence['true_idx'] == start_frame_id]
+                    if action[0] == 1:
+                        pixel_goal = pixel_goals[start_frame_id]
+                        if pixel_goal[0] != -1:
+                            goal_len = pixel_goal[0]
+                            pose = poses[start_frame_id:start_frame_id+goal_len]
+                            dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, pixel_goal[1], pose, history_dialogs, current_dialog))
+                        else:
+                            continue
+                    elif action[0] == 0:
+                        dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, 0, None, history_dialogs, current_dialog))
+                    else:
+                        turn_actions = get_turn_actions(actions, start_frame_id, self.num_future_steps)
+                        dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, turn_actions, None, history_dialogs, current_dialog))
+
+            list_data_dict = pixel_goal_list
+            rank0_print(len(turn_list), len(pixel_goal_list), len(stop_list), len(dialog_list))
+            if not self.pixel_goal_only:
+                list_data_dict += turn_list
+                list_data_dict += stop_list * 10  # stop and dialog samples are each repeated 10 times
+                list_data_dict += dialog_list * 10
+            if sampling_rate < 1.0:
+                list_data_dict = random.sample(
+                    list_data_dict, int(len(list_data_dict) * sampling_rate)
+                )
+                print(f"sampling {len(list_data_dict)} examples from dataset {data}")
+            else:
+                rank0_print(f"dataset name: {data}")
+
+            self.list_data_dict.extend(list_data_dict)
+
+        self.num_history = data_args.num_history
+        self.idx2actions = {
+            0: 'STOP',
+            1: "↑",
+            2: "←",
+            3: "→",
+            5: "↓"
+        }
+        self.conjunctions = [
+            'you can see ',
+            'in front of you is ',
+            'there is ',
+            'you can spot ',
+            'you are toward the ',
+            'ahead of you is ',
+            'in your sight is '
+        ]
+        self.data_args = data_args
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.list_data_dict)
+
+    def process_image_unified(self, image):
+        """Run a copy of the image processor on ``image``; returns ``(pixel_values, image_grid_thw[0])``."""
+        processor = copy.deepcopy(self.data_args.image_processor)
+
+        visual_processed = processor.preprocess(image, return_tensors="pt")
+        image_tensor = visual_processed["pixel_values"]
+        if isinstance(image_tensor, List):
+            image_tensor = image_tensor[0]
+        grid_thw = visual_processed["image_grid_thw"][0]
+        return image_tensor, grid_thw
+
+    def __getitem__(self, i):
+        """Build the model inputs (token ids, labels, position ids, image tensors) for sample ``i``."""
+        ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, action, pose, history_dialogs, current_dialog = self.list_data_dict[i]
+        dialogs_id = np.array([dialog['true_idx'] for dialog in history_dialogs])[::2]
+        if start_frame_id != 0:
+            history_id = np.unique(np.concatenate([np.linspace(0, start_frame_id-1, self.num_history, dtype=np.int32),dialogs_id])).tolist()
+        else:
+            history_id = []
+
+        images = []
+        grid_thws = []
+
+        wanted = set(history_id) | {start_frame_id}
+        for frame_id in range(0, start_frame_id + 1):
+            if frame_id not in wanted:
+                continue
+            file_name = f"episode_{ep_id:06d}_{frame_id}.jpg"
+            image = Image.open(os.path.join(video, f"observation.images.rgb.{height}cm_{pitch_1}deg", file_name)).convert('RGB')
+            if self.data_args.transform_train is not None:
+                image = self.data_args.transform_train(image)
+            views = [image]
+            if frame_id == start_frame_id and pose is not None:
+                # Only the current frame of a waypoint sample uses the look-down view, so it is opened only here.
+                views.append(Image.open(os.path.join(video, f"observation.images.rgb.{height}cm_{pitch_2}deg", file_name)).convert('RGB'))
+            for view in views:
+                image_tensor, grid_thw = self.process_image_unified(view)
+                images.append(image_tensor)
+                grid_thws.append(grid_thw)
+
+        if history_dialogs:
+            history_imgs = build_dialog_history(history_id, dialogs_id, history_dialogs)
+        else:
+            history_imgs = "<image>\n"*len(history_id)
+
+        if start_frame_id != 0:
+            chat_sources = [[{'from': 'human', 'value': f"You are an autonomous navigation assistant. Your task is to <instruction> There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 30 degrees. Please output STOP when you have successfully completed the task. These are your historical observations: <history>. {random.choice(self.conjunctions)}<image>."}]]
+            chat_sources[0][0]['value'] = chat_sources[0][0]['value'].replace('<instruction>', instruction).replace('<history>', history_imgs)
+        else:
+            chat_sources = [[{'from': 'human', 'value': f"You are an autonomous navigation assistant. Your task is to <instruction> There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 30 degrees. Please output STOP when you have successfully completed the task. {random.choice(self.conjunctions)}<image>."}]]
+            chat_sources[0][0]['value'] = chat_sources[0][0]['value'].replace('<instruction>', instruction)
+
+        if current_dialog is not None:
+            for turn in range(len(current_dialog) // 2):
+                chat_sources[0].extend([{'from': 'gpt', 'value': '<talk>' + current_dialog[2*turn]['message']}])
+                chat_sources[0].extend([{'from': 'human', 'value': current_dialog[2*turn+1]['message']}])
+
+        if pose is not None:
+            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + self.idx2actions[5]}, {'from': 'human', 'value': f'{random.choice(self.conjunctions)}<image>.'}, {'from': 'gpt', 'value': '<move>' + f'{action[0]} {action[1]}'}])
+        elif action == 0:
+            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + self.idx2actions[action]}])
+        else:
+            turn_action_text = ''.join([self.idx2actions[idx] for idx in action])
+            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + turn_action_text}])
+        chat_sources = enforce_simple_limit(chat_sources, limit = random.randint(0, self.max_dialog_turns))
+
+        merge_area = self.data_args.image_processor.merge_size**2
+        grid_thw_merged = [thw.prod() // merge_area for thw in grid_thws]  # pad-token count per image
+        data_dict = preprocess_qwen_2_visual(
+            chat_sources,
+            self.tokenizer,
+            grid_thw_image=grid_thw_merged if grid_thw_merged else None,
+        )
+        position_ids, _ = self.get_rope_index(
+            self.data_args.image_processor.merge_size,
+            data_dict["input_ids"],
+            image_grid_thw=torch.stack(grid_thws, dim=0) if grid_thws else None,
+        )
+
+        data_dict["position_ids"] = position_ids
+        data_dict["attention_mask"] = [data_dict["input_ids"][0].size(0)]
+        data_dict["pixel_values"] = torch.cat(images, dim=0)
+        data_dict["image_grid_thw"] = torch.cat([thw.unsqueeze(0) for thw in grid_thws], dim=0)
+
+        return data_dict
+
+
+def pad_and_cat(tensor_list):
+    """Pad tensors to a common length and concatenate them along dim 1.
+
+    Every tensor is right-padded on its last dimension with the value 1 until
+    it is as long as the largest ``shape[2]`` in ``tensor_list``.
+    """
+    longest = max(t.shape[2] for t in tensor_list)
+    padded = [
+        torch.nn.functional.pad(t, (0, longest - t.shape[2]), "constant", 1)
+        for t in tensor_list
+    ]
+    return torch.cat(padded, dim=1)
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """Collate samples for supervised fine-tuning into one padded batch."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    @staticmethod
+    def _gather_visual(instances, pixel_key, grid_key):
+        """Concatenate the visual tensors of the instances that carry them.
+
+        Returns ``(pixels, grids)``, each concatenated on dim 0, or ``(None, None)``
+        when no instance has ``pixel_key``.
+        """
+        pixels = [instance[pixel_key] for instance in instances if pixel_key in instance]
+        if not pixels:
+            return None, None
+        grids = [instance[grid_key] for instance in instances if grid_key in instance]
+        return torch.cat(pixels, dim=0), torch.cat(grids, dim=0)
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        """Pad the samples in ``instances`` to a common length and batch them.
+
+        Args:
+            instances: per-sample dicts with ``input_ids``, ``labels`` and
+                ``position_ids``; ``pixel_values``/``image_grid_thw`` and
+                ``pixel_values_videos``/``video_grid_thw`` are optional.
+
+        Returns:
+            Batch dict. ``input_ids`` is padded with the tokenizer's pad token,
+            ``labels`` with ``IGNORE_INDEX`` and ``position_ids`` by ``pad_and_cat``;
+            all three are cut to ``tokenizer.model_max_length``. ``attention_mask``
+            is true where ``input_ids`` differs from the pad token. Visual entries
+            are concatenated on dim 0, or ``None`` when no sample provides them.
+        """
+        max_len = self.tokenizer.model_max_length
+        pad_id = self.tokenizer.pad_token_id
+
+        # squeeze(0) drops dim 0 of each sample tensor when that dim has size one.
+        ids_per_sample = [instance["input_ids"].squeeze(0) for instance in instances]
+        labels_per_sample = [instance["labels"].squeeze(0) for instance in instances]
+        positions_per_sample = [instance["position_ids"] for instance in instances]
+
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            ids_per_sample, batch_first=True, padding_value=pad_id
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(
+            labels_per_sample, batch_first=True, padding_value=IGNORE_INDEX
+        )
+        position_ids = pad_and_cat(positions_per_sample)
+
+        # Truncate every sequence-aligned tensor to the model's maximum length.
+        input_ids = input_ids[:, :max_len]
+        labels = labels[:, :max_len]
+        position_ids = position_ids[:, :, :max_len]
+
+        batch = dict(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=input_ids.ne(pad_id),
+        )
+        batch["pixel_values"], batch["image_grid_thw"] = self._gather_visual(
+            instances, "pixel_values", "image_grid_thw"
+        )
+        batch["pixel_values_videos"], batch["video_grid_thw"] = self._gather_visual(
+            instances, "pixel_values_videos", "video_grid_thw"
+        )
+        batch["position_ids"] = position_ids
+        return batch
+
+
+@dataclass
+class FlattenedDataCollatorForSupervisedDataset(DataCollatorForSupervisedDataset):
+    """Collate samples into one packed sequence, with multi-modal tensors alongside."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        """Concatenate the samples in ``instances`` end to end instead of padding.
+
+        Args:
+            instances: per-sample dicts with ``input_ids``, ``labels``,
+                ``position_ids`` and ``attention_mask`` (an iterable of sequence
+                lengths); ``pixel_values``/``image_grid_thw`` and
+                ``pixel_values_videos``/``video_grid_thw`` are optional.
+
+        Returns:
+            Batch dict. ``input_ids`` and ``labels`` are concatenated on dim 1 and
+            ``position_ids`` on dim 2. ``attention_mask`` holds the cumulative
+            sequence lengths as ``int32``, starting with 0. Visual entries are
+            concatenated on dim 0, or ``None`` when no sample provides them.
+        """
+        ids_per_sample = [instance["input_ids"] for instance in instances]
+        labels_per_sample = [instance["labels"] for instance in instances]
+        positions_per_sample = [instance["position_ids"] for instance in instances]
+        # Indexing (not .get) keeps the KeyError for a sample without "attention_mask".
+        lengths_per_sample = [instance["attention_mask"] for instance in instances]
+
+        flat_lengths = [0]
+        for lengths in lengths_per_sample:
+            flat_lengths.extend(lengths)
+        seq_lens = torch.tensor(flat_lengths, dtype=torch.int32)
+        cumsum_seq_lens = torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
+
+        batch = dict(
+            input_ids=torch.cat(ids_per_sample, dim=1),
+            labels=torch.cat(labels_per_sample, dim=1),
+            attention_mask=cumsum_seq_lens,
+            position_ids=torch.cat(positions_per_sample, dim=2),
+        )
+
+        image_tensors = [
+            instance["pixel_values"]
+            for instance in instances
+            if "pixel_values" in instance
+        ]
+        video_tensors = [
+            instance["pixel_values_videos"]
+            for instance in instances
+            if "pixel_values_videos" in instance
+        ]
+
+        concat_images = grid_thw = None
+        if image_tensors:
+            concat_images = torch.cat(image_tensors, dim=0)
+            grid_thw = torch.cat(
+                [instance["image_grid_thw"] for instance in instances if "image_grid_thw" in instance],
+                dim=0,
+            )
+
+        concat_videos = video_grid_thw = None
+        if video_tensors:
+            concat_videos = torch.cat(video_tensors, dim=0)
+            video_grid_thw = torch.cat(
+                [instance["video_grid_thw"] for instance in instances if "video_grid_thw" in instance],
+                dim=0,
+            )
+
+        batch["pixel_values"] = concat_images
+        batch["image_grid_thw"] = grid_thw
+        batch["pixel_values_videos"] = concat_videos
+        batch["video_grid_thw"] = video_grid_thw
+        return batch
+
+
+def make_supervised_data_module(
+    tokenizer: transformers.PreTrainedTokenizer, data_args
+) -> Dict:
+    """Create the training dataset and its matching collator.
+
+    ``data_args.data_flatten`` selects the packed-sequence collator; otherwise the
+    padding collator is used. ``eval_dataset`` is always ``None``.
+    """
+    train_dataset = VLLN_Dataset(tokenizer=tokenizer, data_args=data_args)
+    if data_args.data_flatten:
+        collator_cls = FlattenedDataCollatorForSupervisedDataset
+    else:
+        collator_cls = DataCollatorForSupervisedDataset
+    return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=collator_cls(tokenizer=tokenizer))
+
+
+if __name__ == "__main__":  # running the module directly does nothing
+    pass
diff --git a/internnav/trainer/internvla_n1_argument.py b/internnav/trainer/internvla_n1_argument.py
index 7bdd68ad..b6c07e4a 100644
--- a/internnav/trainer/internvla_n1_argument.py
+++ b/internnav/trainer/internvla_n1_argument.py
@@ -38,6 +38,7 @@ class DataArguments:
     resize_h: Optional[int] = field(default=384)
     resize_w: Optional[int] = field(default=384)
     num_future_steps: Optional[int] = field(default=4)
+    max_dialog_turns: Optional[int] = field(default=6)
 
 
 @dataclass
diff --git a/internnav/trainer/internvla_vlln_trainer.py b/internnav/trainer/internvla_vlln_trainer.py
new file mode 100644
index 00000000..cf602ff6
--- /dev/null
+++ b/internnav/trainer/internvla_vlln_trainer.py
@@ -0,0 +1,239 @@
+# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
+# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+import logging
+import os
+import pathlib
+import sys
+from pathlib import Path
+from typing import Dict
+
+import torch
+import transformers
+from torchvision.transforms import v2
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.append(str(project_root))
+
+from qwenvl_base import replace_qwen2_vl_attention_class
+from transformers import (
+    AutoProcessor,
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen2VLForConditionalGeneration,
+    Qwen2VLImageProcessor,
+    Trainer,
+)
+
+from internnav.dataset.vlln_lerobot_dataset import make_supervised_data_module
+from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
+from internnav.trainer.internvla_n1_argument import (
+    DataArguments,
+    ModelArguments,
+    TrainingArguments,
+)
+
+
+def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
+    """Collects the state dict and dump to disk."""
+
+    if trainer.deepspeed:
+        torch.cuda.synchronize()
+        trainer.save_model(output_dir)
+        return
+
+    state_dict = trainer.model.state_dict()
+    if trainer.args.should_save:
+        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
+        del state_dict
+        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+def smart_tokenizer_and_embedding_resize(
+    special_tokens_dict: Dict,
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+):
+    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    model.resize_token_embeddings(len(tokenizer))
+
+    if num_new_tokens > 0:
+        input_embeddings = model.get_input_embeddings().weight.data
+        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+        input_embeddings[-num_new_tokens:] = input_embeddings_avg
+
+
+def set_model(model_args, model):
+    if model_args.tune_mm_vision:
+        for n, p in model.visual.named_parameters():
+            p.requires_grad = True
+    else:
+        for n, p in model.visual.named_parameters():
+            p.requires_grad = False
+
+    if model_args.tune_mm_mlp:
+        for n, p in model.visual.merger.named_parameters():
+            p.requires_grad = True
+    else:
+        for n, p in model.visual.merger.named_parameters():
+            p.requires_grad = False
+
+    if model_args.tune_mm_llm:
+        for n, p in model.model.named_parameters():
+            p.requires_grad = True
+        model.lm_head.requires_grad = True
+    else:
+        for n, p in model.model.named_parameters():
+            p.requires_grad = False
+        # model.lm_head.requires_grad = False
+        for n, p in model.lm_head.named_parameters():
+            p.requires_grad = False
+
+    if 'nextdit' in model_args.system1:
+        modules = [
+            'action_encoder',
+            'action_decoder',
+            'traj_dit',
+            'cond_projector',
+            'memory_encoder',
+            'rgb_resampler',
+            'rgb_model',
+        ]
+        for n, p in model.model.named_parameters():
+            if any(k in n for k in modules):
+                p.requires_grad = True
+        model.model.latent_queries.requires_grad = True
+    elif 'navdp' in model_args.system1:
+        for n, p in model.model.navdp.named_parameters():
+            if "rgb_model" not in n:
+                p.requires_grad = True
+        model.model.latent_queries.requires_grad = True
+
+
+def train(attn_implementation="flash_attention_2"):
+    global local_rank
+
+    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    local_rank = training_args.local_rank
+    os.makedirs(training_args.output_dir, exist_ok=True)
+
+    if data_args.data_augmentation:
+        data_args.transform_train = v2.Compose(
+            [
+                v2.ToImage(),
+                v2.ColorJitter(brightness=0.2, saturation=0.2),
+                v2.RandomPosterize(bits=4),
+                v2.RandomAdjustSharpness(sharpness_factor=1.5),
+                v2.RandomAutocontrast(),
+                v2.ToPILImage(),
+                v2.Resize((data_args.resize_h, data_args.resize_w)),
+            ]
+        )
+    else:
+        data_args.transform_train = v2.Resize((data_args.resize_h, data_args.resize_w))
+
+    if 'internvla-n1-system2' in model_args.model_name_or_path.lower():
+        model = InternVLAN1ForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            attn_implementation=attn_implementation,
+            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+        )
+        data_args.image_processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path,
+        ).image_processor
+        data_args.model_type = "internvla-n1"
+    elif "qwen2.5" in model_args.model_name_or_path.lower():
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            attn_implementation=attn_implementation,
+            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+        )
+        data_args.image_processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path,
+        ).image_processor
+        data_args.model_type = "qwen2.5vl"
+    else:
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            attn_implementation=attn_implementation,
+            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+        )
+        data_args.image_processor = Qwen2VLImageProcessor.from_pretrained(
+            model_args.model_name_or_path,
+        )
+        data_args.model_type = "qwen2vl"
+
+    if data_args.data_flatten:
+        replace_qwen2_vl_attention_class()
+    model.config.use_cache = False
+
+    if training_args.gradient_checkpointing:
+        if hasattr(model, "enable_input_require_grads"):
+            model.enable_input_require_grads()
+        else:
+
+            def make_inputs_require_grad(module, input, output):
+                output.requires_grad_(True)
+
+            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=training_args.cache_dir,
+        model_max_length=training_args.model_max_length,
+        padding_side="right",
+        use_fast=False,
+    )
+
+    if data_args.model_type == "internvla-n1":
+        model.get_model().initialize_vision_modules(model_args=model_args)
+    set_model(model_args, model)
+
+    if torch.distributed.get_rank() == 0:
+        model.visual.print_trainable_parameters()
+        model.model.print_trainable_parameters()
+
+    if data_args.data_packing:
+        data_module = make_supervised_data_module_packed(tokenizer=tokenizer, data_args=data_args)  # noqa: F821
+    else:
+        data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
+    trainer = Trainer(model=model, processing_class=tokenizer, args=training_args, **data_module)
+    from tabulate import tabulate
+
+    if trainer.is_world_process_zero():
+        stat = []
+        for i, (n, p) in enumerate(trainer.model.named_parameters()):
+            stat.append([i, n, p.shape, p.requires_grad])
+        print(tabulate(stat, headers=["idx", "name", "shape", "trainable"]))
+    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
+        logging.info("checkpoint found, resume training")
+        trainer.train(resume_from_checkpoint=True)
+    else:
+        trainer.train()
+    trainer.save_state()
+    data_args.image_processor.save_pretrained(training_args.output_dir)
+
+    model.config.use_cache = True
+
+    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
+
+
+if __name__ == "__main__":
+    train(attn_implementation="flash_attention_2")
diff --git a/scripts/train/qwenvl_train/train_system2_vlln.sh b/scripts/train/qwenvl_train/train_system2_vlln.sh
new file mode 100644
index 00000000..a1a8760e
--- /dev/null
+++ b/scripts/train/qwenvl_train/train_system2_vlln.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#SBATCH -J qwenvl
+#SBATCH -p gpu_partition
+#SBATCH -N 8
+#SBATCH --gres=gpu:8
+#SBATCH --cpus-per-task=8
+#SBATCH --ntasks-per-node=1
+#SBATCH -o ./slurm-%j.out
+#SBATCH -e ./slurm-%j.err
+
+# Distributed training configuration
+MASTER_ADDR=`scontrol show hostname $SLURM_JOB_NODELIST | head -n1`
+MASTER_PORT=$((RANDOM % 101 + 20001))
+
+# DeepSpeed configuration
+deepspeed=scripts/train/qwenvl_train/zero2.json
+
+# Model configuration
+llm=Qwen/Qwen2.5-VL-7B-Instruct
+
+# Training hyperparameters
+lr=2e-5
+vision_tower_lr=5e-6
+batch_size=2
+grad_accum_steps=1
+max_pixels=313600
+min_pixels=3136
+
+# Dataset configuration (replace with public dataset names)
+vln_datasets=iion_split1,iion_split2 #,iion_split3
+
+# Output configuration
+run_name=InternVLA-N1-System2
+output_dir=checkpoints/${run_name}
+
+srun torchrun --nnodes=$SLURM_NNODES --nproc_per_node=8 \
+    --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
+    internnav/trainer/internvla_vlln_trainer.py \
+    --deepspeed ${deepspeed} \
+    --model_name_or_path "${llm}" \
+    --vln_dataset_use ${vln_datasets} \
+    --data_flatten False \
+    --tune_mm_vision True \
+    --tune_mm_mlp True \
+    --tune_mm_llm True \
+    --bf16 \
+    \
+    --max_dialog_turns 6 \
+    --num_history 8 \
+    --data_augmentation True \
+    --resize_h 384 \
+    --resize_w 384 \
+    --sample_step 4 \
+    --num_future_steps 4 \
+    --predict_step_num 32 \
+    --pixel_goal_only False \
+    --system1 "none" \
+    \
+    --output_dir ${output_dir} \
+    --num_train_epochs 6.0 \
+    --per_device_train_batch_size ${batch_size} \
+    --per_device_eval_batch_size $((batch_size*2)) \
+    --gradient_accumulation_steps ${grad_accum_steps} \
+    --max_pixels ${max_pixels} \
+    --min_pixels ${min_pixels} \
+    --eval_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 5000 \
+    --save_total_limit 5 \
+    --learning_rate ${lr} \
+    --vision_tower_lr ${vision_tower_lr} \
+    --weight_decay 0 \
+    --warmup_ratio 0.003 \
+    --max_grad_norm 1 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --model_max_length 8192 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 8 \
+    --run_name ${run_name} \
+    --report_to wandb
diff --git a/traj_data b/traj_data
new file mode 120000
index 00000000..3ef1857d
--- /dev/null
+++ b/traj_data
@@ -0,0 +1 @@
+/mnt/inspurfs/mozi_t/zhushaohao/000ICRA_training_data/traj_data
\ No newline at end of file

From 8202df4d41bb739f5b15683ea530fd097a784332 Mon Sep 17 00:00:00 2001
From: zhushaohao <shaohao9.zhu@gmail.com>
Date: Tue, 16 Dec 2025 16:04:41 +0800
Subject: [PATCH 2/9] remove traj_data symlink to a local absolute path

---
 traj_data | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 traj_data

diff --git a/traj_data b/traj_data
deleted file mode 120000
index 3ef1857d..00000000
--- a/traj_data
+++ /dev/null
@@ -1 +0,0 @@
-/mnt/inspurfs/mozi_t/zhushaohao/000ICRA_training_data/traj_data
\ No newline at end of file

From 620a2fa2af6db46c771c59d7e55295faaa724f8e Mon Sep 17 00:00:00 2001
From: zhushaohao <shaohao9.zhu@gmail.com>
Date: Tue, 16 Dec 2025 19:35:56 +0800
Subject: [PATCH 3/9] Remove VLLN trainer; unify training for VLN and IION
 datasets

---
 .../dataset/internvla_n1_lerobot_dataset.py   |  37 ++-
 internnav/dataset/vlln_lerobot_dataset.py     |   4 +-
 internnav/trainer/internvla_n1_argument.py    |   1 +
 internnav/trainer/internvla_vlln_trainer.py   | 239 ------------------
 .../train/qwenvl_train/train_system2_vlln.sh  |   4 +-
 5 files changed, 40 insertions(+), 245 deletions(-)
 delete mode 100644 internnav/trainer/internvla_vlln_trainer.py

diff --git a/internnav/dataset/internvla_n1_lerobot_dataset.py b/internnav/dataset/internvla_n1_lerobot_dataset.py
index 8cd39a67..025ed383 100644
--- a/internnav/dataset/internvla_n1_lerobot_dataset.py
+++ b/internnav/dataset/internvla_n1_lerobot_dataset.py
@@ -16,7 +16,7 @@
 from torch.utils.data import Dataset
 from torchcodec.decoders import VideoDecoder
 from transformers.image_utils import to_numpy_array
-
+from .vlln_lerobot_dataset import VLLN_Dataset
 from .rope2d import get_rope_index_2, get_rope_index_25
 
 # Define placeholders for dataset paths
@@ -1329,10 +1329,43 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
 
         return batch
 
+class CombineDataset(Dataset):
+    def __init__(self, datasets, shuffle=False):
+        super(CombineDataset, self).__init__()
+        self.datasets = datasets
+        self.lengths = [len(dataset) for dataset in datasets]
+        self.cum_lengths = np.cumsum(self.lengths)
+        self.total_length = self.cum_lengths[-1]
+        self.shuffle_enabled = shuffle
+        self.indices = np.arange(self.total_length)
+        if self.shuffle_enabled:
+            self.shuffle()
+
+    def shuffle(self):
+        np.random.shuffle(self.indices)
+
+    def _map_index(self, idx):
+        return self.indices[idx]
+
+    def __len__(self):
+        return self.cum_lengths[-1]
+
+    def __getitem__(self, i):
+        real_idx = self._map_index(i)
+        for idx, cum_len in enumerate(self.cum_lengths):
+            if real_idx < cum_len:
+                return self.datasets[idx][real_idx - cum_len + self.lengths[idx]]
+        raise ValueError(f"Index {real_idx} out of bound")
+
 
 def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
-    train_dataset = NavPixelGoalDataset(tokenizer=tokenizer, data_args=data_args)
+    train_datasets = []
+    if data_args.iion_dataset_use:
+        train_datasets.append(VLLN_Dataset(tokenizer=tokenizer, data_args=data_args))
+    if data_args.vln_dataset_use:
+        train_datasets.append(NavPixelGoalDataset(tokenizer=tokenizer, data_args=data_args))
+    train_dataset = CombineDataset(train_datasets, shuffle=False)
     # train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_args=data_args)
     if data_args.data_flatten:
         data_collator = FlattenedDataCollatorForSupervisedDataset(tokenizer=tokenizer)
diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index b0e78843..e1455cee 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -365,7 +365,7 @@ def _repl(m):
 class VLLN_Dataset(Dataset):
     def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
         super(VLLN_Dataset, self).__init__()
-        dataset = data_args.vln_dataset_use.split(",")
+        dataset = data_args.iion_dataset_use.split(",")
         dataset_list = data_list(dataset)
         rank0_print(f"Loading datasets: {dataset_list}")
         self.video_max_total_pixels = getattr(
@@ -749,7 +749,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         return batch
 
 
-def make_supervised_data_module(
+def make_supervised_data_module_vlln(
     tokenizer: transformers.PreTrainedTokenizer, data_args
 ) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
diff --git a/internnav/trainer/internvla_n1_argument.py b/internnav/trainer/internvla_n1_argument.py
index b6c07e4a..bc8e0361 100644
--- a/internnav/trainer/internvla_n1_argument.py
+++ b/internnav/trainer/internvla_n1_argument.py
@@ -29,6 +29,7 @@ class DataArguments:
     video_min_frame_pixels: int = field(default=4 * 28 * 28)
 
     vln_dataset_use: str = field(default="")
+    iion_dataset_use: str = field(default="")
     sample_step: int = field(default=4)
     num_history: Optional[int] = field(default=8)
     predict_step_num: Optional[int] = field(default=32)
diff --git a/internnav/trainer/internvla_vlln_trainer.py b/internnav/trainer/internvla_vlln_trainer.py
deleted file mode 100644
index cf602ff6..00000000
--- a/internnav/trainer/internvla_vlln_trainer.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
-# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
-#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
-#
-#    Licensed under the Apache License, Version 2.0 (the "License");
-#    you may not use this file except in compliance with the License.
-#    You may obtain a copy of the License at
-#
-#        http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS,
-#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#    See the License for the specific language governing permissions and
-#    limitations under the License.
-
-import logging
-import os
-import pathlib
-import sys
-from pathlib import Path
-from typing import Dict
-
-import torch
-import transformers
-from torchvision.transforms import v2
-
-project_root = Path(__file__).parent.parent.parent
-sys.path.append(str(project_root))
-
-from qwenvl_base import replace_qwen2_vl_attention_class
-from transformers import (
-    AutoProcessor,
-    Qwen2_5_VLForConditionalGeneration,
-    Qwen2VLForConditionalGeneration,
-    Qwen2VLImageProcessor,
-    Trainer,
-)
-
-from internnav.dataset.vlln_lerobot_dataset import make_supervised_data_module
-from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
-from internnav.trainer.internvla_n1_argument import (
-    DataArguments,
-    ModelArguments,
-    TrainingArguments,
-)
-
-
-def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
-    """Collects the state dict and dump to disk."""
-
-    if trainer.deepspeed:
-        torch.cuda.synchronize()
-        trainer.save_model(output_dir)
-        return
-
-    state_dict = trainer.model.state_dict()
-    if trainer.args.should_save:
-        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
-        del state_dict
-        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
-
-
-def smart_tokenizer_and_embedding_resize(
-    special_tokens_dict: Dict,
-    tokenizer: transformers.PreTrainedTokenizer,
-    model: transformers.PreTrainedModel,
-):
-    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
-    model.resize_token_embeddings(len(tokenizer))
-
-    if num_new_tokens > 0:
-        input_embeddings = model.get_input_embeddings().weight.data
-        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
-        input_embeddings[-num_new_tokens:] = input_embeddings_avg
-
-
-def set_model(model_args, model):
-    if model_args.tune_mm_vision:
-        for n, p in model.visual.named_parameters():
-            p.requires_grad = True
-    else:
-        for n, p in model.visual.named_parameters():
-            p.requires_grad = False
-
-    if model_args.tune_mm_mlp:
-        for n, p in model.visual.merger.named_parameters():
-            p.requires_grad = True
-    else:
-        for n, p in model.visual.merger.named_parameters():
-            p.requires_grad = False
-
-    if model_args.tune_mm_llm:
-        for n, p in model.model.named_parameters():
-            p.requires_grad = True
-        model.lm_head.requires_grad = True
-    else:
-        for n, p in model.model.named_parameters():
-            p.requires_grad = False
-        # model.lm_head.requires_grad = False
-        for n, p in model.lm_head.named_parameters():
-            p.requires_grad = False
-
-    if 'nextdit' in model_args.system1:
-        modules = [
-            'action_encoder',
-            'action_decoder',
-            'traj_dit',
-            'cond_projector',
-            'memory_encoder',
-            'rgb_resampler',
-            'rgb_model',
-        ]
-        for n, p in model.model.named_parameters():
-            if any(k in n for k in modules):
-                p.requires_grad = True
-        model.model.latent_queries.requires_grad = True
-    elif 'navdp' in model_args.system1:
-        for n, p in model.model.navdp.named_parameters():
-            if "rgb_model" not in n:
-                p.requires_grad = True
-        model.model.latent_queries.requires_grad = True
-
-
-def train(attn_implementation="flash_attention_2"):
-    global local_rank
-
-    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    local_rank = training_args.local_rank
-    os.makedirs(training_args.output_dir, exist_ok=True)
-
-    if data_args.data_augmentation:
-        data_args.transform_train = v2.Compose(
-            [
-                v2.ToImage(),
-                v2.ColorJitter(brightness=0.2, saturation=0.2),
-                v2.RandomPosterize(bits=4),
-                v2.RandomAdjustSharpness(sharpness_factor=1.5),
-                v2.RandomAutocontrast(),
-                v2.ToPILImage(),
-                v2.Resize((data_args.resize_h, data_args.resize_w)),
-            ]
-        )
-    else:
-        data_args.transform_train = v2.Resize((data_args.resize_h, data_args.resize_w))
-
-    if 'internvla-n1-system2' in model_args.model_name_or_path.lower():
-        model = InternVLAN1ForCausalLM.from_pretrained(
-            model_args.model_name_or_path,
-            cache_dir=training_args.cache_dir,
-            attn_implementation=attn_implementation,
-            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
-        )
-        data_args.image_processor = AutoProcessor.from_pretrained(
-            model_args.model_name_or_path,
-        ).image_processor
-        data_args.model_type = "internvla-n1"
-    elif "qwen2.5" in model_args.model_name_or_path.lower():
-        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            model_args.model_name_or_path,
-            cache_dir=training_args.cache_dir,
-            attn_implementation=attn_implementation,
-            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
-        )
-        data_args.image_processor = AutoProcessor.from_pretrained(
-            model_args.model_name_or_path,
-        ).image_processor
-        data_args.model_type = "qwen2.5vl"
-    else:
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_args.model_name_or_path,
-            cache_dir=training_args.cache_dir,
-            attn_implementation=attn_implementation,
-            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
-        )
-        data_args.image_processor = Qwen2VLImageProcessor.from_pretrained(
-            model_args.model_name_or_path,
-        )
-        data_args.model_type = "qwen2vl"
-
-    if data_args.data_flatten:
-        replace_qwen2_vl_attention_class()
-    model.config.use_cache = False
-
-    if training_args.gradient_checkpointing:
-        if hasattr(model, "enable_input_require_grads"):
-            model.enable_input_require_grads()
-        else:
-
-            def make_inputs_require_grad(module, input, output):
-                output.requires_grad_(True)
-
-            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-
-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path,
-        cache_dir=training_args.cache_dir,
-        model_max_length=training_args.model_max_length,
-        padding_side="right",
-        use_fast=False,
-    )
-
-    if data_args.model_type == "internvla-n1":
-        model.get_model().initialize_vision_modules(model_args=model_args)
-    set_model(model_args, model)
-
-    if torch.distributed.get_rank() == 0:
-        model.visual.print_trainable_parameters()
-        model.model.print_trainable_parameters()
-
-    if data_args.data_packing:
-        data_module = make_supervised_data_module_packed(tokenizer=tokenizer, data_args=data_args)  # noqa: F821
-    else:
-        data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
-    trainer = Trainer(model=model, processing_class=tokenizer, args=training_args, **data_module)
-    from tabulate import tabulate
-
-    if trainer.is_world_process_zero():
-        stat = []
-        for i, (n, p) in enumerate(trainer.model.named_parameters()):
-            stat.append([i, n, p.shape, p.requires_grad])
-        print(tabulate(stat, headers=["idx", "name", "shape", "trainable"]))
-    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
-        logging.info("checkpoint found, resume training")
-        trainer.train(resume_from_checkpoint=True)
-    else:
-        trainer.train()
-    trainer.save_state()
-    data_args.image_processor.save_pretrained(training_args.output_dir)
-
-    model.config.use_cache = True
-
-    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
-
-
-if __name__ == "__main__":
-    train(attn_implementation="flash_attention_2")
diff --git a/scripts/train/qwenvl_train/train_system2_vlln.sh b/scripts/train/qwenvl_train/train_system2_vlln.sh
index a1a8760e..ccc82aee 100644
--- a/scripts/train/qwenvl_train/train_system2_vlln.sh
+++ b/scripts/train/qwenvl_train/train_system2_vlln.sh
@@ -27,7 +27,7 @@ max_pixels=313600
 min_pixels=3136
 
 # Dataset configuration (replace with public dataset names)
-vln_datasets=iion_split1,iion_split2 #,iion_split3
+iion_datasets=iion_split1,iion_split2 #,iion_split3
 
 # Output configuration
 run_name=InternVLA-N1-System2
@@ -38,7 +38,7 @@ srun torchrun --nnodes=$SLURM_NNODES --nproc_per_node=8 \
     internnav/trainer/internvla_vlln_trainer.py \
     --deepspeed ${deepspeed} \
     --model_name_or_path "${llm}" \
-    --vln_dataset_use ${vln_datasets} \
+    --iion_dataset_use ${iion_datasets} \
     --data_flatten False \
     --tune_mm_vision True \
     --tune_mm_mlp True \

From 5afb2270d7ade3269f06e961022a624b7c46c0d4 Mon Sep 17 00:00:00 2001
From: zhushaohao <shaohao9.zhu@gmail.com>
Date: Wed, 17 Dec 2025 16:01:24 +0800
Subject: [PATCH 4/9] address issues raised by kellyiss and kew6688

---
 .../dataset/internvla_n1_lerobot_dataset.py      |  8 +++++++-
 internnav/dataset/vlln_lerobot_dataset.py        | 16 ----------------
 scripts/train/qwenvl_train/train_system2_vlln.sh |  2 +-
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/internnav/dataset/internvla_n1_lerobot_dataset.py b/internnav/dataset/internvla_n1_lerobot_dataset.py
index 025ed383..92599657 100644
--- a/internnav/dataset/internvla_n1_lerobot_dataset.py
+++ b/internnav/dataset/internvla_n1_lerobot_dataset.py
@@ -1330,6 +1330,13 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         return batch
 
 class CombineDataset(Dataset):
+    """
+    Combine multiple datasets into a single dataset interface.
+
+    This class is used to merge different datasets for joint training.
+    It concatenates samples from all provided datasets and optionally shuffles
+    the global index mapping (without changing the underlying datasets).
+    """
     def __init__(self, datasets, shuffle=False):
         super(CombineDataset, self).__init__()
         self.datasets = datasets
@@ -1366,7 +1373,6 @@ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, dat
     if data_args.vln_dataset_use:
         train_datasets.append(NavPixelGoalDataset(tokenizer=tokenizer, data_args=data_args))
     train_dataset = CombineDataset(train_datasets, shuffle=False)
-    # train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_args=data_args)
     if data_args.data_flatten:
         data_collator = FlattenedDataCollatorForSupervisedDataset(tokenizer=tokenizer)
         return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index e1455cee..c75c4735 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -749,21 +749,5 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         return batch
 
 
-def make_supervised_data_module_vlln(
-    tokenizer: transformers.PreTrainedTokenizer, data_args
-) -> Dict:
-    """Make dataset and collator for supervised fine-tuning."""
-    train_dataset = VLLN_Dataset(tokenizer=tokenizer, data_args=data_args)
-    if data_args.data_flatten:
-        data_collator = FlattenedDataCollatorForSupervisedDataset(tokenizer=tokenizer)
-        return dict(
-            train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator
-        )
-    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
-    return dict(
-        train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator
-    )
-
-
 if __name__ == "__main__":
     pass
diff --git a/scripts/train/qwenvl_train/train_system2_vlln.sh b/scripts/train/qwenvl_train/train_system2_vlln.sh
index ccc82aee..2ac79998 100644
--- a/scripts/train/qwenvl_train/train_system2_vlln.sh
+++ b/scripts/train/qwenvl_train/train_system2_vlln.sh
@@ -30,7 +30,7 @@ min_pixels=3136
 iion_datasets=iion_split1,iion_split2 #,iion_split3
 
 # Output configuration
-run_name=InternVLA-N1-System2
+run_name=InternVLA-N1-vlln
 output_dir=checkpoints/${run_name}
 
 srun torchrun --nnodes=$SLURM_NNODES --nproc_per_node=8 \

From 2ff6105b49ba413fca35f6d5522f73596a8d3d6b Mon Sep 17 00:00:00 2001
From: DuangZhu <shaohao9.zhu@gmail.com>
Date: Thu, 18 Dec 2025 11:43:07 +0800
Subject: [PATCH 5/9] address issues raised by Tai-Wang

---
 internnav/dataset/dataset_utils.py            |  12 +
 .../dataset/internvla_n1_lerobot_dataset.py   |  19 +-
 internnav/dataset/vlln_lerobot_dataset.py     | 679 +++++++-----------
 3 files changed, 278 insertions(+), 432 deletions(-)
 create mode 100644 internnav/dataset/dataset_utils.py

diff --git a/internnav/dataset/dataset_utils.py b/internnav/dataset/dataset_utils.py
new file mode 100644
index 00000000..a8465b9c
--- /dev/null
+++ b/internnav/dataset/dataset_utils.py
@@ -0,0 +1,12 @@
+import json
+import re
+
+def parse_sampling_rate(dataset_name):
+    match = re.search(r"%(\d+)$", dataset_name)
+    if match:
+        return int(match.group(1)) / 100.0
+    return 1.0
+
+def read_jsonl(path):
+    with open(path, "r") as f:
+        return [json.loads(line) for line in f]
\ No newline at end of file
diff --git a/internnav/dataset/internvla_n1_lerobot_dataset.py b/internnav/dataset/internvla_n1_lerobot_dataset.py
index 92599657..c001d3aa 100644
--- a/internnav/dataset/internvla_n1_lerobot_dataset.py
+++ b/internnav/dataset/internvla_n1_lerobot_dataset.py
@@ -16,6 +16,7 @@
 from torch.utils.data import Dataset
 from torchcodec.decoders import VideoDecoder
 from transformers.image_utils import to_numpy_array
+from .dataset_utils import parse_sampling_rate, read_jsonl
 from .vlln_lerobot_dataset import VLLN_Dataset
 from .rope2d import get_rope_index_2, get_rope_index_25
 
@@ -143,13 +144,6 @@
 }
 
 
-def parse_sampling_rate(dataset_name):
-    match = re.search(r"%(\d+)$", dataset_name)
-    if match:
-        return int(match.group(1)) / 100.0
-    return 1.0
-
-
 def data_list(dataset_names):
     config_list = []
     for dataset_name in dataset_names:
@@ -180,11 +174,6 @@ def rank0_print(*args):
         print(*args)
 
 
-def read_jsonl(path):
-    with open(path, "r") as f:
-        return [json.loads(line) for line in f]
-
-
 def preprocess_qwen_2_visual(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
@@ -1329,7 +1318,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
 
         return batch
 
-class CombineDataset(Dataset):
+class CombinedDataset(Dataset):
     """
     Combine multiple datasets into a single dataset interface.
 
@@ -1338,7 +1327,7 @@ class CombineDataset(Dataset):
     the global index mapping (without changing the underlying datasets).
     """
     def __init__(self, datasets, shuffle=False):
-        super(CombineDataset, self).__init__()
+        super(CombinedDataset, self).__init__()
         self.datasets = datasets
         self.lengths = [len(dataset) for dataset in datasets]
         self.cum_lengths = np.cumsum(self.lengths)
@@ -1372,7 +1361,7 @@ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, dat
         train_datasets.append(VLLN_Dataset(tokenizer=tokenizer, data_args=data_args))
     if data_args.vln_dataset_use:
         train_datasets.append(NavPixelGoalDataset(tokenizer=tokenizer, data_args=data_args))
-    train_dataset = CombineDataset(train_datasets, shuffle=False)
+    train_dataset = CombinedDataset(train_datasets, shuffle=False)
     if data_args.data_flatten:
         data_collator = FlattenedDataCollatorForSupervisedDataset(tokenizer=tokenizer)
         return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index c75c4735..db5c784e 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -17,8 +17,8 @@
 from torchcodec.decoders import VideoDecoder
 from transformers.image_utils import to_numpy_array
 from bisect import bisect_left
-
 from .rope2d import get_rope_index_2, get_rope_index_25
+from .dataset_utils import parse_sampling_rate, read_jsonl
 
 # Define placeholders for dataset paths
 IION_split1 = {
@@ -48,12 +48,247 @@
     "iion_split3": IION_split3,
 }
 
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = 151655
+VIDEO_TOKEN_INDEX = 151656
+TRAJ_TOKEN_INDEX = 151667
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+_ORACLE_BLOCK = re.compile(r'<\|oracle\|>.*?<\|dialog_end\|>', re.DOTALL)
+
+local_rank = None
+
+
+class VLLN_Dataset(Dataset):
+    """
+    Dataset for Vision Language-Language Navigation (VL-LN) / IION-style training.
+    """
+    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
+        super(VLLN_Dataset, self).__init__()
+        dataset = data_args.iion_dataset_use.split(",")
+        dataset_list = data_list(dataset)
+        rank0_print(f"Loading datasets: {dataset_list}")
+        self.video_max_total_pixels = getattr(
+            data_args, "video_max_total_pixels", 1664 * 28 * 28
+        )
+        self.video_min_total_pixels = getattr(
+            data_args, "video_min_total_pixels", 256 * 28 * 28
+        )
+        self.model_type = data_args.model_type
+        if data_args.model_type == "qwen2.5vl":
+            self.get_rope_index = get_rope_index_25
+        else:
+            self.get_rope_index = get_rope_index_2
+        
+        self.sample_step = data_args.sample_step
+        self.predict_step_num = data_args.predict_step_num
+        self.pixel_goal_only = data_args.pixel_goal_only
+        self.num_future_steps = data_args.num_future_steps
+        self.max_dialog_turns = data_args.max_dialog_turns
+
+        self.list_data_dict = []
+
+        for data in dataset_list:
+            sampling_rate = data.get("sampling_rate", 1.0)
+            height = data.get("height", None)
+            pitch_1 = data.get("pitch_1", None)
+            pitch_2 = data.get("pitch_2", None)
+            
+            data_path = data['data_path']
+            
+            annotations = get_annotations_from_lerobot_data(data_path, pitch_1, pitch_2, height)
+
+            pixel_goal_list = []
+            turn_list = []
+            stop_list = []
+            list_data_dict = []
+            dialog_list = []
+            
+
+            for ep_id, item in enumerate(annotations['episodes']):
+                ep_id = item['id']
+                instruction = item['instructions']
+                video = item['video']
+                dialogs = item['dialogs']
+                dialogs, dia_idx = sort_dialogs_by_true_idx(dialogs)
+                actions = item['actions'][1:] + [0]
+                pixel_goals = item['pixel_goals']
+                poses = item[f'poses_{height}cm_{pitch_1}deg']
+                
+                actions_len = len(actions)
+                if actions_len < 4:
+                    continue
+        
+                num_rounds = actions_len // self.sample_step
+                for n in range(num_rounds+1):
+                    if n * self.sample_step == actions_len or n * self.sample_step == actions_len - 1:
+                        continue
+                    start_frame_id = n * self.sample_step
+                    action_flag = actions[start_frame_id]
+                    pixel_goal = pixel_goals[start_frame_id]
+                    history_dialogs = get_history_dialogs(start_frame_id, dialogs, dia_idx)
+                    if pixel_goal[0]==-1:
+                        if action_flag == 1:
+                            continue
+                        else:
+                            turn_actions = get_turn_actions(actions, start_frame_id, self.num_future_steps)
+                            turn_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, turn_actions, None, history_dialogs, None))
+                    else:
+                        goal_len = pixel_goal[0]
+                        action = pixel_goal[1]
+                        pose = poses[start_frame_id:start_frame_id+goal_len]
+                        pixel_goal_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, action, pose, history_dialogs, None))
+                stop_frame = actions_len - 1
+                stop_history = get_history_dialogs(stop_frame, dialogs, dia_idx)
+                stop_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, actions_len-1, 0, None, stop_history, None))
+                for n in range(len(dia_idx)):
+                    start_frame_id = dia_idx[n]
+                    action = actions[start_frame_id : start_frame_id + self.num_future_steps]
+                    history_dialogs = get_history_dialogs(start_frame_id, dialogs, dia_idx)
+                    current_dialog = [sentence for sentence in dialogs if sentence['true_idx'] == start_frame_id]
+                    if action[0] == 1:
+                        pixel_goal = pixel_goals[start_frame_id]
+                        if pixel_goal[0] != -1:
+                            goal_len = pixel_goal[0]
+                            pose = poses[start_frame_id:start_frame_id+goal_len]
+                            dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, pixel_goal[1], pose, history_dialogs, current_dialog))
+                        else:
+                            continue
+                    elif action[0] == 0:
+                        dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, 0, None, history_dialogs, current_dialog))
+                    else:
+                        turn_actions = get_turn_actions(actions, start_frame_id, self.num_future_steps)
+                        dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, turn_actions, None, history_dialogs, current_dialog))
+
+            list_data_dict = pixel_goal_list
+            rank0_print(len(turn_list), len(pixel_goal_list), len(stop_list), len(dialog_list))
+            if not self.pixel_goal_only:
+                list_data_dict += turn_list
+                list_data_dict += stop_list * 10
+                list_data_dict += dialog_list * 10
+            if sampling_rate < 1.0:
+                list_data_dict = random.sample(
+                    list_data_dict, int(len(list_data_dict) * sampling_rate)
+                )
+                print(f"sampling {len(list_data_dict)} examples from dataset {data}")
+            else:
+                rank0_print(f"dataset name: {data}")
+                
+            self.list_data_dict.extend(list_data_dict)
+
+        self.num_history = data_args.num_history
+        self.idx2actions = {
+            0: 'STOP',
+            1: "↑",
+            2: "←",
+            3: "→",
+            5: "↓"
+        }
+        self.conjunctions = [
+            'you can see ',
+            'in front of you is ',
+            'there is ',
+            'you can spot ',
+            'you are toward the ',
+            'ahead of you is ',
+            'in your sight is '
+        ]
+        self.data_args = data_args
+        self.tokenizer = tokenizer
+    
+    def __len__(self):
+        return len(self.list_data_dict)   
+    
+    def process_image_unified(self, image):
+        processor = copy.deepcopy(self.data_args.image_processor)
+
+        visual_processed = processor.preprocess(image, return_tensors="pt")
+        image_tensor = visual_processed["pixel_values"]
+        if isinstance(image_tensor, List):
+            image_tensor = image_tensor[0]
+        grid_thw = visual_processed["image_grid_thw"][0]
+        return image_tensor, grid_thw
+    
+    def __getitem__(self, i):
+        ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, action, pose, history_dialogs, current_dialog = self.list_data_dict[i]
+        dialogs_id = np.array([dialog['true_idx'] for dialog in history_dialogs])[::2]
+        if start_frame_id != 0:
+            history_id = np.unique(np.concatenate([np.linspace(0, start_frame_id-1, self.num_history, dtype=np.int32),dialogs_id])).tolist()
+        else:
+            history_id = []
+        
+        images = []
+        grid_thws = []
+
+        for id in range(0, start_frame_id + 1):
+            image_file = os.path.join(video, f"observation.images.rgb.{height}cm_{pitch_1}deg", f"episode_{ep_id:06d}_{id}.jpg")
+            if id in history_id or id == start_frame_id:
+                image = Image.open(image_file).convert('RGB')  
+                lookdown_image = Image.open(image_file.replace(f'_{pitch_1}deg',f'_{pitch_2}deg')).convert('RGB')
+                if self.data_args.transform_train is not None:
+                    image = self.data_args.transform_train(image)  
+                image, grid_thw = self.process_image_unified(image)
+                images.append(image)
+                grid_thws.append(grid_thw)
+                if id == start_frame_id and pose is not None: 
+                    image, grid_thw = self.process_image_unified(lookdown_image)
+                    images.append(image)
+                    grid_thws.append(grid_thw)
+        
+        if history_dialogs:
+            history_imgs = build_dialog_history(history_id, dialogs_id, history_dialogs)
+        else:
+            history_imgs = "<image>\n"*len(history_id)
+        
+        if start_frame_id != 0:
+            chat_sources = [[{'from': 'human', 'value': f"You are an autonomous navigation assistant. Your task is to <instruction> There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 30 degrees. Please output STOP when you have successfully completed the task. These are your historical observations: <history>. {random.choice(self.conjunctions)}<image>."}]]
+            chat_sources[0][0]['value'] = chat_sources[0][0]['value'].replace('<instruction>', instruction).replace('<history>', history_imgs)
+        else:
+            chat_sources = [[{'from': 'human', 'value': f"You are an autonomous navigation assistant. Your task is to <instruction> There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 30 degrees. Please output STOP when you have successfully completed the task. {random.choice(self.conjunctions)}<image>."}]]
+            chat_sources[0][0]['value'] = chat_sources[0][0]['value'].replace('<instruction>', instruction)
+        
+        if current_dialog is not None:
+            for turn in range(len(current_dialog) // 2):
+                chat_sources[0].extend([{'from': 'gpt', 'value': '<talk>' + current_dialog[2*turn]['message']}])
+                chat_sources[0].extend([{'from': 'human', 'value': current_dialog[2*turn+1]['message']}])
+
+        if pose is not None:
+            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + self.idx2actions[5]}, {'from': 'human', 'value': f'{random.choice(self.conjunctions)}<image>.'}, {'from': 'gpt', 'value': '<move>' + f'{action[0]} {action[1]}'}])
+        elif action == 0:
+            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + self.idx2actions[action]}])
+        else:
+            turn_action_text = ''.join([self.idx2actions[idx] for idx in action])
+            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + turn_action_text}])
+        chat_sources = enforce_simple_limit(chat_sources, limit = random.randint(0, self.max_dialog_turns))
+        
+        grid_thw_merged = copy.deepcopy(grid_thws)
+        
+        if not isinstance(grid_thws, Sequence):
+            grid_thw_merged = [grid_thw_merged]
+            grid_thws = [grid_thws]
 
-def parse_sampling_rate(dataset_name):
-    match = re.search(r"%(\d+)$", dataset_name)
-    if match:
-        return int(match.group(1)) / 100.0
-    return 1.0
+        grid_thw_merged = [
+            merged_thw.prod() // self.data_args.image_processor.merge_size**2
+            for merged_thw in grid_thw_merged
+        ]
+        data_dict = preprocess_qwen_2_visual(
+            chat_sources,
+            self.tokenizer,
+            grid_thw_image=grid_thw_merged if grid_thw_merged else None,
+        )
+    
+        position_ids, _ = self.get_rope_index(
+            self.data_args.image_processor.merge_size,
+            data_dict["input_ids"],
+            image_grid_thw=torch.stack(grid_thws, dim=0) if grid_thws else None,
+        )
+        
+        data_dict["position_ids"] = position_ids
+        data_dict["attention_mask"] = [data_dict["input_ids"][0].size(0)]
+        data_dict["pixel_values"] = torch.cat(images, dim=0)
+        data_dict["image_grid_thw"] = torch.cat([thw.unsqueeze(0) for thw in grid_thws], dim=0)
+        
+        return data_dict
 
 
 def data_list(dataset_names):
@@ -70,27 +305,11 @@ def data_list(dataset_names):
     return config_list
 
 
-IGNORE_INDEX = -100
-IMAGE_TOKEN_INDEX = 151655
-VIDEO_TOKEN_INDEX = 151656
-TRAJ_TOKEN_INDEX = 151667
-DEFAULT_IMAGE_TOKEN = "<image>"
-DEFAULT_VIDEO_TOKEN = "<video>"
-_ORACLE_BLOCK = re.compile(r'<\|oracle\|>.*?<\|dialog_end\|>', re.DOTALL)
-
-local_rank = None
-
-
 def rank0_print(*args):
     if local_rank == 0:
         print(*args)
 
 
-def read_jsonl(path):
-    with open(path, "r") as f:
-        return [json.loads(line) for line in f]
-
-
 def preprocess_qwen_2_visual(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
@@ -186,16 +405,11 @@ def preprocess_qwen_2_visual(
         labels=targets,
     )
 
-def clip_or_pad(arr, fixed_len):
-    T, D = arr.shape
-    if T >= fixed_len:
-        return arr[:fixed_len]
-    else:
-        pad = np.zeros((fixed_len - T, D), dtype=arr.dtype)
-        return np.concatenate([arr, pad], axis=0)
-
 
 def get_annotations_from_lerobot_data(data_path, pitch_1, pitch_2, height):
+    """
+    Load LeRobot-style dataset and convert it into a unified annotations dict.
+    """
     import pyarrow.parquet as pq
     import pandas as pd
     from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -266,7 +480,11 @@ def process_scene(scene_id):
 
     return annotations
 
+
 def get_turn_actions(actions, start_frame_id, num_future_steps):
+    """
+    Return the longest prefix of future actions that are identical to the first action.
+    """
     if not (0 <= start_frame_id < len(actions)):
         return []
     s = actions[start_frame_id : start_frame_id + num_future_steps]
@@ -275,6 +493,9 @@ def get_turn_actions(actions, start_frame_id, num_future_steps):
     return s[:i]
 
 def sort_dialogs_by_true_idx(dialogs):
+    """
+    Sort dialog messages by their true_idx in pairs.
+    """
     groups = []
     i, n = 0, len(dialogs)
     while i < n:
@@ -298,6 +519,7 @@ def group_key(g):
 
     return sorted_dialogs, unique_true_idx
 
+
 def get_history_dialogs(start_frame_id, dialogs, dia_idx):
     i = bisect_left(dia_idx, start_frame_id) 
     if i != 0:
@@ -305,7 +527,12 @@ def get_history_dialogs(start_frame_id, dialogs, dia_idx):
     else:
         return []
 
+
 def build_dialog_history(history_id, dialog_id, dialogs):
+    """
+    Build a serialized string that interleaves visual placeholders (<image>) with
+    dialog blocks (<|dialog_start|>...<|dialog_end|>) aligned to history frames.
+    """
     placeholder = [''] * (len(history_id)+1)
     for n in dialog_id:
         pos = history_id.index(n)
@@ -317,9 +544,13 @@ def build_dialog_history(history_id, dialog_id, dialogs):
     placeholder = ('<image>\n').join(placeholder)
     return placeholder
 
+
 def enforce_simple_limit(conv, limit,
     sorry_msg: str = "Sorry, you have reached the question limit. No further answers are available."):
-
+    """
+    Truncate / limit the number of answer-like items (oracle blocks and talk-human pairs)
+    by replacing extra parts with a fixed apology message.
+    """
     conv = [dict(m) for m in conv[0]]  
     answer_indices = []
     replaced_indices = []
@@ -362,392 +593,6 @@ def _repl(m):
 
     return [conv]
 
-class VLLN_Dataset(Dataset):
-    def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
-        super(VLLN_Dataset, self).__init__()
-        dataset = data_args.iion_dataset_use.split(",")
-        dataset_list = data_list(dataset)
-        rank0_print(f"Loading datasets: {dataset_list}")
-        self.video_max_total_pixels = getattr(
-            data_args, "video_max_total_pixels", 1664 * 28 * 28
-        )
-        self.video_min_total_pixels = getattr(
-            data_args, "video_min_total_pixels", 256 * 28 * 28
-        )
-        self.model_type = data_args.model_type
-        if data_args.model_type == "qwen2.5vl":
-            self.get_rope_index = get_rope_index_25
-        else:
-            self.get_rope_index = get_rope_index_2
-        
-        self.sample_step = data_args.sample_step
-        self.predict_step_num = data_args.predict_step_num
-        self.pixel_goal_only = data_args.pixel_goal_only
-        self.num_future_steps = data_args.num_future_steps
-        self.max_dialog_turns = data_args.max_dialog_turns
-
-        self.list_data_dict = []
-
-        for data in dataset_list:
-            sampling_rate = data.get("sampling_rate", 1.0)
-            height = data.get("height", None)
-            pitch_1 = data.get("pitch_1", None)
-            pitch_2 = data.get("pitch_2", None)
-            
-            data_path = data['data_path']
-            
-            annotations = get_annotations_from_lerobot_data(data_path, pitch_1, pitch_2, height)
-
-            pixel_goal_list = []
-            turn_list = []
-            stop_list = []
-            list_data_dict = []
-            dialog_list = []
-            
-
-            for ep_id, item in enumerate(annotations['episodes']):
-                ep_id = item['id']
-                instruction = item['instructions']
-                video = item['video']
-                dialogs = item['dialogs']
-                dialogs, dia_idx = sort_dialogs_by_true_idx(dialogs)
-                actions = item['actions'][1:] + [0]
-                pixel_goals = item['pixel_goals']
-                poses = item[f'poses_{height}cm_{pitch_1}deg']
-                
-                actions_len = len(actions)
-                if actions_len < 4:
-                    continue
-        
-                num_rounds = actions_len // self.sample_step
-                for n in range(num_rounds+1):
-                    if n * self.sample_step == actions_len or n * self.sample_step == actions_len - 1:
-                        continue
-                    start_frame_id = n * self.sample_step
-                    action_flag = actions[start_frame_id]
-                    pixel_goal = pixel_goals[start_frame_id]
-                    history_dialogs = get_history_dialogs(start_frame_id, dialogs, dia_idx)
-                    if pixel_goal[0]==-1:
-                        if action_flag == 1:
-                            continue
-                        else:
-                            turn_actions = get_turn_actions(actions, start_frame_id, self.num_future_steps)
-                            turn_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, turn_actions, None, history_dialogs, None))
-                    else:
-                        goal_len = pixel_goal[0]
-                        action = pixel_goal[1]
-                        pose = poses[start_frame_id:start_frame_id+goal_len]
-                        pixel_goal_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, action, pose, history_dialogs, None))
-                stop_frame = actions_len - 1
-                stop_history = get_history_dialogs(stop_frame, dialogs, dia_idx)
-                stop_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, actions_len-1, 0, None, stop_history, None))
-                for n in range(len(dia_idx)):
-                    start_frame_id = dia_idx[n]
-                    action = actions[start_frame_id : start_frame_id + self.num_future_steps]
-                    history_dialogs = get_history_dialogs(start_frame_id, dialogs, dia_idx)
-                    current_dialog = [sentence for sentence in dialogs if sentence['true_idx'] == start_frame_id]
-                    if action[0] == 1:
-                        pixel_goal = pixel_goals[start_frame_id]
-                        if pixel_goal[0] != -1:
-                            goal_len = pixel_goal[0]
-                            pose = poses[start_frame_id:start_frame_id+goal_len]
-                            dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, pixel_goal[1], pose, history_dialogs, current_dialog))
-                        else:
-                            continue
-                    elif action[0] == 0:
-                        dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, 0, None, history_dialogs, current_dialog))
-                    else:
-                        turn_actions = get_turn_actions(actions, start_frame_id, self.num_future_steps)
-                        dialog_list.append((ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, turn_actions, None, history_dialogs, current_dialog))
-
-            list_data_dict = pixel_goal_list
-            rank0_print(len(turn_list), len(pixel_goal_list), len(stop_list), len(dialog_list))
-            if not self.pixel_goal_only:
-                list_data_dict += turn_list
-                list_data_dict += stop_list * 10
-                list_data_dict += dialog_list * 10
-            if sampling_rate < 1.0:
-                list_data_dict = random.sample(
-                    list_data_dict, int(len(list_data_dict) * sampling_rate)
-                )
-                print(f"sampling {len(list_data_dict)} examples from dataset {data}")
-            else:
-                rank0_print(f"dataset name: {data}")
-                
-            self.list_data_dict.extend(list_data_dict)
-
-        self.num_history = data_args.num_history
-        self.idx2actions = {
-            0: 'STOP',
-            1: "↑",
-            2: "←",
-            3: "→",
-            5: "↓"
-        }
-        self.conjunctions = [
-            'you can see ',
-            'in front of you is ',
-            'there is ',
-            'you can spot ',
-            'you are toward the ',
-            'ahead of you is ',
-            'in your sight is '
-        ]
-        self.data_args = data_args
-        self.tokenizer = tokenizer
-    
-    def __len__(self):
-        return len(self.list_data_dict)   
-    
-    def process_image_unified(self, image):
-        processor = copy.deepcopy(self.data_args.image_processor)
-
-        visual_processed = processor.preprocess(image, return_tensors="pt")
-        image_tensor = visual_processed["pixel_values"]
-        if isinstance(image_tensor, List):
-            image_tensor = image_tensor[0]
-        grid_thw = visual_processed["image_grid_thw"][0]
-        return image_tensor, grid_thw
-    
-    def __getitem__(self, i):
-        ep_id, data_path, video, height, pitch_1, pitch_2, instruction, start_frame_id, action, pose, history_dialogs, current_dialog = self.list_data_dict[i]
-        dialogs_id = np.array([dialog['true_idx'] for dialog in history_dialogs])[::2]
-        if start_frame_id != 0:
-            history_id = np.unique(np.concatenate([np.linspace(0, start_frame_id-1, self.num_history, dtype=np.int32),dialogs_id])).tolist()
-        else:
-            history_id = []
-        
-        images = []
-        grid_thws = []
-
-        for id in range(0, start_frame_id + 1):
-            image_file = os.path.join(video, f"observation.images.rgb.{height}cm_{pitch_1}deg", f"episode_{ep_id:06d}_{id}.jpg")
-            if id in history_id or id == start_frame_id:
-                image = Image.open(image_file).convert('RGB')  
-                lookdown_image = Image.open(image_file.replace(f'_{pitch_1}deg',f'_{pitch_2}deg')).convert('RGB')
-                if self.data_args.transform_train is not None:
-                    image = self.data_args.transform_train(image)  
-                image, grid_thw = self.process_image_unified(image)
-                images.append(image)
-                grid_thws.append(grid_thw)
-                if id == start_frame_id and pose is not None: 
-                    image, grid_thw = self.process_image_unified(lookdown_image)
-                    images.append(image)
-                    grid_thws.append(grid_thw)
-        
-        if history_dialogs:
-            history_imgs = build_dialog_history(history_id, dialogs_id, history_dialogs)
-        else:
-            history_imgs = "<image>\n"*len(history_id)
-        
-        if start_frame_id != 0:
-            chat_sources = [[{'from': 'human', 'value': f"You are an autonomous navigation assistant. Your task is to <instruction> There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 30 degrees. Please output STOP when you have successfully completed the task. These are your historical observations: <history>. {random.choice(self.conjunctions)}<image>."}]]
-            chat_sources[0][0]['value'] = chat_sources[0][0]['value'].replace('<instruction>', instruction).replace('<history>', history_imgs)
-        else:
-            chat_sources = [[{'from': 'human', 'value': f"You are an autonomous navigation assistant. Your task is to <instruction> There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 30 degrees. Please output STOP when you have successfully completed the task. {random.choice(self.conjunctions)}<image>."}]]
-            chat_sources[0][0]['value'] = chat_sources[0][0]['value'].replace('<instruction>', instruction)
-        
-        if current_dialog is not None:
-            for turn in range(len(current_dialog) // 2):
-                chat_sources[0].extend([{'from': 'gpt', 'value': '<talk>' + current_dialog[2*turn]['message']}])
-                chat_sources[0].extend([{'from': 'human', 'value': current_dialog[2*turn+1]['message']}])
-
-        if pose is not None:
-            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + self.idx2actions[5]}, {'from': 'human', 'value': f'{random.choice(self.conjunctions)}<image>.'}, {'from': 'gpt', 'value': '<move>' + f'{action[0]} {action[1]}'}])
-        elif action == 0:
-            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + self.idx2actions[action]}])
-        else:
-            turn_action_text = ''.join([self.idx2actions[idx] for idx in action])
-            chat_sources[0].extend([{'from': 'gpt', 'value': '<move>' + turn_action_text}])
-        chat_sources = enforce_simple_limit(chat_sources, limit = random.randint(0, self.max_dialog_turns))
-        
-        grid_thw_merged = copy.deepcopy(grid_thws)
-        
-        if not isinstance(grid_thws, Sequence):
-            grid_thw_merged = [grid_thw_merged]
-            grid_thws = [grid_thws]
-
-        grid_thw_merged = [
-            merged_thw.prod() // self.data_args.image_processor.merge_size**2
-            for merged_thw in grid_thw_merged
-        ]
-        data_dict = preprocess_qwen_2_visual(
-            chat_sources,
-            self.tokenizer,
-            grid_thw_image=grid_thw_merged if grid_thw_merged else None,
-        )
-    
-        position_ids, _ = self.get_rope_index(
-            self.data_args.image_processor.merge_size,
-            data_dict["input_ids"],
-            image_grid_thw=torch.stack(grid_thws, dim=0) if grid_thws else None,
-        )
-        
-        data_dict["position_ids"] = position_ids
-        data_dict["attention_mask"] = [data_dict["input_ids"][0].size(0)]
-        data_dict["pixel_values"] = torch.cat(images, dim=0)
-        data_dict["image_grid_thw"] = torch.cat([thw.unsqueeze(0) for thw in grid_thws], dim=0)
-        
-        return data_dict
-
-
-def pad_and_cat(tensor_list):
-    max_length = max(tensor.shape[2] for tensor in tensor_list)
-
-    padded_tensors = []
-    for tensor in tensor_list:
-        pad_length = max_length - tensor.shape[2]
-        padded_tensor = torch.nn.functional.pad(tensor, (0, pad_length), "constant", 1)
-        padded_tensors.append(padded_tensor)
-
-    stacked_tensor = torch.cat(padded_tensors, dim=1)
-
-    return stacked_tensor
-
-
-@dataclass
-class DataCollatorForSupervisedDataset(object):
-    """Collate examples for supervised fine-tuning."""
-
-    tokenizer: transformers.PreTrainedTokenizer
-
-    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-        input_ids, labels, position_ids = tuple(
-            [instance[key] for instance in instances]
-            for key in ("input_ids", "labels", "position_ids")
-        )
-        input_ids = [ids.squeeze(0) for ids in input_ids]
-        labels = [ids.squeeze(0) for ids in labels]
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
-        )
-        labels = torch.nn.utils.rnn.pad_sequence(
-            labels, batch_first=True, padding_value=IGNORE_INDEX
-        )
-        position_ids = pad_and_cat(position_ids)
-        input_ids = input_ids[:, : self.tokenizer.model_max_length]
-        labels = labels[:, : self.tokenizer.model_max_length]
-        position_ids = position_ids[:, :, : self.tokenizer.model_max_length]
-        batch = dict(
-            input_ids=input_ids,
-            labels=labels,
-            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
-        )
-        images = list(
-            instance["pixel_values"]
-            for instance in instances
-            if "pixel_values" in instance
-        )
-        videos = list(
-            instance["pixel_values_videos"]
-            for instance in instances
-            if "pixel_values_videos" in instance
-        )
-        if len(images) != 0:
-            concat_images = torch.cat([image for image in images], dim=0)
-            grid_thw = [
-                instance["image_grid_thw"]
-                for instance in instances
-                if "image_grid_thw" in instance
-            ]
-            grid_thw = torch.cat(grid_thw, dim=0)
-        else:
-            concat_images = None
-            grid_thw = None
-
-        if len(videos) != 0:
-            concat_videos = torch.cat([video for video in videos], dim=0)
-            video_grid_thw = [
-                instance["video_grid_thw"]
-                for instance in instances
-                if "video_grid_thw" in instance
-            ]
-            video_grid_thw = torch.cat(video_grid_thw, dim=0)
-        else:
-            concat_videos = None
-            video_grid_thw = None
-
-        batch["pixel_values"] = concat_images
-        batch["image_grid_thw"] = grid_thw
-        batch["pixel_values_videos"] = concat_videos
-        batch["video_grid_thw"] = video_grid_thw
-        batch["position_ids"] = position_ids
-        return batch
-
-
-@dataclass
-class FlattenedDataCollatorForSupervisedDataset(DataCollatorForSupervisedDataset):
-    """Collate examples into packed sequence with multi-modal support."""
-
-    tokenizer: transformers.PreTrainedTokenizer
-
-    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-        input_ids, labels, position_ids, attention_mask = tuple(
-            [instance[key] for instance in instances]
-            for key in ("input_ids", "labels", "position_ids", "attention_mask")
-        )
-        attention_mask = list(
-            itertools.chain(
-                *(
-                    instance["attention_mask"]
-                    for instance in instances
-                    if "attention_mask" in instance
-                )
-            )
-        )
-        seq_lens = torch.tensor([0] + attention_mask, dtype=torch.int32)
-        cumsum_seq_lens = torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
-        input_ids = torch.cat(input_ids, dim=1)
-        labels = torch.cat(labels, dim=1)
-        position_ids = torch.cat(position_ids, dim=2)
-
-        batch = dict(
-            input_ids=input_ids,
-            labels=labels,
-            attention_mask=cumsum_seq_lens,
-            position_ids=position_ids,
-        )
-        images = list(
-            instance["pixel_values"]
-            for instance in instances
-            if "pixel_values" in instance
-        )
-        videos = list(
-            instance["pixel_values_videos"]
-            for instance in instances
-            if "pixel_values_videos" in instance
-        )
-        if len(images) != 0:
-            concat_images = torch.cat([image for image in images], dim=0)
-            grid_thw = [
-                instance["image_grid_thw"]
-                for instance in instances
-                if "image_grid_thw" in instance
-            ]
-            grid_thw = torch.cat(grid_thw, dim=0)
-        else:
-            concat_images = None
-            grid_thw = None
-
-        if len(videos) != 0:
-            concat_videos = torch.cat([video for video in videos], dim=0)
-            video_grid_thw = [
-                instance["video_grid_thw"]
-                for instance in instances
-                if "video_grid_thw" in instance
-            ]
-            video_grid_thw = torch.cat(video_grid_thw, dim=0)
-        else:
-            concat_videos = None
-            video_grid_thw = None
-
-        batch["pixel_values"] = concat_images
-        batch["image_grid_thw"] = grid_thw
-        batch["pixel_values_videos"] = concat_videos
-        batch["video_grid_thw"] = video_grid_thw
-
-        return batch
-
 
 if __name__ == "__main__":
     pass

From 4cff66c8b7e2de272ace9246e635379de539613f Mon Sep 17 00:00:00 2001
From: DuangZhu <shaohao9.zhu@gmail.com>
Date: Fri, 19 Dec 2025 13:23:58 +0800
Subject: [PATCH 6/9] (1) Remove `dataset_utils.py`. (2) Add standard
 docstrings to the main class and key functions.

---
 internnav/dataset/dataset_utils.py            |  12 --
 .../dataset/internvla_n1_lerobot_dataset.py   |  14 ++-
 internnav/dataset/vlln_lerobot_dataset.py     | 103 +++++++++++++++---
 3 files changed, 103 insertions(+), 26 deletions(-)
 delete mode 100644 internnav/dataset/dataset_utils.py

diff --git a/internnav/dataset/dataset_utils.py b/internnav/dataset/dataset_utils.py
deleted file mode 100644
index a8465b9c..00000000
--- a/internnav/dataset/dataset_utils.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import json
-import re
-
-def parse_sampling_rate(dataset_name):
-    match = re.search(r"%(\d+)$", dataset_name)
-    if match:
-        return int(match.group(1)) / 100.0
-    return 1.0
-
-def read_jsonl(path):
-    with open(path, "r") as f:
-        return [json.loads(line) for line in f]
\ No newline at end of file
diff --git a/internnav/dataset/internvla_n1_lerobot_dataset.py b/internnav/dataset/internvla_n1_lerobot_dataset.py
index c001d3aa..f0897ddf 100644
--- a/internnav/dataset/internvla_n1_lerobot_dataset.py
+++ b/internnav/dataset/internvla_n1_lerobot_dataset.py
@@ -16,7 +16,7 @@
 from torch.utils.data import Dataset
 from torchcodec.decoders import VideoDecoder
 from transformers.image_utils import to_numpy_array
-from .dataset_utils import parse_sampling_rate, read_jsonl
+
 from .vlln_lerobot_dataset import VLLN_Dataset
 from .rope2d import get_rope_index_2, get_rope_index_25
 
@@ -144,6 +144,18 @@
 }
 
 
+def parse_sampling_rate(dataset_name):
+    match = re.search(r"%(\d+)$", dataset_name)
+    if match:
+        return int(match.group(1)) / 100.0
+    return 1.0
+
+
+def read_jsonl(path):
+    with open(path, "r") as f:
+        return [json.loads(line) for line in f]
+
+
 def data_list(dataset_names):
     config_list = []
     for dataset_name in dataset_names:
diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index db5c784e..45d88dba 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -18,7 +18,7 @@
 from transformers.image_utils import to_numpy_array
 from bisect import bisect_left
 from .rope2d import get_rope_index_2, get_rope_index_25
-from .dataset_utils import parse_sampling_rate, read_jsonl
+
 
 # Define placeholders for dataset paths
 IION_split1 = {
@@ -62,6 +62,20 @@
 class VLLN_Dataset(Dataset):
     """
     Dataset for Vision Language-Language Navigation (VL-LN) / IION-style training.
+    
+    Args:
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to encode
+            the chat template and produce `input_ids` / `labels`.
+        data_args: A config-like object that must provide at least:
+            - iion_dataset_use (str): comma-separated dataset names, optionally
+              with sampling rate suffix like `iion_split1%50`.
+            - model_type (str): decides which rope-index function to use.
+            - sample_step (int): stride for sampling start frames.
+            - pixel_goal_only (bool): whether to keep only pixel-goal samples.
+            - num_future_steps (int): horizon for turn-action extraction.
+            - max_dialog_turns (int): max number of answers the agent can get from oracle.
+            - num_history (int): number of history frames in prompt.
+
     """
     def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
         super(VLLN_Dataset, self).__init__()
@@ -81,7 +95,6 @@ def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
             self.get_rope_index = get_rope_index_2
         
         self.sample_step = data_args.sample_step
-        self.predict_step_num = data_args.predict_step_num
         self.pixel_goal_only = data_args.pixel_goal_only
         self.num_future_steps = data_args.num_future_steps
         self.max_dialog_turns = data_args.max_dialog_turns
@@ -291,6 +304,18 @@ def __getitem__(self, i):
         return data_dict
 
 
+def parse_sampling_rate(dataset_name):
+    match = re.search(r"%(\d+)$", dataset_name)
+    if match:
+        return int(match.group(1)) / 100.0
+    return 1.0
+
+
+def read_jsonl(path):
+    with open(path, "r") as f:
+        return [json.loads(line) for line in f]
+
+
 def data_list(dataset_names):
     config_list = []
     for dataset_name in dataset_names:
@@ -316,6 +341,24 @@ def preprocess_qwen_2_visual(
     grid_thw_image: List = [],
     grid_thw_video: List = [],
 ) -> Dict:
+    """Tokenize multi-modal chat sources for Qwen2.5-VL style training.
+
+    Args:
+        sources (list): Conversation sources. Expected structure is a list of
+            conversations, where each conversation is a list of dict messages.
+            The dict keys may be either:
+            - {"from": "human"/"gpt", "value": "..."}, or
+            - {"role": "user"/"assistant", "value": "..."}
+        tokenizer (transformers.PreTrainedTokenizer): Tokenizer instance.
+        grid_thw_image (List[int]): For each "<image>" placeholder, provides the
+            number of visual tokens (after merge) to replicate `<|image_pad|>`.
+        grid_thw_video (List[int]): Same as above for "<video>".
+
+    Returns:
+        Dict[str, torch.Tensor]:
+            - input_ids: LongTensor of shape [B, L]
+            - labels: LongTensor of shape [B, L]
+    """
     roles = {"human": "user", "gpt": "assistant"}
     system_message = "You are a helpful assistant."
 
@@ -407,8 +450,29 @@ def preprocess_qwen_2_visual(
 
 
 def get_annotations_from_lerobot_data(data_path, pitch_1, pitch_2, height):
-    """
-    Load LeRobot-style dataset and convert it into a unified annotations dict.
+    """Load LeRobot-format dataset and convert it into unified annotations.
+
+    It scans scene directories under `data_path`, and for each scene:
+    - Reads `meta/episodes.jsonl` to get episode metadata, instructions, dialogs.
+    - Reads `data/chunk-xxx/episode_XXXXXX.parquet` to get actions, poses, goals.
+    - Constructs a unified dict `annotations` with an `episodes` list.
+
+    The output `annotations["episodes"]` items include:
+    - id, video, instructions, actions, length
+    - poses for both horizon and look-down settings
+    - pixel_goals in `[relative_goal_frame_id, goal]` format
+    - dialogs (list)
+
+    Args:
+        data_path (str): Root directory containing multiple scene folders.
+        pitch_1 (int): Horizon camera pitch (e.g., 0).
+        pitch_2 (int): Look-down camera pitch (e.g., 30).
+        height (int): Camera height in centimeters (e.g., 125).
+
+    Returns:
+        dict: A dict with keys:
+            - axis_align_matrix (List[List[float]]): identity by default
+            - episodes (List[dict]): unified episode entries
     """
     import pyarrow.parquet as pq
     import pandas as pd
@@ -482,9 +546,6 @@ def process_scene(scene_id):
 
 
 def get_turn_actions(actions, start_frame_id, num_future_steps):
-    """
-    Return the longest prefix of future actions that are identical to the first action.
-    """
     if not (0 <= start_frame_id < len(actions)):
         return []
     s = actions[start_frame_id : start_frame_id + num_future_steps]
@@ -492,10 +553,8 @@ def get_turn_actions(actions, start_frame_id, num_future_steps):
     i = next((k for k, x in enumerate(s) if x != first), len(s))
     return s[:i]
 
+
 def sort_dialogs_by_true_idx(dialogs):
-    """
-    Sort dialog messages by their true_idx in pairs.
-    """
     groups = []
     i, n = 0, len(dialogs)
     while i < n:
@@ -532,6 +591,14 @@ def build_dialog_history(history_id, dialog_id, dialogs):
     """
     Build a serialized string that interleaves visual placeholders (<image>) with
     dialog blocks (<|dialog_start|>...<|dialog_end|>) aligned to history frames.
+
+    Args:
+        history_id (List[int]): History frame ids (sorted/unique).
+        dialog_id (Sequence[int]): Frame indices that have dialogs (true_idx).
+        dialogs (List[dict]): Dialog messages.
+
+    Returns:
+        str: Serialized history string aligned to `history_id`.
     """
     placeholder = [''] * (len(history_id)+1)
     for n in dialog_id:
@@ -547,9 +614,19 @@ def build_dialog_history(history_id, dialog_id, dialogs):
 
 def enforce_simple_limit(conv, limit,
     sorry_msg: str = "Sorry, you have reached the question limit. No further answers are available."):
-    """
-    Truncate / limit the number of answer-like items (oracle blocks and talk-human pairs)
-    by replacing extra parts with a fixed apology message.
+    """Limit the number of answer-like parts in a conversation.
+
+    This function truncates answer-like content beyond a given `limit`.
+    Extra units beyond `limit` are replaced by a fixed `sorry_msg`.
+
+    Args:
+        conv (list): A single conversation packed as `[conv0]`, where `conv0`
+            is a list of message dicts using keys `from/value`.
+        limit (int): Maximum number of answer-like units to keep.
+        sorry_msg (str): Replacement message inserted for truncated content.
+
+    Returns:
+        list: The updated conversation in the same format as input, i.e. `[conv0]`.
     """
     conv = [dict(m) for m in conv[0]]  
     answer_indices = []

From ab8d90b7141527266cf36ee0541c1c8f735c3a5e Mon Sep 17 00:00:00 2001
From: DuangZhu <shaohao9.zhu@gmail.com>
Date: Fri, 19 Dec 2025 15:59:47 +0800
Subject: [PATCH 7/9] Document grid_thw axes and drop the empty __main__ stub
 (issue from Tai-Wang)

---
 internnav/dataset/vlln_lerobot_dataset.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index 45d88dba..8991e83f 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -352,8 +352,11 @@ def preprocess_qwen_2_visual(
         tokenizer (transformers.PreTrainedTokenizer): Tokenizer instance.
         grid_thw_image (List[int]): For each "<image>" placeholder, provides the
             number of visual tokens (after merge) to replicate `<|image_pad|>`.
+            Here "thw" refers to the visual token grid shape:
+            - t: temporal length in the visual grid
+            - h: grid height (number of patch rows)
+            - w: grid width (number of patch columns)
         grid_thw_video (List[int]): Same as above for "<video>".
-
     Returns:
         Dict[str, torch.Tensor]:
             - input_ids: LongTensor of shape [B, L]
@@ -668,8 +671,4 @@ def _repl(m):
                 conv[human_idx]['value'] = sorry_msg
                 replaced_indices.append(('more', human_idx))
 
-    return [conv]
-
-
-if __name__ == "__main__":
-    pass
+    return [conv]
\ No newline at end of file

From 30533795d4d968c8d52b06a9010018019535baa3 Mon Sep 17 00:00:00 2001
From: DuangZhu <shaohao9.zhu@gmail.com>
Date: Mon, 22 Dec 2025 10:13:27 +0800
Subject: [PATCH 8/9] Rename VLLN_Dataset to VLLNDataset (issue from Tai-Wang)

---
 internnav/dataset/internvla_n1_lerobot_dataset.py | 4 ++--
 internnav/dataset/vlln_lerobot_dataset.py         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/internnav/dataset/internvla_n1_lerobot_dataset.py b/internnav/dataset/internvla_n1_lerobot_dataset.py
index f0897ddf..52981731 100644
--- a/internnav/dataset/internvla_n1_lerobot_dataset.py
+++ b/internnav/dataset/internvla_n1_lerobot_dataset.py
@@ -17,7 +17,7 @@
 from torchcodec.decoders import VideoDecoder
 from transformers.image_utils import to_numpy_array
 
-from .vlln_lerobot_dataset import VLLN_Dataset
+from .vlln_lerobot_dataset import VLLNDataset
 from .rope2d import get_rope_index_2, get_rope_index_25
 
 # Define placeholders for dataset paths
@@ -1370,7 +1370,7 @@ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, dat
     """Make dataset and collator for supervised fine-tuning."""
     train_datasets = []
     if data_args.iion_dataset_use:
-        train_datasets.append(VLLN_Dataset(tokenizer=tokenizer, data_args=data_args))
+        train_datasets.append(VLLNDataset(tokenizer=tokenizer, data_args=data_args))
     if data_args.vln_dataset_use:
         train_datasets.append(NavPixelGoalDataset(tokenizer=tokenizer, data_args=data_args))
     train_dataset = CombinedDataset(train_datasets, shuffle=False)
diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index 8991e83f..684a9103 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -59,7 +59,7 @@
 local_rank = None
 
 
-class VLLN_Dataset(Dataset):
+class VLLNDataset(Dataset):
     """
     Dataset for Vision Language-Language Navigation (VL-LN) / IION-style training.
     
@@ -78,7 +78,7 @@ class VLLN_Dataset(Dataset):
 
     """
     def __init__(self, tokenizer: transformers.PreTrainedTokenizer, data_args):
-        super(VLLN_Dataset, self).__init__()
+        super(VLLNDataset, self).__init__()
         dataset = data_args.iion_dataset_use.split(",")
         dataset_list = data_list(dataset)
         rank0_print(f"Loading datasets: {dataset_list}")

From d69e59f4e13a7be8ceaf2bb3886f14fd38063fbb Mon Sep 17 00:00:00 2001
From: DuangZhu <shaohao9.zhu@gmail.com>
Date: Mon, 22 Dec 2025 10:22:58 +0800
Subject: [PATCH 9/9] refine the docstring

---
 internnav/dataset/vlln_lerobot_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py
index 684a9103..fa670502 100644
--- a/internnav/dataset/vlln_lerobot_dataset.py
+++ b/internnav/dataset/vlln_lerobot_dataset.py
@@ -61,7 +61,7 @@
 
 class VLLNDataset(Dataset):
     """
-    Dataset for Vision Language-Language Navigation (VL-LN) / IION-style training.
+    Dataset for 'Vision-Language'-'Language-Navigation' (VL-LN) / IION-style training.
     
     Args:
         tokenizer (transformers.PreTrainedTokenizer): Tokenizer used to encode