diff --git a/README.md b/README.md index f8cdfd91..e46c75a3 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,10 @@ The toolbox supports the most advanced high-quality navigation dataset, InternDa - [🏠 Introduction](#-introduction) - [πŸ”₯ News](#-news) - [πŸ“š Getting Started](#-getting-started) -- [πŸ“¦ Overview of Benchmark \& Model Zoo](#-overview-of-benchmark-and-model-zoo) +- [πŸ“¦ Overview of Benchmark \& Model Zoo](#-overview) - [πŸ”§ Customization](#-customization) - [πŸ‘₯ Contribute](#-contribute) +- [πŸš€ Community Deployment & Best Practices](#-community-deployment--best-practices) - [πŸ”— Citation](#-citation) - [πŸ“„ License](#-license) - [πŸ‘ Acknowledgements](#-acknowledgements) @@ -213,6 +214,23 @@ For example, raising issues, fixing bugs in the framework, and adapting or addin **Note:** We welcome the feedback of the model's zero-shot performance when deploying in your own environment. Please show us your results and offer us your future demands regarding the model's capability. We will select the most valuable ones and collaborate with users together to solve them in the next few months :) +## πŸš€ Community Deployment & Best Practices + +We are excited to see InternNav being deployed and extended by the community across different robots and real-world scenarios. +Below are selected community-driven deployment guides and solution write-ups, which may serve as practical references for advanced users. + +- **IROS Challenge Nav Track: Champion Solution (2025)** + A complete system-level solution and design analysis for Vision-and-Language Navigation in Physical Environments. + πŸ”— https://zhuanlan.zhihu.com/p/1969046543286907790 + +- **Go2 Series Deployment Tutorial (ShanghaiTech University)** + Step-by-step edge deployment guide for InternNav-based perception and navigation. + πŸ”— https://github.com/cmjang/InternNav-deploy + +- **G1 Series Deployment Tutorial (Wuhan University)** + Detailed educational materials on vision-language navigation deployment. 
+ πŸ”— [*Chapter 5: Vision-Language Navigation (Part II)*](https://mp.weixin.qq.com/s/p3cJzbRvecMajiTh9mXoAw) + ## πŸ”— Citation If you find our work helpful, please cite: diff --git a/internnav/agent/dialog_agent.py b/internnav/agent/dialog_agent.py index 6d189227..8eec7a92 100644 --- a/internnav/agent/dialog_agent.py +++ b/internnav/agent/dialog_agent.py @@ -11,21 +11,16 @@ import quaternion import torch from PIL import Image, ImageDraw -from transformers import ( - AutoProcessor, - AutoTokenizer, - Qwen2_5_VLForConditionalGeneration, -) from internnav.agent import Agent from internnav.configs.agent import AgentCfg try: - pass -except Exception as e: - print(f"Warning: ({e}), Ignore this if not using dual_system.") - -try: + from transformers import ( + AutoProcessor, + AutoTokenizer, + Qwen2_5_VLForConditionalGeneration, + ) from depth_camera_filtering import filter_depth from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower except Exception as e: diff --git a/internnav/dataset/vlln_lerobot_dataset.py b/internnav/dataset/vlln_lerobot_dataset.py index fa670502..f57622ad 100644 --- a/internnav/dataset/vlln_lerobot_dataset.py +++ b/internnav/dataset/vlln_lerobot_dataset.py @@ -22,21 +22,21 @@ # Define placeholders for dataset paths IION_split1 = { - "data_path": "traj_data/mp3d_split1", + "data_path": "projects/VL-LN-Bench/traj_data/mp3d_split1", "height": 125, "pitch_1": 0, "pitch_2": 30, } IION_split2 = { - "data_path": "traj_data/mp3d_split2", + "data_path": "projects/VL-LN-Bench/traj_data/mp3d_split2", "height": 125, "pitch_1": 0, "pitch_2": 30, } IION_split3 = { - "data_path": "traj_data/mp3d_split3", + "data_path": "projects/VL-LN-Bench/traj_data/mp3d_split3", "height": 125, "pitch_1": 0, "pitch_2": 30, diff --git a/internnav/evaluator/utils/result_logger.py b/internnav/evaluator/utils/result_logger.py index e3826a6d..cfc2193b 100644 --- a/internnav/evaluator/utils/result_logger.py +++ b/internnav/evaluator/utils/result_logger.py @@ -319,5 +319,5 @@ def finalize_all_results(self, rank, world_size): } # write log content to file - with open(f"{self.name}_result.json", "w") as f: + with open(f"{PROJECT_ROOT_PATH}/logs/{self.name}/result.json", "w") as f: json.dump(json_data, f, indent=2, ensure_ascii=False) diff --git a/internnav/habitat_extensions/vlln/README.md b/internnav/habitat_extensions/vlln/README.md index 755278ae..3440c854 100644 --- a/internnav/habitat_extensions/vlln/README.md +++ b/internnav/habitat_extensions/vlln/README.md @@ -3,16 +3,34 @@ Vision-Language-and-Language Navigation (VL-LN) is a new [benchmark](https://0309hws.github.io/VL-LN.github.io/) built upon VLN in Habitat, which refers to the setting that models take the vision and language as input and output language and navigation actions. In contrast to VLN, where agents only take navigation actions, agents in VL-LN could ask questions and engage in dialogue with users to complete tasks better with language interaction. This package adapts [Meta AI Habitat](https://aihabitat.org) for VL-LN within InternNav. It wraps Habitat environments that expose semantic masks, registers dialog-aware datasets and metrics, and provides evaluators that coordinate agent actions, NPC interactions, and logging. +Install our benchmark [dataset](https://huggingface.co/datasets/InternRobotics/VL-LN-Bench) and the [latest checkpoints](https://huggingface.co/InternRobotics/VL-LN-Bench-basemodel) from HuggingFace. 
+Place the downloaded benchmark under `InternNav/projects/VL-LN-Bench` to match the default path expected by the code.
+
 ## Package structure
 
 ```
-habitat_vlln_extensions/
-β”œβ”€β”€ __init__.py
-β”œβ”€β”€ habitat_dialog_evaluator.py
-β”œβ”€β”€ habitat_vlln_env.py
-β”œβ”€β”€ measures.py
-β”œβ”€β”€ simple_npc/
-└── utils/
+InternNav
+β”œβ”€β”€ assets/
+β”œβ”€β”€ internnav/
+β”‚   β”œβ”€β”€ habitat_vlln_extensions
+β”‚   β”‚   β”œβ”€β”€ simple_npc
+β”‚   β”‚   β”‚   β”œβ”€β”€ api_key.txt
+β”‚   β”‚   β”œβ”€β”€ measures.py
+β”‚   β”‚   β”œβ”€β”€ habitat_dialog_evaluator.py
+β”‚   β”‚   β”œβ”€β”€ habitat_vlln_env.py
+β”‚   ...   ...   ...
+...
+β”œβ”€β”€ projects
+β”‚   β”œβ”€β”€ VL-LN-Bench/
+β”‚   β”‚   β”œβ”€β”€ base_model/
+β”‚   β”‚   β”œβ”€β”€ raw_data/
+β”‚   β”‚   β”œβ”€β”€ scene_datasets/
+β”‚   β”‚   β”‚   └── mp3d/
+β”‚   β”‚   β”‚       └── 17DRP5sb8fy/
+β”‚   β”‚   β”‚       β”œβ”€β”€ 1LXtFkjw3qL/
+β”‚   β”‚   β”‚       ...
+β”‚   β”‚   β”œβ”€β”€ traj_data/
+...
 ```
 * `__init__.py` re-exports the public entry points so callers can import
diff --git a/internnav/habitat_extensions/vlln/habitat_dialog_evaluator.py b/internnav/habitat_extensions/vlln/habitat_dialog_evaluator.py
index 91c001a6..798e2399 100644
--- a/internnav/habitat_extensions/vlln/habitat_dialog_evaluator.py
+++ b/internnav/habitat_extensions/vlln/habitat_dialog_evaluator.py
@@ -257,6 +257,13 @@ def calc_metrics(self, global_metrics: dict) -> dict:
 
         # avoid /0 if no episodes
         denom = max(len(sucs_all), 1)
+        # clean NaN in spls, treat as 0.0
+        torch.nan_to_num(spls_all, nan=0.0, posinf=0.0, neginf=0.0, out=spls_all)
+
+        # clean inf in nes, only finite nes are counted
+        nes_finite_mask = torch.isfinite(nes_all)
+        nes_all = nes_all[nes_finite_mask]
+
         return {
             "sucs_all": float(sucs_all.mean().item()) if denom > 0 else 0.0,
             "spls_all": float(spls_all.mean().item()) if denom > 0 else 0.0,
diff --git a/internnav/habitat_extensions/vln/README.md b/internnav/habitat_extensions/vln/README.md
index 41dc2009..f4b0b9eb 100644
--- a/internnav/habitat_extensions/vln/README.md
+++ b/internnav/habitat_extensions/vln/README.md
@@ -9,7 +9,7 @@ utilities.
 ## Package structure
 
 ```
-habitat_extensions/
+habitat_extensions/vln/
 β”œβ”€β”€ __init__.py
 β”œβ”€β”€ habitat_env.py
 β”œβ”€β”€ habitat_default_evaluator.py
diff --git a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
index d2417430..293045a7 100644
--- a/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
+++ b/internnav/habitat_extensions/vln/habitat_vln_evaluator.py
@@ -192,7 +192,7 @@ def eval_action(self):
             "nes": nes,  # shape [N_local]
         }
 
-        if ndtws:
+        if ndtws is not None:
             result["ndtws"] = ndtws  # shape [N_local]
 
         return result
@@ -207,6 +207,13 @@ def calc_metrics(self, global_metrics: dict) -> dict:
 
         # avoid /0 if no episodes
         denom = max(len(sucs_all), 1)
+
+        # clean NaN in spls, treat as 0.0
+        torch.nan_to_num(spls_all, nan=0.0, posinf=0.0, neginf=0.0, out=spls_all)
+
+        # clean inf in nes, only finite nes are counted
+        nes_finite_mask = torch.isfinite(nes_all)
+        nes_all = nes_all[nes_finite_mask]
 
         result_all = {
             "sucs_all": float(sucs_all.mean().item()) if denom > 0 else 0.0,
@@ -587,7 +594,7 @@ def _run_eval_dual_system(self) -> tuple:
             torch.tensor(spls).to(self.device),
             torch.tensor(oss).to(self.device),
             torch.tensor(nes).to(self.device),
-            torch.tensor(ndtw).to(self.device) if 'ndtw' in metrics else None,
+            torch.tensor(ndtw).to(self.device) if ndtw else None,
         )
 
     def _run_eval_system2(self) -> tuple:
@@ -876,5 +883,5 @@ def _run_eval_system2(self) -> tuple:
             torch.tensor(spls).to(self.device),
             torch.tensor(oss).to(self.device),
             torch.tensor(nes).to(self.device),
-            torch.tensor(ndtw).to(self.device) if 'ndtw' in metrics else None,
+            torch.tensor(ndtw).to(self.device) if ndtw else None,
         )
diff --git a/scripts/demo/navigation_ui.py b/scripts/demo/navigation_ui.py
deleted file mode 100644
index 2f13bfda..00000000
--- a/scripts/demo/navigation_ui.py
+++ /dev/null
@@ -1,540 +0,0 @@
-import json
-import os
-import time
-import uuid
-from collections import defaultdict
-from datetime import datetime, timedelta
-from typing import Optional
-
-import gradio as gr
-import requests
-
-BACKEND_URL = os.getenv("BACKEND_URL", "http://localhost:8001")  # fastapi server
-API_ENDPOINTS = {
-    "submit_task": f"{BACKEND_URL}/predict/video",
-    "query_status": f"{BACKEND_URL}/predict/task",
-    "get_result": f"{BACKEND_URL}/predict",
-}
-
-
-SCENE_CONFIGS = {
-    "scene_1": {
-        "description": "Modern Apartment",
-        "name": "17DRP5sb8fy",
-        "glb_path": "scene_assets/scene1_no_ceiling.glb",  # PLY file path
-    },
-    "scene_2": {
-        "description": "Office Building",
-        "name": "r1Q1Z4BcV1o",
-        "glb_path": "scene_assets/scene2_no_ceiling.glb",
-    },
-    "scene_3": {
-        "description": "University Campus",
-        "name": "dhjEzFoUFzH",
-        "glb_path": "scene_assets/scene3_no_ceiling.glb",
-    },
-}
-
-EPISODE_CONFIGS = {
-    "episode_1": {
-        "description": "1",
-    },
-    "episode_2": {
-        "description": "2",
-    },
-    "episode_3": {
-        "description": "3",
-    },
-    "episode_4": {
-        "description": "4",
-    },
-}
-
-MODEL_CHOICES = []
-
-
-###############################################################################
-
-SESSION_TASKS = {}
-IP_REQUEST_RECORDS = defaultdict(list)
-IP_LIMIT = 5
-
-
-def is_request_allowed(ip: str) -> bool:
-    now = datetime.now()
-    IP_REQUEST_RECORDS[ip] = [t for t in IP_REQUEST_RECORDS[ip] if now - t < timedelta(minutes=1)]
-    if len(IP_REQUEST_RECORDS[ip]) < IP_LIMIT:
-        IP_REQUEST_RECORDS[ip].append(now)
-        return True
-    return False
-
-
-###############################################################################
-
-
-# Log directory path
-LOG_DIR = "~/logs" -os.makedirs(LOG_DIR, exist_ok=True) -ACCESS_LOG = os.path.join(LOG_DIR, "access.log") -SUBMISSION_LOG = os.path.join(LOG_DIR, "submissions.log") - - -def log_access(user_ip: str = None, user_agent: str = None): - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - log_entry = { - "timestamp": timestamp, - "type": "access", - "user_ip": user_ip or "unknown", - "user_agent": user_agent or "unknown", - } - - with open(ACCESS_LOG, "a") as f: - f.write(json.dumps(log_entry) + "\n") - - -def log_submission(scene: str, prompt: str, model: str, user: str = "anonymous", res: str = "unknown"): - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - log_entry = { - "timestamp": timestamp, - "type": "submission", - "user": user, - "scene": scene, - "prompt": prompt, - "model": model, - # "max_step": str(max_step), - "res": res, - } - - with open(SUBMISSION_LOG, "a") as f: - f.write(json.dumps(log_entry) + "\n") - - -def read_logs(log_type: str = "all", max_entries: int = 50) -> list: - logs = [] - - if log_type in ["all", "access"]: - try: - with open(ACCESS_LOG, "r") as f: - for line in f: - logs.append(json.loads(line.strip())) - except FileNotFoundError: - pass - - if log_type in ["all", "submission"]: - try: - with open(SUBMISSION_LOG, "r") as f: - for line in f: - logs.append(json.loads(line.strip())) - except FileNotFoundError: - pass - - # Sorted by timestamp - logs.sort(key=lambda x: x["timestamp"], reverse=True) - return logs[:max_entries] - - -def format_logs_for_display(logs: list) -> str: - if not logs: - return "No log record" - - markdown = "### System Access Log\n\n" - markdown += "| Time | Type | User/IP | Details |\n" - markdown += "|------|------|---------|----------|\n" - - for log in logs: - timestamp = log.get("timestamp", "unknown") - log_type = "Access" if log.get("type") == "access" else "Submission" - - if log_type == "Access": - user = log.get("user_ip", "unknown") - details = f"User-Agent: {log.get('user_agent', 'unknown')}" - else: - user = log.get("user", "anonymous") - result = log.get('res', 'unknown') - if result != "success": - if len(result) > 40: # Adjust this threshold as needed - result = f"{result[:20]}...{result[-20:]}" - details = f"Scene: {log.get('scene', 'unknown')}, Prompt: {log.get('prompt', '')}, Model: {log.get('model', 'unknown')}, result: {result}" - - markdown += f"| {timestamp} | {log_type} | {user} | {details} |\n" - - return markdown - - -def submit_to_backend( - scene: str, - prompt: str, - episode: str, - user: str = "Gradio-user", -) -> dict: - job_id = str(uuid.uuid4()) - - scene_index = scene.split("_")[-1] - episode_index = episode.split("_")[-1] - - data = { - "task_type": "vln_eval", # Identify task type - "instruction": prompt, - "scene_index": scene_index, - "episode_index": episode_index, - } - - payload = {"user": user, "task": "robot_navigation", "job_id": job_id, "data": json.dumps(data)} - - try: - headers = {"Content-Type": "application/json"} - response = requests.post(API_ENDPOINTS["submit_task"], json=payload, headers=headers, timeout=600) - return response.json() - except Exception as e: - return {"status": "error", "message": str(e)} - - -def get_task_status(task_id: str) -> dict: - try: - response = requests.get(f"{API_ENDPOINTS['query_status']}/{task_id}", timeout=600) - try: - return response.json() - except json.JSONDecodeError: - return {"status": "error", "message": response.text} - except Exception as e: - return {"status": "error", "message": str(e)} - - -def get_task_result(task_id: str) 
-> Optional[dict]: - try: - response = requests.get(f"{API_ENDPOINTS['get_result']}/{task_id}", timeout=5) - return response.json() - except Exception as e: - print(f"Error fetching result: {e}") - return None - - -def run_simulation(scene: str, prompt: str, episode: str, history: list, request: gr.Request) -> dict: - model = "InternNav-VLA" - - timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - scene_desc = SCENE_CONFIGS.get(scene, {}).get("description", scene) - - user_ip = request.client.host if request else "unknown" - session_id = request.session_hash - - if not is_request_allowed(user_ip): - log_submission(scene, prompt, model, user_ip, "IP blocked temporarily") - raise gr.Error("Too many requests from this IP. Please wait and try again one minute later.") - - submission_result = submit_to_backend(scene, prompt, episode) - print("submission_result: ", submission_result) - - if submission_result.get("status") != "pending": - log_submission(scene, prompt, model, user_ip, "Submission failed") - raise gr.Error(f"Submission failed: {submission_result.get('message', 'unknown issue')}") - - try: - task_id = submission_result["task_id"] - SESSION_TASKS[session_id] = task_id - - gr.Info(f"Simulation started, task_id: {task_id}") - time.sleep(5) - # Get Task Status - status = get_task_status(task_id) - print("first status: ", status) - result_folder = status.get("result", "") - except Exception as e: - log_submission(scene, prompt, model, user_ip, str(e)) - raise gr.Error(f"error occurred when parsing submission result from backend: {str(e)}") - - while True: - status = get_task_status(task_id) - if status.get("status") == "completed": - break - elif status.get("status") == "failed": - break - time.sleep(1) - if status.get("status") == "completed": - import base64 - - video_bytes = base64.b64decode(status.get("video")) - receive_time = time.time() - with open(f"received_video_{receive_time}.mp4", "wb") as f: - f.write(video_bytes) - video_path = f"received_video_{receive_time}.mp4" - new_entry = {"timestamp": timestamp, "scene": scene, "model": model, "prompt": prompt, "video_path": video_path} - - updated_history = history + [new_entry] - - if len(updated_history) > 10: - updated_history = updated_history[:10] - - print("updated_history:", updated_history) - log_submission(scene, prompt, model, user_ip, "success") - gr.Info("Simulation completed successfully!") - yield video_path, updated_history - - elif status.get("status") == "failed": - log_submission(scene, prompt, model, user_ip, status.get('result', 'backend error')) - raise gr.Error(f"task execution fails: {status.get('result', 'backend error')}") - yield None, history - - elif status.get("status") == "terminated": - log_submission(scene, prompt, model, user_ip, "terminated") - video_path = os.path.join(result_folder, "output.mp4") - if os.path.exists(video_path): - return f" task {task_id} terminated with some results", video_path, history - else: - return f" task {task_id} terminated without any results", None, history - - else: - log_submission(scene, prompt, model, user_ip, "missing task's status from backend") - yield None, history - - -################################################################################################################### -def update_history_display(history: list) -> list: - print("update_history_display") - updates = [] - - for i in range(10): - if i < len(history): - entry = history[i] - updates.extend( - [ - gr.update(visible=True), - gr.update( - visible=True, - label=f"Simulation {i+1} 
scene: {entry['scene']}, prompt: {entry['prompt']}", - open=False, - ), - gr.update(value=entry['video_path'], visible=True), - gr.update(value=f"{entry['timestamp']}"), - ] - ) - print(f'update video') - print(entry['video_path']) - else: - updates.extend( - [ - gr.update(visible=False), - gr.update(visible=False), - gr.update(value=None, visible=False), - gr.update(value=""), - ] - ) - print("update_history_display end!!") - return updates - - -def update_scene_display(scene: str): - print(f"update_scene_display {scene}") - config = SCENE_CONFIGS.get(scene, {}) - glb_path = config.get("glb_path", "") - - # Validate if file path exists - if not os.path.exists(glb_path): - return None, None - - return None, glb_path - - -def update_episode_display(scene: str, episode: str): - print(f"update_episode_display {scene} {episode}") - config = SCENE_CONFIGS.get(scene, {}) - scene_name = config.get("name", "") - episode_id = int(episode[-1]) - image_path = os.path.join("scene_assets", f"{scene_name}_{episode_id-1}.jpg") - print(f"image_path {image_path}") - # valid if file path exists - if not os.path.exists(image_path): - return None - - return image_path - - -def update_log_display(): - logs = read_logs() - return format_logs_for_display(logs) - - -############################################################################## - - -def cleanup_session(request: gr.Request): - session_id = request.session_hash - task_id = SESSION_TASKS.pop(session_id, None) - if task_id: - try: - requests.post(f"{BACKEND_URL}/predict/terminate/{task_id}", timeout=3) - print(f"Task Terminated: {task_id}") - except Exception as e: - print(f"Task Termination Failed: {task_id}: {e}") - - -############################################################################### - -custom_css = """ -#simulation-panel { - border-radius: 8px; - padding: 20px; - background: #f9f9f9; - box-shadow: 0 2px 4px rgba(0,0,0,0.1); -} -#result-panel { - border-radius: 8px; - padding: 20px; - background: #f0f8ff; -} -.dark #simulation-panel { background: #2a2a2a; } -.dark #result-panel { background: #1a2a3a; } - -.history-container { - max-height: 600px; - overflow-y: auto; - margin-top: 20px; -} - -.history-accordion { - margin-bottom: 10px; -} - -.scene-preview { - height: 400px; - border: 1px solid #ddd; - border-radius: 8px; - overflow: hidden; -} -""" - -with gr.Blocks(title="Robot Navigation Inference", css=custom_css) as demo: - gr.Markdown( - """ - # 🧭 Habitat Robot Navigation Demo - ### Simulation Test Based on Habitat Framework - """ - ) - - history_state = gr.State([]) - - with gr.Row(): - with gr.Column(elem_id="simulation-panel"): - gr.Markdown("### Simulation Task Configuration") - with gr.Row(): - scene_dropdown = gr.Dropdown( - label="Select Scene", - choices=list(SCENE_CONFIGS.keys()), - value="scene_1", - interactive=True, - ) - episode_dropdown = gr.Dropdown( - label="Select Start Position", - choices=list(EPISODE_CONFIGS.keys()), - value="episode_1", - interactive=True, - ) - - with gr.Row(): - scene_preview = gr.Model3D( - elem_classes=["scene-preview"], - camera_position=(90.0, 120, 20000.0), - # display_mode="solid" - ) - fps_preview = gr.Image(label="FPS Preview") - - scene_description = gr.Markdown("### Scene preview") - - prompt_input = gr.Textbox( - label="Navigation Instruction", - value="Exit the bedroom and turn left. Walk straight passing the gray couch and stop near the rug.", - placeholder="e.g.: 'Exit the bedroom and turn left. 
Walk straight passing the gray couch and stop near the rug.'", - lines=2, - max_lines=4, - ) - - scene_dropdown.change( - update_scene_display, inputs=scene_dropdown, outputs=[scene_description, scene_preview] - ).then(update_episode_display, inputs=[scene_dropdown, episode_dropdown], outputs=[fps_preview]) - - episode_dropdown.change( - update_episode_display, inputs=[scene_dropdown, episode_dropdown], outputs=[fps_preview] - ) - - submit_btn = gr.Button("Start Navigation Simulation", variant="primary") - - with gr.Column(elem_id="result-panel"): - gr.Markdown("### Latest Simulation Result") - - # Video Output - video_output = gr.Video( - label="Live", - interactive=False, - format="mp4", - autoplay=True, - # streaming=True - ) - - with gr.Column() as history_container: - gr.Markdown("### History") - gr.Markdown("#### History will be reset after refresh") - - history_slots = [] - for i in range(10): - with gr.Column(visible=False) as slot: - with gr.Accordion(visible=False, open=False) as accordion: - video = gr.Video(interactive=False) - detail_md = gr.Markdown() - history_slots.append((slot, accordion, video, detail_md)) - - with gr.Accordion("View System Log (DEV ONLY)", open=False): - logs_display = gr.Markdown() - refresh_logs_btn = gr.Button("Refresh Log", variant="secondary") - - refresh_logs_btn.click(update_log_display, outputs=logs_display) - - gr.Examples( - examples=[ - [ - "scene_1", - "Exit the bedroom and turn left. Walk straight passing the gray couch and stop near the rug.", - "episode_0", - ], - ["scene_2", "Go from reception to conference room passing the water cooler.", "episode_1"], - ["scene_3", "From the classroom, go to the library via the main hall.", "episode_2"], - ["scene_4", "From emergency room to pharmacy passing nurse station.", "episode_3"], - ], - inputs=[scene_dropdown, prompt_input, episode_dropdown], - label="Navigation Task Example", - ) - - submit_btn.click( - fn=run_simulation, - inputs=[scene_dropdown, prompt_input, episode_dropdown, history_state], - outputs=[video_output, history_state], - queue=True, - api_name="run_simulation", - ).then( - fn=update_history_display, - inputs=history_state, - outputs=[comp for slot in history_slots for comp in slot], - queue=True, - ).then( - fn=update_log_display, - outputs=logs_display, - ) - - demo.load(fn=lambda: update_scene_display("scene_1"), outputs=[scene_description, scene_preview]).then( - fn=update_log_display, outputs=logs_display - ) - demo.load(fn=lambda: update_episode_display("scene_1", "episode_1"), outputs=[fps_preview]) - - def record_access(request: gr.Request): - user_ip = request.client.host if request else "unknown" - user_agent = request.headers.get("user-agent", "unknown") - log_access(user_ip, user_agent) - return update_log_display() - - demo.load(fn=record_access, inputs=None, outputs=logs_display, queue=False) - - demo.queue(default_concurrency_limit=8) - - demo.unload(fn=cleanup_session) - -if __name__ == "__main__": - demo.launch(server_name="0.0.0.0", server_port=5750, debug=True, share=True, allowed_paths=["/mnt"]) diff --git a/scripts/demo/vln_gradio_backend.py b/scripts/demo/vln_gradio_backend.py deleted file mode 100644 index efbdbb38..00000000 --- a/scripts/demo/vln_gradio_backend.py +++ /dev/null @@ -1,215 +0,0 @@ -import argparse -import base64 -import json -import os -import sys -import uuid - -# from utils.log_util import logger -from enum import Enum -from typing import Dict, Optional - -import numpy as np -import torch -import uvicorn -from fastapi import APIRouter, 
BackgroundTasks, FastAPI, HTTPException, status -from pydantic import BaseModel -from transformers import AutoProcessor - -from internnav.env.utils.habitat_extensions.evaluator_single import VLNEvaluator -from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM -from internnav.utils.dist import * - -PROJECT_ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(PROJECT_ROOT_PATH) -print(f"PROJECT_ROOT_PATH {PROJECT_ROOT_PATH}", flush=True) - - -global instruction - - -class VideoRequest(BaseModel): - """ - Frontend post json object template - """ - - user: str - task: str - job_id: str - data: str - - """ - data json template - Manipulation: - { - model_type: str, - instruction: str, - scene_type: str, - max_step: str - } - - Navigation: - { - model_type: str, - instruction: str, - episode_type: str - } - """ - - -class TaskInfo: - def __init__(self, task_id, status, result_path): - self.task_id: str = task_id - self.status: str = status - self.result_path: str = result_path - - -class TaskStatus(str, Enum): - pending = ('pending',) - completed = 'completed' - failed = 'failed' - terminated = 'terminated' - - -class BackendServer: - def __init__(self, host: str, port: int): - self.host = host - self.port = port - self.app = FastAPI(title='Backend Service') - self._router = APIRouter(prefix='/predict') - self._register_routes() - self.app.include_router(self._router) - - self.GPU_COUNT = torch.cuda.device_count() - self.tasks: Dict[str, TaskInfo] = {} - self.MAX_TASK_LIMIT = 48 - - def _register_routes(self): - route_config = [ - ('/video', self.predict, ['POST'], None), - ('/task/{task_id}', self.get_task_status, ['GET'], Dict[str, Optional[str]]), - ] - - for path, handler, methods, response_model in route_config: - self._router.add_api_route( - path=path, - endpoint=handler, - methods=methods, - response_model=response_model if 'GET' in methods else None, - status_code=status.HTTP_200_OK if 'GET' in methods else None, - ) - - async def predict(self, request: VideoRequest, background_tasks: BackgroundTasks) -> Dict[str, str]: - # Safety: allow tasks pending to MAX_TASK_LIMIT - # TODO: may need to improve - if sum(task.status == "pending" for task in self.tasks.values()) >= self.MAX_TASK_LIMIT: - print(f"Failed to START Task: reach to limit") - raise HTTPException(status_code=429, detail=f"Failed to start new task: reach to limit") - - task_id = str(uuid.uuid4()) - path = os.path.join(output_path, task_id) - print(f"Create new task: ID={task_id}, output path={path}") - self.tasks[task_id] = TaskInfo(task_id=task_id, status="pending", result_path=path) - - background_tasks.add_task(self._submit_task, task_id, request.data, path) - - print(f"Start Task: {task_id} for user: {request.user}, task: {request.task}") - - return {"task_id": task_id, "status": "pending"} - - def _submit_task(self, task_id: str, data: str, path: str): - - print(f"process task: ID={task_id}") - print(f"receive data: {data}...") # εͺ打印前100δΈͺ字符 - try: - data_dict = json.loads(data) - if data_dict.get("task_type") == "vln_eval": - print("=======VLN Eval Task=======") - cache_dir = f"/tmp/InternNav/.triton" - os.makedirs(cache_dir, exist_ok=True) - os.chmod(cache_dir, 0o777) - - evaluator.infer_scene_id = int(data_dict["scene_index"]) - 1 - evaluator.infer_episode_id = int(data_dict["episode_index"]) - 1 - evaluator.infer_instruction = data_dict["instruction"] - evaluator.output_path = path - evaluator.infer_data_ready = True - 
evaluator.run_single_eval() - - except Exception as e: - import traceback - - print(traceback.format_exc()) - self.tasks[task_id].status = "failed" - print(f"Task {task_id} failed: {e}") - - async def get_task_status(self, task_id: str) -> Dict[str, Optional[str]]: - print(f"call get_task_status") - task = self.tasks[task_id] - if not evaluator.infer_success: - return {"status": "pending", "result": task.result_path} - - video_path = os.path.join(task.result_path, f"res_{evaluator.infer_success_cnt}.mp4") - with open(video_path, 'rb') as f: - video_bytes = f.read() - video_data = base64.b64encode(video_bytes).decode("utf-8") - - return {"status": 'completed', "result": task.result_path, "video": video_data} - - def run(self): - uvicorn.run('__main__:server.app', host=self.host, port=self.port) - - -if __name__ == "__main__": - output_path = f"log/" - - parser = argparse.ArgumentParser() - parser.add_argument("--local_rank", default=0, type=int, help="node rank") - parser.add_argument("--model_path", type=str, default="checkpoints/InternVLA-N1") - parser.add_argument("--habitat_config_path", type=str, default='scripts/eval/configs/vln_r2r.yaml') - parser.add_argument("--eval_split", type=str, default='train') - parser.add_argument("--output_path", type=str, default='./exps_pix/val_unseen/debug_coord_wm') - parser.add_argument("--num_future_steps", type=int, default=4) - parser.add_argument("--num_frames", type=int, default=32) - parser.add_argument("--save_video", action="store_true", default=True) - parser.add_argument("--num_history", type=int, default=8) - parser.add_argument("--resize_w", type=int, default=384) - parser.add_argument("--resize_h", type=int, default=384) - parser.add_argument("--predict_step_nums", type=int, default=32) - parser.add_argument("--continuous_traj", action="store_true", default=True) - parser.add_argument("--max_new_tokens", type=int, default=1024) - - parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') - parser.add_argument('--rank', default=0, type=int, help='rank') - parser.add_argument('--gpu', default=0, type=int, help='gpu') - parser.add_argument('--port', default='2443') - parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') - parser.add_argument('--device', default='cuda', help='device to use for training / testing') - args = parser.parse_args() - init_distributed_mode(args) - local_rank = args.local_rank - np.random.seed(local_rank) - - processor = AutoProcessor.from_pretrained(args.model_path) - processor.tokenizer.padding_side = 'left' - - device = torch.device(f"cuda:{local_rank}") - model = InternVLAN1ForCausalLM.from_pretrained( - args.model_path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", device_map={"": device} - ) - model.eval() - world_size = get_world_size() - evaluator = VLNEvaluator( - config_path=args.habitat_config_path, - split=args.eval_split, - env_num=world_size, - output_path=args.output_path, - model=model, - processor=processor, - epoch=0, - args=args, - ) - # evaluator.eval_action(0) - - server = BackendServer(host="0.0.0.0", port=8001) - server.run() diff --git a/scripts/eval/bash/eval_vln_distributed.sh b/scripts/eval/bash/eval_vln_distributed.sh index ae2060e9..e5509556 100644 --- a/scripts/eval/bash/eval_vln_distributed.sh +++ b/scripts/eval/bash/eval_vln_distributed.sh @@ -65,30 +65,15 @@ case "$mode" in conda activate internutopia - # -------- parse remaining arguments (e.g., --config xxx) -------- - while 
[[ $# -gt 0 ]]; do - case $1 in - --config) - CONFIG="$2" - shift 2 - ;; - *) - echo "Unknown parameter: $1" - exit 1 - ;; - esac - done - # ---------------------------------------------------------------- - if [ "$RANK" -eq 0 ]; then echo "[run.sh] Starting Ray head..." RAY_max_direct_call_object_size=104857600 \ ray start --head --port=6379 - sleep 20s + sleep 300s echo "[run.sh] Exec start_eval.sh..." - bash scripts/eval/bash/start_eval.sh + bash scripts/eval/bash/start_eval.sh --config $CONFIG sleep inf else diff --git a/scripts/eval/configs/h1_internvla_n1_async_cfg.py b/scripts/eval/configs/h1_internvla_n1_async_cfg.py index 90028b03..3352fe55 100644 --- a/scripts/eval/configs/h1_internvla_n1_async_cfg.py +++ b/scripts/eval/configs/h1_internvla_n1_async_cfg.py @@ -49,7 +49,7 @@ 'env_num': 1, 'use_distributed': False, # If the others setting in task_settings, please set use_distributed = False. 'proc_num': 1, - 'max_step': 50000, # If use flash mode,default 1000; descrete mode, set 50000 + 'max_step': 1000, # If use flash mode,default 1000; descrete mode, set 50000 }, scene=SceneCfg( scene_type='mp3d', @@ -57,7 +57,7 @@ ), robot_name='h1', robot_flash=True, # If robot_flash is True, the mode is flash (set world_pose directly); else you choose physical mode. - flash_collision=True, # If flash_collision is True, the robot will stop when collision detected. + flash_collision=False, # If flash_collision is True, the robot will stop when collision detected. robot_usd_path='data/Embodiments/vln-pe/h1/h1_internvla.usd', camera_resolution=[640, 480], # (W,H) camera_prim_path='torso_link/h1_1_25_down_30', diff --git a/scripts/eval/configs/habitat_dialog_cfg.py b/scripts/eval/configs/habitat_dialog_cfg.py index 202389ec..662e46f3 100755 --- a/scripts/eval/configs/habitat_dialog_cfg.py +++ b/scripts/eval/configs/habitat_dialog_cfg.py @@ -9,7 +9,7 @@ model_settings={ "mode": "system2", # inference mode: dual_system or system2 "dialog_enabled": True, - "model_path": "VL-LN-Bench/base_model/checkpoint-iion", # path to model checkpoint + "model_path": "projects/VL-LN-Bench/base_model/iion", # path to model checkpoint "append_look_down": False, "num_history": 8, "resize_w": 384, # image resize width @@ -39,7 +39,7 @@ "base_url": 'http://35.220.164.252:3888/v1', "model_name": "gpt-4o", "openai_api_key": 'internnav/habitat_extensions/vlln/simple_npc/api_key.txt', - "scene_summary": 'VL-LN-Bench/raw_data/mp3d/scene_summary', + "scene_summary": 'projects/VL-LN-Bench/raw_data/mp3d/scene_summary', # distributed settings "port": "2333", # communication port "dist_url": "env://", # url for distributed setup diff --git a/scripts/eval/configs/habitat_object_cfg.py b/scripts/eval/configs/habitat_object_cfg.py index 05772195..dbfd82f7 100755 --- a/scripts/eval/configs/habitat_object_cfg.py +++ b/scripts/eval/configs/habitat_object_cfg.py @@ -9,7 +9,7 @@ model_settings={ "mode": "system2", # inference mode: dual_system or system2 "dialog_enabled": False, - "model_path": "checkpoints/Vlln-object", # path to model checkpoint + "model_path": "projects/VL-LN-Bench/base_model/checkpoint-iion", # path to model checkpoint "append_look_down": True, "num_history": 8, "resize_w": 384, # image resize width @@ -40,8 +40,8 @@ # npc setting "base_url": 'http://35.220.164.252:3888/v1', "model_name": "gpt-4o", - "openai_api_key": 'internnav/habitat_vlln_extensions/simple_npc/api_key.txt', - "scene_summary": 'internnav/habitat_vlln_extensions/simple_npc/scene_summary', + "openai_api_key": 
'internnav/habitat_extensions/vlln/simple_npc/api_key.txt', + "scene_summary": 'projects/VL-LN-Bench/raw_data/mp3d/scene_summary', # distributed settings "port": "2333", # communication port "dist_url": "env://", # url for distributed setup diff --git a/scripts/eval/configs/instance_dialog.yaml b/scripts/eval/configs/instance_dialog.yaml index 34d0a710..5c3e2e3b 100644 --- a/scripts/eval/configs/instance_dialog.yaml +++ b/scripts/eval/configs/instance_dialog.yaml @@ -74,5 +74,5 @@ habitat: dataset: type: dialog split: unseen_mini - scenes_dir: VL-LN-Bench/scene_datasets/ - data_path: VL-LN-Bench/raw_data/mp3d/val_unseen/val_unseen_iion.json.gz + scenes_dir: projects/VL-LN-Bench/scene_datasets/ + data_path: projects/VL-LN-Bench/raw_data/mp3d/val_unseen/val_unseen_iion.json.gz diff --git a/scripts/eval/configs/objectnav_hm3d.yaml b/scripts/eval/configs/objectnav_hm3d.yaml index 9f7c44c2..262401bb 100644 --- a/scripts/eval/configs/objectnav_hm3d.yaml +++ b/scripts/eval/configs/objectnav_hm3d.yaml @@ -78,5 +78,5 @@ habitat: dataset: type: ObjectNav-v1 split: val - scenes_dir: data/scene_datasets/ + scenes_dir: projects/VL-LN-Bench/scene_datasets/ data_path: data/datasets/objectnav_hm3d_v2/{split}/{split}.json.gz
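
The NaN/inf handling added to both `calc_metrics` implementations above follows one pattern: non-finite SPL values are coerced to 0.0 (so failed episodes still count toward the average), while non-finite navigation-error (NE) values are dropped before averaging. Below is a minimal, self-contained sketch of that pattern on synthetic tensors; the function name `clean_and_average` and the toy inputs are illustrative only and do not appear in the repository.

```python
# Illustrative sketch of the metric-cleaning pattern used in calc_metrics.
# Synthetic per-episode tensors, not the evaluator's real inputs.
import torch


def clean_and_average(sucs_all: torch.Tensor, spls_all: torch.Tensor, nes_all: torch.Tensor) -> dict:
    # Replace NaN/inf SPL entries with 0.0 in place so they count as failures
    # instead of making the mean NaN.
    torch.nan_to_num(spls_all, nan=0.0, posinf=0.0, neginf=0.0, out=spls_all)

    # Keep only finite navigation errors; episodes with inf NE are excluded
    # from the NE average but still contribute to success rate and SPL.
    nes_all = nes_all[torch.isfinite(nes_all)]

    n_eps = len(sucs_all)  # guard against empty result sets
    return {
        "sucs_all": float(sucs_all.mean().item()) if n_eps > 0 else 0.0,
        "spls_all": float(spls_all.mean().item()) if n_eps > 0 else 0.0,
        "nes_all": float(nes_all.mean().item()) if len(nes_all) > 0 else 0.0,
    }


if __name__ == "__main__":
    sucs = torch.tensor([1.0, 0.0, 1.0])
    spls = torch.tensor([0.8, float("nan"), 0.6])  # NaN SPL -> treated as 0.0
    nes = torch.tensor([1.2, float("inf"), 0.4])   # inf NE -> dropped
    print(clean_and_average(sucs, spls, nes))
```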