From 1cc3ed1cba2387b5a82962b192696563a495d7ae Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Thu, 24 Jul 2025 16:23:46 -0700 Subject: [PATCH] remove pil image usage --- dimos/models/qwen/video_query.py | 25 +++++++++---------- .../qwen_frontier_predictor.py | 6 ++++- dimos/skills/observe.py | 13 ++++------ dimos/skills/observe_stream.py | 12 ++++----- tests/test_qwen_image_query.py | 13 +++++++++- 5 files changed, 39 insertions(+), 30 deletions(-) diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py index 7eda5f1aed..c37ca953c2 100644 --- a/dimos/models/qwen/video_query.py +++ b/dimos/models/qwen/video_query.py @@ -85,15 +85,15 @@ def query_single_frame_observable( def query_single_frame( - image: "PIL.Image", + image: np.ndarray, query: str = "Return the center coordinates of the fridge handle as a tuple (x,y)", api_key: Optional[str] = None, model_name: str = "qwen2.5-vl-72b-instruct", ) -> str: - """Process a single PIL image with Qwen model. + """Process a single numpy image array with Qwen model. Args: - image: A PIL Image to process + image: A numpy array image to process (H, W, 3) in RGB format query: The query to ask about the image api_key: Alibaba API key. If None, will try to get from ALIBABA_API_KEY env var model_name: The Qwen model to use. Defaults to qwen2.5-vl-72b-instruct @@ -103,8 +103,9 @@ def query_single_frame( Example: ```python - from PIL import Image - image = Image.open('image.jpg') + import cv2 + image = cv2.imread('image.jpg') + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert to RGB response = query_single_frame(image, "Return the center coordinates of the object _____ as a tuple (x,y)") print(response) ``` @@ -133,8 +134,8 @@ def query_single_frame( pool_scheduler=get_scheduler(), ) - # Convert PIL image to numpy array - frame = np.array(image) + # Use the numpy array directly (no conversion needed) + frame = image # Create a Subject that will emit the image once frame_subject = Subject() @@ -200,18 +201,16 @@ def get_bbox_from_qwen_frame(frame, object_name: Optional[str] = None) -> Option """Get bounding box coordinates from Qwen for a specific object or any object using a single frame. Args: - frame: A single image frame (PIL Image or numpy array) + frame: A single image frame (numpy array in RGB format) object_name: Optional name of object to detect Returns: tuple: (bbox, size) where bbox is [x1, y1, x2, y2] or None if no detection and size is the estimated height in meters """ - # Convert numpy array to PIL Image if needed - if isinstance(frame, np.ndarray): - from PIL import Image - - frame = Image.fromarray(frame) + # Ensure frame is numpy array + if not isinstance(frame, np.ndarray): + raise ValueError("Frame must be a numpy array") prompt = ( f"Look at this image and find the {object_name if object_name else 'most prominent object'}. Estimate the approximate height of the subject." diff --git a/dimos/robot/frontier_exploration/qwen_frontier_predictor.py b/dimos/robot/frontier_exploration/qwen_frontier_predictor.py index 10a1d8a265..2ccdb89a17 100644 --- a/dimos/robot/frontier_exploration/qwen_frontier_predictor.py +++ b/dimos/robot/frontier_exploration/qwen_frontier_predictor.py @@ -246,8 +246,12 @@ def get_exploration_goal(self, robot_pose: Vector, costmap: Costmap) -> Optional # Query Qwen model for frontier prediction try: prompt = self._create_vision_prompt() + + # Convert PIL image to numpy array for query_single_frame + annotated_array = np.array(annotated_image) + response = query_single_frame( - annotated_image, prompt, api_key=self.api_key, model_name=self.model_name + annotated_array, prompt, api_key=self.api_key, model_name=self.model_name ) print(f"DEBUG: Qwen response: {response}") diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py index 067307353a..8a934bf34d 100644 --- a/dimos/skills/observe.py +++ b/dimos/skills/observe.py @@ -164,22 +164,19 @@ def _process_frame_with_qwen(self, frame): logger.info(f"Processing frame with Qwen model: {self._model_name}") try: - # Convert numpy array to PIL Image if needed - from PIL import Image - + # Ensure frame is in RGB format for Qwen if isinstance(frame, np.ndarray): - # OpenCV uses BGR, PIL uses RGB + # OpenCV uses BGR, convert to RGB if needed if frame.shape[-1] == 3: # Check if it has color channels frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - pil_image = Image.fromarray(frame_rgb) else: - pil_image = Image.fromarray(frame) + frame_rgb = frame else: - pil_image = frame + raise ValueError("Frame must be a numpy array") # Query Qwen with the frame (direct function call) response = query_single_frame( - pil_image, + frame_rgb, self.query_text, model_name=self._model_name, ) diff --git a/dimos/skills/observe_stream.py b/dimos/skills/observe_stream.py index 7b4e08874e..1766ffe2aa 100644 --- a/dimos/skills/observe_stream.py +++ b/dimos/skills/observe_stream.py @@ -28,7 +28,6 @@ import reactivex as rx from reactivex import operators as ops from pydantic import Field -from PIL import Image from dimos.skills.skills import AbstractRobotSkill from dimos.agents.agent import LLMAgent @@ -200,20 +199,19 @@ def _process_frame(self, frame): logger.info("Processing frame with Qwen VLM") try: - # Convert frame to PIL Image format + # Ensure frame is in RGB format for Qwen if isinstance(frame, np.ndarray): - # OpenCV uses BGR, PIL uses RGB + # OpenCV uses BGR, convert to RGB if needed if frame.shape[-1] == 3: # Check if it has color channels frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - pil_image = Image.fromarray(frame_rgb) else: - pil_image = Image.fromarray(frame) + frame_rgb = frame else: - pil_image = frame + raise ValueError("Frame must be a numpy array") # Use Qwen to process the frame model_name = "qwen2.5-vl-72b-instruct" # Using the most capable model - response = query_single_frame(pil_image, self.query_text, model_name=model_name) + response = query_single_frame(frame_rgb, self.query_text, model_name=model_name) logger.info(f"Qwen response received: {response[:100]}...") diff --git a/tests/test_qwen_image_query.py b/tests/test_qwen_image_query.py index 13feaf7eb3..634f9f6563 100644 --- a/tests/test_qwen_image_query.py +++ b/tests/test_qwen_image_query.py @@ -15,6 +15,8 @@ """Test the Qwen image query functionality.""" import os +import cv2 +import numpy as np from PIL import Image from dimos.models.qwen.video_query import query_single_frame @@ -28,7 +30,16 @@ def test_qwen_image_query(): # Load test image image_path = os.path.join(os.getcwd(), "assets", "test_spatial_memory", "frame_038.jpg") - image = Image.open(image_path) + pil_image = Image.open(image_path) + + # Convert PIL image to numpy array in RGB format + image_array = np.array(pil_image) + if image_array.shape[-1] == 3: + # Ensure it's in RGB format (PIL loads as RGB by default) + image = image_array + else: + # Handle grayscale images + image = cv2.cvtColor(image_array, cv2.COLOR_GRAY2RGB) # Test basic object detection query response = query_single_frame(