dimensionalOS · alexlin2 · Jul 25, 2025 · Jul 24, 2025
diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py
@@ -85,15 +85,15 @@ def query_single_frame_observable(
 
 
 def query_single_frame(
-    image: "PIL.Image",
+    image: np.ndarray,
     query: str = "Return the center coordinates of the fridge handle as a tuple (x,y)",
     api_key: Optional[str] = None,
     model_name: str = "qwen2.5-vl-72b-instruct",
 ) -> str:
-    """Process a single PIL image with Qwen model.
+    """Process a single numpy image array with Qwen model.
 
     Args:
-        image: A PIL Image to process
+        image: A numpy array image to process (H, W, 3) in RGB format
         query: The query to ask about the image
         api_key: Alibaba API key. If None, will try to get from ALIBABA_API_KEY env var
         model_name: The Qwen model to use. Defaults to qwen2.5-vl-72b-instruct
@@ -103,8 +103,9 @@ def query_single_frame(
 
     Example:
         ```python
-        from PIL import Image
-        image = Image.open('image.jpg')
+        import cv2
+        image = cv2.imread('image.jpg')
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB
         response = query_single_frame(image, "Return the center coordinates of the object _____ as a tuple (x,y)")
         print(response)
         ```
@@ -133,8 +134,8 @@ def query_single_frame(
         pool_scheduler=get_scheduler(),
     )
 
-    # Convert PIL image to numpy array
-    frame = np.array(image)
+    # Use the numpy array directly (no conversion needed)
+    frame = image
 
     # Create a Subject that will emit the image once
     frame_subject = Subject()
@@ -200,18 +201,16 @@ def get_bbox_from_qwen_frame(frame, object_name: Optional[str] = None) -> Option
     """Get bounding box coordinates from Qwen for a specific object or any object using a single frame.
 
     Args:
-        frame: A single image frame (PIL Image or numpy array)
+        frame: A single image frame (numpy array in RGB format)
         object_name: Optional name of object to detect
 
     Returns:
         tuple: (bbox, size) where bbox is [x1, y1, x2, y2] or None if no detection
                and size is the estimated height in meters
     """
-    # Convert numpy array to PIL Image if needed
-    if isinstance(frame, np.ndarray):
-        from PIL import Image
-
-        frame = Image.fromarray(frame)
+    # Reject non-numpy input up front (PIL images are no longer accepted or converted here)
+    if not isinstance(frame, np.ndarray):
+        raise ValueError("Frame must be a numpy array")
 
     prompt = (
         f"Look at this image and find the {object_name if object_name else 'most prominent object'}. Estimate the approximate height of the subject."

diff --git a/dimos/robot/frontier_exploration/qwen_frontier_predictor.py b/dimos/robot/frontier_exploration/qwen_frontier_predictor.py
@@ -246,8 +246,12 @@ def get_exploration_goal(self, robot_pose: Vector, costmap: Costmap) -> Optional
         # Query Qwen model for frontier prediction
         try:
             prompt = self._create_vision_prompt()
+
+            # Convert PIL image to numpy array for query_single_frame
+            annotated_array = np.array(annotated_image)
+
             response = query_single_frame(
-                annotated_image, prompt, api_key=self.api_key, model_name=self.model_name
+                annotated_array, prompt, api_key=self.api_key, model_name=self.model_name
             )
 
             print(f"DEBUG: Qwen response: {response}")

diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py
@@ -164,22 +164,19 @@ def _process_frame_with_qwen(self, frame):
         logger.info(f"Processing frame with Qwen model: {self._model_name}")
 
         try:
-            # Convert numpy array to PIL Image if needed
-            from PIL import Image
-
+            # Ensure frame is in RGB format for Qwen
             if isinstance(frame, np.ndarray):
-                # OpenCV uses BGR, PIL uses RGB
+                # Assumes the frame is in OpenCV BGR order: every 3-channel array is converted to RGB
                 if frame.shape[-1] == 3:  # Check if it has color channels
                     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    pil_image = Image.fromarray(frame_rgb)
                 else:
-                    pil_image = Image.fromarray(frame)
+                    frame_rgb = frame
             else:
-                pil_image = frame
+                raise ValueError("Frame must be a numpy array")
 
             # Query Qwen with the frame (direct function call)
             response = query_single_frame(
-                pil_image,
+                frame_rgb,
                 self.query_text,
                 model_name=self._model_name,
             )

diff --git a/dimos/skills/observe_stream.py b/dimos/skills/observe_stream.py
@@ -28,7 +28,6 @@
 import reactivex as rx
 from reactivex import operators as ops
 from pydantic import Field
-from PIL import Image
 
 from dimos.skills.skills import AbstractRobotSkill
 from dimos.agents.agent import LLMAgent
@@ -200,20 +199,19 @@ def _process_frame(self, frame):
         logger.info("Processing frame with Qwen VLM")
 
         try:
-            # Convert frame to PIL Image format
+            # Ensure frame is in RGB format for Qwen
             if isinstance(frame, np.ndarray):
-                # OpenCV uses BGR, PIL uses RGB
+                # Assumes the frame is in OpenCV BGR order: every 3-channel array is converted to RGB
                 if frame.shape[-1] == 3:  # Check if it has color channels
                     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    pil_image = Image.fromarray(frame_rgb)
                 else:
-                    pil_image = Image.fromarray(frame)
+                    frame_rgb = frame
             else:
-                pil_image = frame
+                raise ValueError("Frame must be a numpy array")
 
             # Use Qwen to process the frame
             model_name = "qwen2.5-vl-72b-instruct"  # Using the most capable model
-            response = query_single_frame(pil_image, self.query_text, model_name=model_name)
+            response = query_single_frame(frame_rgb, self.query_text, model_name=model_name)
 
             logger.info(f"Qwen response received: {response[:100]}...")
 

diff --git a/tests/test_qwen_image_query.py b/tests/test_qwen_image_query.py
@@ -15,6 +15,8 @@
 """Test the Qwen image query functionality."""
 
 import os
+import cv2
+import numpy as np
 from PIL import Image
 from dimos.models.qwen.video_query import query_single_frame
 
@@ -28,7 +30,16 @@ def test_qwen_image_query():
 
     # Load test image
     image_path = os.path.join(os.getcwd(), "assets", "test_spatial_memory", "frame_038.jpg")
-    image = Image.open(image_path)
+    pil_image = Image.open(image_path)
+
+    # Convert PIL image to numpy array in RGB format
+    image_array = np.array(pil_image)
+    if image_array.shape[-1] == 3:
+        # Already 3-channel: use as-is (assumes PIL decoded the file in RGB mode)
+        image = image_array
+    else:
+        # Otherwise assume single-channel grayscale and expand to RGB (RGBA input is not handled)
+        image = cv2.cvtColor(image_array, cv2.COLOR_GRAY2RGB)
 
     # Test basic object detection query
     response = query_single_frame(