Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions dimos/models/qwen/video_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,15 @@ def query_single_frame_observable(


def query_single_frame(
image: "PIL.Image",
image: np.ndarray,
query: str = "Return the center coordinates of the fridge handle as a tuple (x,y)",
api_key: Optional[str] = None,
model_name: str = "qwen2.5-vl-72b-instruct",
) -> str:
"""Process a single PIL image with Qwen model.
"""Process a single numpy image array with Qwen model.

Args:
image: A PIL Image to process
image: The image to process, as a numpy array of shape (H, W, 3) in RGB format
query: The query to ask about the image
api_key: Alibaba API key. If None, will try to get from ALIBABA_API_KEY env var
model_name: The Qwen model to use. Defaults to qwen2.5-vl-72b-instruct
Expand All @@ -103,8 +103,9 @@ def query_single_frame(

Example:
```python
from PIL import Image
image = Image.open('image.jpg')
import cv2
image = cv2.imread('image.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert to RGB
response = query_single_frame(image, "Return the center coordinates of the object _____ as a tuple (x,y)")
print(response)
```
Expand Down Expand Up @@ -133,8 +134,8 @@ def query_single_frame(
pool_scheduler=get_scheduler(),
)

# Convert PIL image to numpy array
frame = np.array(image)
# Use the numpy array directly (no conversion needed)
frame = image

# Create a Subject that will emit the image once
frame_subject = Subject()
Expand Down Expand Up @@ -200,18 +201,16 @@ def get_bbox_from_qwen_frame(frame, object_name: Optional[str] = None) -> Option
"""Get bounding box coordinates from Qwen for a specific object or any object using a single frame.

Args:
frame: A single image frame (PIL Image or numpy array)
frame: A single image frame (numpy array in RGB format)
object_name: Optional name of object to detect

Returns:
tuple: (bbox, size) where bbox is [x1, y1, x2, y2] or None if no detection
and size is the estimated height in meters
"""
# Convert numpy array to PIL Image if needed
if isinstance(frame, np.ndarray):
from PIL import Image

frame = Image.fromarray(frame)
# Ensure frame is a numpy array
if not isinstance(frame, np.ndarray):
raise ValueError("Frame must be a numpy array")

prompt = (
f"Look at this image and find the {object_name if object_name else 'most prominent object'}. Estimate the approximate height of the subject."
Expand Down
6 changes: 5 additions & 1 deletion dimos/robot/frontier_exploration/qwen_frontier_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,12 @@ def get_exploration_goal(self, robot_pose: Vector, costmap: Costmap) -> Optional
# Query Qwen model for frontier prediction
try:
prompt = self._create_vision_prompt()

# Convert PIL image to numpy array for query_single_frame
annotated_array = np.array(annotated_image)

response = query_single_frame(
annotated_image, prompt, api_key=self.api_key, model_name=self.model_name
annotated_array, prompt, api_key=self.api_key, model_name=self.model_name
)

print(f"DEBUG: Qwen response: {response}")
Expand Down
13 changes: 5 additions & 8 deletions dimos/skills/observe.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,22 +164,19 @@ def _process_frame_with_qwen(self, frame):
logger.info(f"Processing frame with Qwen model: {self._model_name}")

try:
# Convert numpy array to PIL Image if needed
from PIL import Image

# Ensure frame is in RGB format for Qwen
if isinstance(frame, np.ndarray):
# OpenCV uses BGR, PIL uses RGB
# OpenCV uses BGR channel order; convert 3-channel frames to RGB
if frame.shape[-1] == 3: # Check if it has color channels
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(frame_rgb)
else:
pil_image = Image.fromarray(frame)
frame_rgb = frame
else:
pil_image = frame
raise ValueError("Frame must be a numpy array")

# Query Qwen with the frame (direct function call)
response = query_single_frame(
pil_image,
frame_rgb,
self.query_text,
model_name=self._model_name,
)
Expand Down
12 changes: 5 additions & 7 deletions dimos/skills/observe_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import reactivex as rx
from reactivex import operators as ops
from pydantic import Field
from PIL import Image

from dimos.skills.skills import AbstractRobotSkill
from dimos.agents.agent import LLMAgent
Expand Down Expand Up @@ -200,20 +199,19 @@ def _process_frame(self, frame):
logger.info("Processing frame with Qwen VLM")

try:
# Convert frame to PIL Image format
# Ensure frame is in RGB format for Qwen
if isinstance(frame, np.ndarray):
# OpenCV uses BGR, PIL uses RGB
# OpenCV uses BGR channel order; convert 3-channel frames to RGB
if frame.shape[-1] == 3: # Check if it has color channels
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(frame_rgb)
else:
pil_image = Image.fromarray(frame)
frame_rgb = frame
else:
pil_image = frame
raise ValueError("Frame must be a numpy array")

# Use Qwen to process the frame
model_name = "qwen2.5-vl-72b-instruct" # Using the most capable model
response = query_single_frame(pil_image, self.query_text, model_name=model_name)
response = query_single_frame(frame_rgb, self.query_text, model_name=model_name)

logger.info(f"Qwen response received: {response[:100]}...")

Expand Down
13 changes: 12 additions & 1 deletion tests/test_qwen_image_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
"""Test the Qwen image query functionality."""

import os
import cv2
import numpy as np
from PIL import Image
from dimos.models.qwen.video_query import query_single_frame

Expand All @@ -28,7 +30,16 @@ def test_qwen_image_query():

# Load test image
image_path = os.path.join(os.getcwd(), "assets", "test_spatial_memory", "frame_038.jpg")
image = Image.open(image_path)
pil_image = Image.open(image_path)

# Convert PIL image to numpy array in RGB format
image_array = np.array(pil_image)
if image_array.shape[-1] == 3:
# Three-channel array: use as-is (assumes PIL decoded the file in RGB mode, as it normally does for color JPEGs)
image = image_array
else:
# Otherwise assume a single-channel (grayscale) image and expand it to RGB
image = cv2.cvtColor(image_array, cv2.COLOR_GRAY2RGB)

# Test basic object detection query
response = query_single_frame(
Expand Down