Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 24 additions & 12 deletions dimos/models/vl/moondream_hosted.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from dimos.models.vl.base import VlModel
from dimos.msgs.sensor_msgs import Image
from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
from dimos.perception.detection.type import Detection2DBBox, Detection2DPoint, ImageDetections2D


class MoondreamHostedVlModel(VlModel):
Expand Down Expand Up @@ -107,29 +107,41 @@ def query_detections(self, image: Image, query: str, **kwargs) -> ImageDetection

return image_detections

def point(self, image: Image, query: str) -> list[tuple[float, float]]:
"""Get coordinates of specific objects in an image.
def query_points(
self, image: Image, query: str, **kwargs: object
) -> ImageDetections2D[Detection2DPoint]:
"""Detect point locations using Moondream's hosted point method.

Args:
image: Input image
query: Object query
query: Object query (e.g., "person's head", "center of the ball")

Returns:
List of (x, y) pixel coordinates
ImageDetections2D containing detected points
"""
pil_image = self._to_pil_image(image)
result = self._client.point(pil_image, query)
points = result.get("points", [])

pixel_points = []
image_detections: ImageDetections2D[Detection2DPoint] = ImageDetections2D(image)
height, width = image.height, image.width

for p in points:
x_norm = p.get("x", 0.0)
y_norm = p.get("y", 0.0)
pixel_points.append((x_norm * width, y_norm * height))
for track_id, point in enumerate(result.get("points", [])):
x = point.get("x", 0.0) * width
y = point.get("y", 0.0) * height

return pixel_points
detection = Detection2DPoint(
x=x,
y=y,
name=query,
ts=image.ts,
image=image,
track_id=track_id,
)

if detection.is_valid():
image_detections.detections.append(detection)

return image_detections

def stop(self) -> None:
pass
Expand Down
3 changes: 3 additions & 0 deletions dimos/models/vl/test_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from dimos.core import LCMTransport
from dimos.models.vl.moondream import MoondreamVlModel
from dimos.models.vl.moondream_hosted import MoondreamHostedVlModel
from dimos.models.vl.qwen import QwenVlModel
from dimos.msgs.sensor_msgs import Image
from dimos.perception.detection.type import ImageDetections2D
Expand All @@ -26,6 +27,7 @@
"model_class,model_name",
[
(MoondreamVlModel, "Moondream"),
(MoondreamHostedVlModel, "Moondream Hosted"),
(QwenVlModel, "Qwen"),
],
)
Expand Down Expand Up @@ -94,6 +96,7 @@ def test_vlm_bbox_detections(model_class: "type[VlModel]", model_name: str) -> N
"model_class,model_name",
[
(MoondreamVlModel, "Moondream"),
(MoondreamHostedVlModel, "Moondream Hosted"),
(QwenVlModel, "Qwen"),
],
)
Expand Down