From 87b5b40ce3e21a3dc388bf3fd2646cd1a3a027c4 Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Tue, 20 Jan 2026 10:47:30 -0500 Subject: [PATCH] fixed issue #1074 --- dimos/models/vl/moondream_hosted.py | 36 +++++++++++++++++++---------- dimos/models/vl/test_vlm.py | 3 +++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/dimos/models/vl/moondream_hosted.py b/dimos/models/vl/moondream_hosted.py index c28a12363f..fc1f8b7a17 100644 --- a/dimos/models/vl/moondream_hosted.py +++ b/dimos/models/vl/moondream_hosted.py @@ -8,7 +8,7 @@ from dimos.models.vl.base import VlModel from dimos.msgs.sensor_msgs import Image -from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D +from dimos.perception.detection.type import Detection2DBBox, Detection2DPoint, ImageDetections2D class MoondreamHostedVlModel(VlModel): @@ -107,29 +107,41 @@ def query_detections(self, image: Image, query: str, **kwargs) -> ImageDetection return image_detections - def point(self, image: Image, query: str) -> list[tuple[float, float]]: - """Get coordinates of specific objects in an image. + def query_points( + self, image: Image, query: str, **kwargs: object + ) -> ImageDetections2D[Detection2DPoint]: + """Detect point locations using Moondream's hosted point method. Args: image: Input image - query: Object query + query: Object query (e.g., "person's head", "center of the ball") Returns: - List of (x, y) pixel coordinates + ImageDetections2D containing detected points """ pil_image = self._to_pil_image(image) result = self._client.point(pil_image, query) - points = result.get("points", []) - pixel_points = [] + image_detections: ImageDetections2D[Detection2DPoint] = ImageDetections2D(image) height, width = image.height, image.width - for p in points: - x_norm = p.get("x", 0.0) - y_norm = p.get("y", 0.0) - pixel_points.append((x_norm * width, y_norm * height)) + for track_id, point in enumerate(result.get("points", [])): + x = point.get("x", 0.0) * width + y = point.get("y", 0.0) * height - return pixel_points + detection = Detection2DPoint( + x=x, + y=y, + name=query, + ts=image.ts, + image=image, + track_id=track_id, + ) + + if detection.is_valid(): + image_detections.detections.append(detection) + + return image_detections def stop(self) -> None: pass diff --git a/dimos/models/vl/test_vlm.py b/dimos/models/vl/test_vlm.py index 1bf20eb680..54e6c5111c 100644 --- a/dimos/models/vl/test_vlm.py +++ b/dimos/models/vl/test_vlm.py @@ -8,6 +8,7 @@ from dimos.core import LCMTransport from dimos.models.vl.moondream import MoondreamVlModel +from dimos.models.vl.moondream_hosted import MoondreamHostedVlModel from dimos.models.vl.qwen import QwenVlModel from dimos.msgs.sensor_msgs import Image from dimos.perception.detection.type import ImageDetections2D @@ -26,6 +27,7 @@ "model_class,model_name", [ (MoondreamVlModel, "Moondream"), + (MoondreamHostedVlModel, "Moondream Hosted"), (QwenVlModel, "Qwen"), ], ) @@ -94,6 +96,7 @@ def test_vlm_bbox_detections(model_class: "type[VlModel]", model_name: str) -> N "model_class,model_name", [ (MoondreamVlModel, "Moondream"), + (MoondreamHostedVlModel, "Moondream Hosted"), (QwenVlModel, "Qwen"), ], )