From 87b5b40ce3e21a3dc388bf3fd2646cd1a3a027c4 Mon Sep 17 00:00:00 2001
From: alexlin2 <alex.lin416@outlook.com>
Date: Tue, 20 Jan 2026 10:47:30 -0500
Subject: [PATCH] Fix issue #1074: replace MoondreamHostedVlModel.point with query_points

---
 dimos/models/vl/moondream_hosted.py | 36 +++++++++++++++++++----------
 dimos/models/vl/test_vlm.py         |  3 +++
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/dimos/models/vl/moondream_hosted.py b/dimos/models/vl/moondream_hosted.py
index c28a12363f..fc1f8b7a17 100644
--- a/dimos/models/vl/moondream_hosted.py
+++ b/dimos/models/vl/moondream_hosted.py
@@ -8,7 +8,7 @@
 
 from dimos.models.vl.base import VlModel
 from dimos.msgs.sensor_msgs import Image
-from dimos.perception.detection.type import Detection2DBBox, ImageDetections2D
+from dimos.perception.detection.type import Detection2DBBox, Detection2DPoint, ImageDetections2D
 
 
 class MoondreamHostedVlModel(VlModel):
@@ -107,29 +107,41 @@ def query_detections(self, image: Image, query: str, **kwargs) -> ImageDetection
 
         return image_detections
 
-    def point(self, image: Image, query: str) -> list[tuple[float, float]]:
-        """Get coordinates of specific objects in an image.
+    def query_points(
+        self, image: Image, query: str, **kwargs: object
+    ) -> ImageDetections2D[Detection2DPoint]:
+        """Detect point locations using Moondream's hosted point method.
 
         Args:
             image: Input image
-            query: Object query
+            query: Object query (e.g., "person's head", "center of the ball")
 
         Returns:
-            List of (x, y) pixel coordinates
+            ImageDetections2D of the detected points that pass is_valid()
         """
         pil_image = self._to_pil_image(image)
         result = self._client.point(pil_image, query)
-        points = result.get("points", [])
 
-        pixel_points = []
+        image_detections: ImageDetections2D[Detection2DPoint] = ImageDetections2D(image)
         height, width = image.height, image.width
 
-        for p in points:
-            x_norm = p.get("x", 0.0)
-            y_norm = p.get("y", 0.0)
-            pixel_points.append((x_norm * width, y_norm * height))
+        for track_id, point in enumerate(result.get("points", [])):
+            x = point.get("x", 0.0) * width
+            y = point.get("y", 0.0) * height
 
-        return pixel_points
+            detection = Detection2DPoint(
+                x=x,
+                y=y,
+                name=query,
+                ts=image.ts,
+                image=image,
+                track_id=track_id,
+            )
+
+            if detection.is_valid():
+                image_detections.detections.append(detection)
+
+        return image_detections
 
     def stop(self) -> None:
         pass
diff --git a/dimos/models/vl/test_vlm.py b/dimos/models/vl/test_vlm.py
index 1bf20eb680..54e6c5111c 100644
--- a/dimos/models/vl/test_vlm.py
+++ b/dimos/models/vl/test_vlm.py
@@ -8,6 +8,7 @@
 
 from dimos.core import LCMTransport
 from dimos.models.vl.moondream import MoondreamVlModel
+from dimos.models.vl.moondream_hosted import MoondreamHostedVlModel
 from dimos.models.vl.qwen import QwenVlModel
 from dimos.msgs.sensor_msgs import Image
 from dimos.perception.detection.type import ImageDetections2D
@@ -26,6 +27,7 @@
     "model_class,model_name",
     [
         (MoondreamVlModel, "Moondream"),
+        (MoondreamHostedVlModel, "Moondream Hosted"),
         (QwenVlModel, "Qwen"),
     ],
 )
@@ -94,6 +96,7 @@ def test_vlm_bbox_detections(model_class: "type[VlModel]", model_name: str) -> N
     "model_class,model_name",
     [
         (MoondreamVlModel, "Moondream"),
+        (MoondreamHostedVlModel, "Moondream Hosted"),
         (QwenVlModel, "Qwen"),
     ],
 )