Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dimos/perception/detection2d/module2D.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def process_image(self, image: np.ndarray) -> InconvinientDetectionFormat: ...
@dataclass
class Config:
detector: Optional[Callable[[Any], Detector]] = Yolo2DDetector
max_freq: float = 3.0 # hz
max_freq: float = 0.5 # hz
vlmodel: VlModel = QwenVlModel


Expand Down Expand Up @@ -122,6 +122,7 @@ def vlm_query(self, query: str) -> ImageDetections2D:
return imageDetections

def process_image_frame(self, image: Image) -> ImageDetections2D:
    """Run the configured 2D detector on a single image frame.

    Converts the frame to an OpenCV image, feeds it through
    ``self.detector.process_image``, and wraps the raw detector output
    in an :class:`ImageDetections2D` tied back to the source frame.

    Fix: removed a stray debug ``print`` that dumped the entire image
    object on every frame of the detection stream.
    """
    return ImageDetections2D.from_detector(
        image, self.detector.process_image(image.to_opencv())
    )
Expand All @@ -136,7 +137,7 @@ def sharp_image_stream(self) -> Observable[Image]:

@functools.cache
def detection_stream_2d(self) -> Observable[ImageDetections2D]:
return self.vlm_detections_subject
# self.vlm_detections_subject
# Regular detection stream from the detector
regular_detections = self.sharp_image_stream().pipe(ops.map(self.process_image_frame))
# Merge with VL model detections
Expand Down
25 changes: 13 additions & 12 deletions dimos/perception/detection2d/module3D.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from dimos_lcm.sensor_msgs import CameraInfo
from reactivex import operators as ops
Expand Down Expand Up @@ -57,7 +56,7 @@ def process_frame(
if not transform:
return ImageDetections3D(detections.image, [])

print("PROCESS FRAME", detections)
print("3d projection", detections, pointcloud, transform)
detection3d_list = []
for detection in detections:
detection3d = Detection3D.from_2d(
Expand All @@ -69,7 +68,9 @@ def process_frame(
if detection3d is not None:
detection3d_list.append(detection3d)

return ImageDetections3D(detections.image, detection3d_list)
ret = ImageDetections3D(detections.image, detection3d_list)
print("3d projection finished", ret)
return ret

@rpc
def start(self):
Expand All @@ -80,16 +81,16 @@ def detection2d_to_3d(args):
transform = self.tf.get("camera_optical", pc.frame_id, detections.image.ts, 5.0)
return self.process_frame(detections, pc, transform)

# self.detection_stream_3d = align_timestamped(
# backpressure(self.detection_stream_2d()),
# self.pointcloud.observable(),
# match_tolerance=0.25,
# buffer_size=20.0,
# ).pipe(ops.map(detection2d_to_3d))
self.detection_stream_3d = align_timestamped(
backpressure(self.detection_stream_2d()),
self.pointcloud.observable(),
match_tolerance=0.25,
buffer_size=20.0,
).pipe(ops.map(detection2d_to_3d))

self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
)
# self.detection_stream_3d = backpressure(self.detection_stream_2d()).pipe(
# ops.with_latest_from(self.pointcloud.observable()), ops.map(detection2d_to_3d)
# )

self.detection_stream_3d.subscribe(self._publish_detections)

Expand Down
76 changes: 65 additions & 11 deletions dimos/perception/detection2d/moduleDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def to_pose(self) -> PoseStamped:
print("Global pose:", global_pose)
global_pose.frame_id = self.best_detection.frame_id
print("remap to", self.best_detection.frame_id)
return PoseStamped(position=self.center, orientation=Quaternion(), frame_id="world")
return PoseStamped(
position=self.center, orientation=Quaternion(), frame_id=self.best_detection.frame_id
)


class ObjectDBModule(Detection3DModule, TableStr):
Expand Down Expand Up @@ -147,10 +149,13 @@ class ObjectDBModule(Detection3DModule, TableStr):

target: Out[PoseStamped] = None # type: ignore

remembered_locations: Dict[str, PoseStamped]

def __init__(self, goto: Callable[[PoseStamped], Any], *args, **kwargs):
    """Initialize the object DB module.

    Args:
        goto: Callback invoked with a target ``PoseStamped`` to drive
            the robot toward that pose.
    """
    super().__init__(*args, **kwargs)
    self.goto = goto
    self.objects = {}  # tracked objects, keyed by track id
    self.remembered_locations = {}  # name -> PoseStamped saved via remember_location

def closest_object(self, detection: Detection3D) -> Optional[Object3D]:
# Filter objects to only those with matching names
Expand Down Expand Up @@ -202,25 +207,73 @@ def agent_encode(self) -> List[Any]:

def vlm_query(self, description: str) -> str:
imageDetections2D = super().vlm_query(description)
time.sleep(1.5)

print("VLM query found", imageDetections2D, "detections")
time.sleep(3)

if not imageDetections2D.detections:
return None

ret = []
for obj in self.objects.values():
if obj.ts != imageDetections2D.ts:
print(
"Skipping",
obj.track_id,
"ts",
obj.ts,
"!=",
imageDetections2D.ts,
)
continue
if obj.class_id != -100:
continue
if obj.name != imageDetections2D.detections[0].name:
print("Skipping", obj.name, "!=", imageDetections2D.detections[0].name)
continue
ret.append(obj)
return ret
ret.sort(key=lambda x: x.ts)

return ret[0] if ret else None

@skill()
def remember_location(self, name: str) -> str:
    """Remember the robot's current location under a name.

    Looks up the current ``map -> sensor`` transform (within a 1 s
    tolerance), converts it to a pose in the ``map`` frame, and stores
    it in ``self.remembered_locations`` for later navigation.

    Fix: dropped the stray ``f`` prefix on the placeholder-less error
    string (lint F541) — the runtime text is unchanged.
    """
    transform = self.tf.get("map", "sensor", time_point=time.time(), time_tolerance=1.0)
    if not transform:
        # No recent transform available; report failure instead of raising.
        return "Could not get current location transform from map to sensor"

    pose = transform.to_pose()
    pose.frame_id = "map"  # pose is expressed in the map frame
    self.remembered_locations[name] = pose
    return f"Location '{name}' saved at position: {pose.position}"

@skill()
def navigate_to_object_in_view(self, description: str) -> str:
"""Navigate to an object by description using vision-language model to find it."""
objects = self.vlm_query(description)
if not objects:
return f"No objects found matching '{description}'"
target_obj = objects[0]
def goto_remembered_location(self, name: str) -> str:
    """Navigate to a previously remembered location by its name."""
    pose = self.remembered_locations.get(name)
    if pose:
        self.goto(pose)
        return f"Navigating to remembered location {name} and pose {pose}"
    return f"Location {name} not found. Known locations: {list(self.remembered_locations.keys())}"

@skill()
def list_remembered_locations(self) -> str:
    """List the names of all remembered locations.

    Returns the names rendered as a string, e.g. ``"['kitchen', 'dock']"``.

    Fix: the return annotation said ``List[str]`` but the body returns
    ``str(...)``; the annotation now matches the actual behavior
    (skills hand text back to the agent, so the string form is kept).
    """
    return str(list(self.remembered_locations.keys()))

def nav_to(self, target_pose) -> str:
    # Publish the navigation target and hand off to the goto callback.
    # NOTE(review): Quaternion(0, 0, 0, 0) is the zero quaternion, not a
    # valid rotation (identity would have w=1) — presumably the nav stack
    # treats an all-zero orientation as "ignore heading"; confirm.
    target_pose.orientation = Quaternion(0.0, 0.0, 0.0, 0.0)
    self.target.publish(target_pose)
    # Published twice with a short pause in between — looks like a
    # workaround for subscribers missing the first message; TODO confirm
    # whether the double publish is still needed.
    time.sleep(0.1)
    self.target.publish(target_pose)
    self.goto(target_pose)
    # NOTE(review): annotated -> str but falls off the end and returns
    # None implicitly; callers (navigate_to_object_by_id) ignore the
    # return value, so the annotation appears stale.

@skill()
def navigate_to_object_in_view(self, query: str) -> str:
    """Find an object in the current camera view matching a natural
    language query (via the vision-language model) and navigate to it."""
    match = self.vlm_query(query)
    if match:
        return self.navigate_to_object_by_id(match.track_id)
    return f"No objects found matching '{query}'"

@skill(reducer=Reducer.all)
Expand All @@ -236,10 +289,11 @@ def navigate_to_object_by_id(self, object_id: str):
if not target_obj:
return f"Object {object_id} not found\nHere are the known objects:\n{str(self.agent_encode())}"
target_pose = target_obj.to_pose()
target_pose.frame_id = "map"
self.target.publish(target_pose)
time.sleep(0.1)
self.target.publish(target_pose)
self.goto(target_pose)
self.nav_to(target_pose)
return f"Navigating to f{object_id} f{target_obj.name}"

def lookup(self, label: str) -> List[Detection3D]:
Expand Down
2 changes: 0 additions & 2 deletions dimos/perception/detection2d/test_moduleDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from dimos.msgs.sensor_msgs import Image, PointCloud2
from dimos.msgs.vision_msgs import Detection2DArray
from dimos.perception.detection2d import testing
from dimos.perception.detection2d.module2D import Detection2DModule
from dimos.perception.detection2d.module3D import Detection3DModule
from dimos.perception.detection2d.moduleDB import ObjectDBModule
from dimos.protocol.service import lcmservice as lcm
from dimos.robot.unitree_webrtc.modular import deploy_connection, deploy_navigation
Expand Down
6 changes: 0 additions & 6 deletions dimos/perception/detection2d/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import hashlib
import os
import time
from pathlib import Path
from typing import Optional, TypedDict, Union

from dimos_lcm.foxglove_msgs.ImageAnnotations import ImageAnnotations
Expand All @@ -32,7 +28,6 @@
from dimos.perception.detection2d.module3D import Detection3DModule
from dimos.perception.detection2d.moduleDB import ObjectDBModule
from dimos.perception.detection2d.type import ImageDetections2D, ImageDetections3D
from dimos.protocol.service import lcmservice as lcm
from dimos.protocol.tf import TF
from dimos.robot.unitree_webrtc.modular.connection_module import ConnectionModule
from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
Expand Down Expand Up @@ -83,7 +78,6 @@ def get_g1_moment(**kwargs):
for timestamp, tf_frame in tf_replay.iterate_ts(seek=seek - tf_window, duration=tf_window):
tf.publish(*TFMessage.lcm_decode(tf_frame).transforms)

print(tf)
image_frame = Image.lcm_decode(
TimedSensorReplay(f"{data_dir}/image#sensor_msgs.Image").find_closest_seek(seek)
)
Expand Down
6 changes: 3 additions & 3 deletions dimos/perception/detection2d/type/detection3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def height_filter(height=0.1) -> Detection3DFilter:
return lambda det, pc, ci, tf: pc.filter_by_height(height)


def statistical(nb_neighbors=20, std_ratio=0.5) -> Detection3DFilter:
def statistical(nb_neighbors=40, std_ratio=0.5) -> Detection3DFilter:
def filter_func(
det: Detection2D, pc: PointCloud2, ci: CameraInfo, tf: Transform
) -> Optional[PointCloud2]:
Expand Down Expand Up @@ -74,7 +74,7 @@ def filter_func(
return filter_func


def radius_outlier(min_neighbors: int = 5, radius: float = 0.3) -> Detection3DFilter:
def radius_outlier(min_neighbors: int = 20, radius: float = 0.3) -> Detection3DFilter:
"""
Remove isolated points: keep only points that have at least `min_neighbors`
neighbors within `radius` meters (same units as your point cloud).
Expand Down Expand Up @@ -109,8 +109,8 @@ def from_2d(
filters: list[Callable[[PointCloud2], PointCloud2]] = [
# height_filter(0.1),
raycast(),
statistical(),
radius_outlier(),
statistical(),
],
) -> Optional["Detection3D"]:
"""Create a Detection3D from a 2D detection by projecting world pointcloud.
Expand Down
48 changes: 23 additions & 25 deletions dimos/robot/unitree_webrtc/unitree_g1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,14 @@
import os
import time
from typing import Optional
from dimos import core
from dimos.core import In, Module, Out, rpc
from geometry_msgs.msg import PoseStamped as ROSPoseStamped

from dimos.msgs.sensor_msgs import Joy
from dimos.msgs.std_msgs.Bool import Bool
from dimos.robot.unitree_webrtc.rosnav import NavigationModule
from geometry_msgs.msg import PoseStamped as ROSPoseStamped
from geometry_msgs.msg import TwistStamped as ROSTwistStamped
from lcm_msgs.foxglove_msgs import SceneUpdate

from dimos_lcm.foxglove_msgs import SceneUpdate
from nav_msgs.msg import Odometry as ROSOdometry
from sensor_msgs.msg import PointCloud2 as ROSPointCloud2, Joy as ROSJoy
from sensor_msgs.msg import Joy as ROSJoy
from sensor_msgs.msg import PointCloud2 as ROSPointCloud2
from tf2_msgs.msg import TFMessage as ROSTFMessage

from dimos import core
Expand All @@ -50,7 +47,8 @@
Vector3,
)
from dimos.msgs.nav_msgs.Odometry import Odometry
from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
from dimos.msgs.sensor_msgs import CameraInfo, Image, Joy, PointCloud2
from dimos.msgs.std_msgs.Bool import Bool
from dimos.msgs.tf2_msgs.TFMessage import TFMessage
from dimos.msgs.vision_msgs import Detection2DArray
from dimos.perception.detection2d import Detection3DModule
Expand Down Expand Up @@ -203,7 +201,7 @@ def _deploy_detection(self, goto):
detection.detections.transport = core.LCMTransport("/detections", Detection2DArray)

detection.scene_update.transport = core.LCMTransport("/scene_update", SceneUpdate)

detection.target.transport = core.LCMTransport("/target", PoseStamped)
detection.detected_pointcloud_0.transport = core.LCMTransport(
"/detected/pointcloud/0", PointCloud2
)
Expand Down Expand Up @@ -251,9 +249,9 @@ def start(self):

self.lcm.start()

from dimos.agents2.spec import Model, Provider
from dimos.agents2 import Agent, Output, Reducer, Stream, skill
from dimos.agents2.cli.human import HumanInput
from dimos.agents2.spec import Model, Provider

agent = Agent(
system_prompt="You are a helpful assistant for controlling a humanoid robot. ",
Expand Down Expand Up @@ -289,7 +287,7 @@ def _deploy_camera(self):
CameraModule,
transform=Transform(
translation=Vector3(0.05, 0.0, 0.0),
rotation=Quaternion(0.0, 0.0, 0.0, 1.0),
rotation=Quaternion.from_euler(Vector3(0.0, 0.2, 0.0)),
frame_id="sensor",
child_frame_id="camera_link",
),
Expand Down Expand Up @@ -461,19 +459,19 @@ def main():
)
robot.start()

time.sleep(7)
print("Starting navigation...")
print(
robot.nav.go_to(
PoseStamped(
ts=time.time(),
frame_id="map",
position=Vector3(0.0, 0.0, 0.03),
orientation=Quaternion(0, 0, 0, 0),
),
timeout=10,
),
)
# time.sleep(7)
# print("Starting navigation...")
# print(
# robot.nav.go_to(
# PoseStamped(
# ts=time.time(),
# frame_id="map",
# position=Vector3(0.0, 0.0, 0.03),
# orientation=Quaternion(0, 0, 0, 0),
# ),
# timeout=10,
# ),
# )
try:
if args.joystick:
print("\n" + "=" * 50)
Expand Down
Loading