diff --git a/dimos/hardware/can_activate.sh b/dimos/hardware/can_activate.sh
new file mode 100644
index 0000000000..60cc95e7ea
--- /dev/null
+++ b/dimos/hardware/can_activate.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+
+# The default CAN name can be set by the user via command-line parameters.
+DEFAULT_CAN_NAME="${1:-can0}"
+
+# The default bitrate for a single CAN module can be set by the user via command-line parameters.
+DEFAULT_BITRATE="${2:-1000000}"
+
+# USB hardware address (optional parameter)
+USB_ADDRESS="${3}"
+echo "-------------------START-----------------------"
+# Check if ethtool is installed.
+if ! dpkg -l | grep -q "ethtool"; then
+    echo -e "\e[31mError: ethtool not detected in the system.\e[0m"
+    echo "Please use the following command to install ethtool:"
+    echo "sudo apt update && sudo apt install ethtool"
+    exit 1
+fi
+
+# Check if can-utils is installed.
+if ! dpkg -l | grep -q "can-utils"; then
+    echo -e "\e[31mError: can-utils not detected in the system.\e[0m"
+    echo "Please use the following command to install can-utils:"
+    echo "sudo apt update && sudo apt install can-utils"
+    exit 1
+fi
+
+echo "Both ethtool and can-utils are installed."
+
+# Retrieve the number of CAN modules in the current system.
+CURRENT_CAN_COUNT=$(ip link show type can | grep -c "link/can")
+
+# Verify that the number of CAN modules in the current system matches the expected value.
+if [ "$CURRENT_CAN_COUNT" -ne "1" ]; then
+    if [ -z "$USB_ADDRESS" ]; then
+        # Iterate through all CAN interfaces.
+        for iface in $(ip -br link show type can | awk '{print $1}'); do
+            # Use ethtool to retrieve bus-info.
+            BUS_INFO=$(sudo ethtool -i "$iface" | grep "bus-info" | awk '{print $2}')
+
+            if [ -z "$BUS_INFO" ]; then
+                echo "Error: Unable to retrieve bus-info for interface $iface."
+                continue
+            fi
+
+            echo "Interface $iface is inserted into USB port $BUS_INFO"
+        done
+        echo -e " \e[31m Error: The number of CAN modules detected by the system ($CURRENT_CAN_COUNT) does not match the expected number (1). \e[0m"
+        echo -e " \e[31m Please add the USB hardware address parameter, such as: \e[0m"
+        echo -e " bash can_activate.sh can0 1000000 1-2:1.0"
+        echo "-------------------ERROR-----------------------"
+        exit 1
+    fi
+fi
+
+# Load the gs_usb module.
+# sudo modprobe gs_usb
+# if [ $? -ne 0 ]; then
+#     echo "Error: Unable to load the gs_usb module."
+#     exit 1
+# fi
+
+if [ -n "$USB_ADDRESS" ]; then
+    echo "Detected USB hardware address parameter: $USB_ADDRESS"
+
+    # Use ethtool to find the CAN interface corresponding to the USB hardware address.
+    INTERFACE_NAME=""
+    for iface in $(ip -br link show type can | awk '{print $1}'); do
+        BUS_INFO=$(sudo ethtool -i "$iface" | grep "bus-info" | awk '{print $2}')
+        if [ "$BUS_INFO" = "$USB_ADDRESS" ]; then
+            INTERFACE_NAME="$iface"
+            break
+        fi
+    done
+
+    if [ -z "$INTERFACE_NAME" ]; then
+        echo "Error: Unable to find CAN interface corresponding to USB hardware address $USB_ADDRESS."
+        exit 1
+    else
+        echo "Found the interface corresponding to USB hardware address $USB_ADDRESS: $INTERFACE_NAME."
+    fi
+else
+    # Retrieve the unique CAN interface.
+    INTERFACE_NAME=$(ip -br link show type can | awk '{print $1}')
+
+    # Check if the interface name has been retrieved.
+    if [ -z "$INTERFACE_NAME" ]; then
+        echo "Error: Unable to detect CAN interface."
+        exit 1
+    fi
+    BUS_INFO=$(sudo ethtool -i "$INTERFACE_NAME" | grep "bus-info" | awk '{print $2}')
+    echo "Expected to configure a single CAN module, detected interface $INTERFACE_NAME with corresponding USB address $BUS_INFO."
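+    # Note: bus-info values look like "1-2:1.0" (a USB port path). If more than one
+    # CAN adapter is plugged in, pass that value as the third argument, e.g.:
+    #   bash can_activate.sh can0 1000000 1-2:1.0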
+fi + +# Check if the current interface is already activated. +IS_LINK_UP=$(ip link show "$INTERFACE_NAME" | grep -q "UP" && echo "yes" || echo "no") + +# Retrieve the bitrate of the current interface. +CURRENT_BITRATE=$(ip -details link show "$INTERFACE_NAME" | grep -oP 'bitrate \K\d+') + +if [ "$IS_LINK_UP" = "yes" ] && [ "$CURRENT_BITRATE" -eq "$DEFAULT_BITRATE" ]; then + echo "Interface $INTERFACE_NAME is already activated with a bitrate of $DEFAULT_BITRATE." + + # Check if the interface name matches the default name. + if [ "$INTERFACE_NAME" != "$DEFAULT_CAN_NAME" ]; then + echo "Rename interface $INTERFACE_NAME to $DEFAULT_CAN_NAME." + sudo ip link set "$INTERFACE_NAME" down + sudo ip link set "$INTERFACE_NAME" name "$DEFAULT_CAN_NAME" + sudo ip link set "$DEFAULT_CAN_NAME" up + echo "The interface has been renamed to $DEFAULT_CAN_NAME and reactivated." + else + echo "The interface name is already $DEFAULT_CAN_NAME." + fi +else + # If the interface is not activated or the bitrate is different, configure it. + if [ "$IS_LINK_UP" = "yes" ]; then + echo "Interface $INTERFACE_NAME is already activated, but the bitrate is $CURRENT_BITRATE, which does not match the set value of $DEFAULT_BITRATE." + else + echo "Interface $INTERFACE_NAME is not activated or bitrate is not set." + fi + + # Set the interface bitrate and activate it. + sudo ip link set "$INTERFACE_NAME" down + sudo ip link set "$INTERFACE_NAME" type can bitrate $DEFAULT_BITRATE + sudo ip link set "$INTERFACE_NAME" up + echo "Interface $INTERFACE_NAME has been reset to bitrate $DEFAULT_BITRATE and activated." + + # Rename the interface to the default name. + if [ "$INTERFACE_NAME" != "$DEFAULT_CAN_NAME" ]; then + echo "Rename interface $INTERFACE_NAME to $DEFAULT_CAN_NAME." + sudo ip link set "$INTERFACE_NAME" down + sudo ip link set "$INTERFACE_NAME" name "$DEFAULT_CAN_NAME" + sudo ip link set "$DEFAULT_CAN_NAME" up + echo "The interface has been renamed to $DEFAULT_CAN_NAME and reactivated." + fi +fi + +echo "-------------------OVER------------------------" diff --git a/dimos/hardware/piper_arm.py b/dimos/hardware/piper_arm.py new file mode 100644 index 0000000000..774f70b1c6 --- /dev/null +++ b/dimos/hardware/piper_arm.py @@ -0,0 +1,517 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
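+
+# Usage note (sketch): the Piper SDK drives the arm over a CAN interface, so the
+# interface typically needs to be up before PiperArm() is constructed. The
+# can_activate.sh script added in this change brings one up with its defaults, e.g.:
+#   bash dimos/hardware/can_activate.sh can0 1000000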
+ +# dimos/hardware/piper_arm.py + +from typing import ( + Optional, + Tuple, +) +from piper_sdk import * # from the official Piper SDK +import numpy as np +import time +import subprocess +import kinpy as kp +import sys +import termios +import tty +import select +from scipy.spatial.transform import Rotation as R +from dimos.utils.transform_utils import euler_to_quaternion, quaternion_to_euler +from dimos.utils.logging_config import setup_logger + +import threading + +import pytest + +import dimos.core as core +import dimos.protocol.service.lcmservice as lcmservice +from dimos.core import In, Module, rpc +from dimos_lcm.geometry_msgs import Pose, Vector3, Twist + +logger = setup_logger("dimos.hardware.piper_arm") + + +class PiperArm: + def __init__(self, arm_name: str = "arm"): + self.arm = C_PiperInterface_V2() + self.arm.ConnectPort() + self.resetArm() + time.sleep(0.5) + self.resetArm() + time.sleep(0.5) + self.enable() + self.enable_gripper() # Enable gripper after arm is enabled + self.gotoZero() + time.sleep(1) + self.init_vel_controller() + + def enable(self): + while not self.arm.EnablePiper(): + pass + time.sleep(0.01) + logger.info("Arm enabled") + # self.arm.ModeCtrl( + # ctrl_mode=0x01, # CAN command mode + # move_mode=0x01, # “Move-J”, but ignored in MIT + # move_spd_rate_ctrl=100, # doesn’t matter in MIT + # is_mit_mode=0xAD # <-- the magic flag + # ) + self.arm.MotionCtrl_2(0x01, 0x01, 80, 0xAD) + + def gotoZero(self): + factor = 1000 + position = [57.0, 0.0, 215.0, 0, 90.0, 0, 0] + X = round(position[0] * factor) + Y = round(position[1] * factor) + Z = round(position[2] * factor) + RX = round(position[3] * factor) + RY = round(position[4] * factor) + RZ = round(position[5] * factor) + joint_6 = round(position[6] * factor) + logger.debug(f"Going to zero position: X={X}, Y={Y}, Z={Z}, RX={RX}, RY={RY}, RZ={RZ}") + self.arm.MotionCtrl_2(0x01, 0x00, 100, 0x00) + self.arm.EndPoseCtrl(X, Y, Z, RX, RY, RZ) + self.arm.GripperCtrl(0, 1000, 0x01, 0) + + def gotoObserve(self): + factor = 1000 + position = [57.0, 0.0, 280.0, 0, 120.0, 0, 0] + X = round(position[0] * factor) + Y = round(position[1] * factor) + Z = round(position[2] * factor) + RX = round(position[3] * factor) + RY = round(position[4] * factor) + RZ = round(position[5] * factor) + joint_6 = round(position[6] * factor) + logger.debug(f"Going to zero position: X={X}, Y={Y}, Z={Z}, RX={RX}, RY={RY}, RZ={RZ}") + self.arm.MotionCtrl_2(0x01, 0x00, 100, 0x00) + self.arm.EndPoseCtrl(X, Y, Z, RX, RY, RZ) + + def softStop(self): + self.gotoZero() + time.sleep(1) + self.arm.MotionCtrl_2( + 0x01, + 0x00, + 100, + ) + self.arm.MotionCtrl_1(0x01, 0, 0) + time.sleep(3) + + def cmd_ee_pose_values(self, x, y, z, r, p, y_, line_mode=False): + """Command end-effector to target pose in space (position + Euler angles)""" + factor = 1000 + pose = [ + x * factor * factor, + y * factor * factor, + z * factor * factor, + r * factor, + p * factor, + y_ * factor, + ] + self.arm.MotionCtrl_2(0x01, 0x02 if line_mode else 0x00, 100, 0x00) + self.arm.EndPoseCtrl( + int(pose[0]), int(pose[1]), int(pose[2]), int(pose[3]), int(pose[4]), int(pose[5]) + ) + + def cmd_ee_pose(self, pose: Pose, line_mode=False): + """Command end-effector to target pose using Pose message""" + # Convert quaternion to euler angles + euler = quaternion_to_euler(pose.orientation, degrees=True) + + # Command the pose + self.cmd_ee_pose_values( + pose.position.x, + pose.position.y, + pose.position.z, + euler.x, + euler.y, + euler.z, + line_mode, + ) + + def get_ee_pose(self): + 
"""Return the current end-effector pose as Pose message with position in meters and quaternion orientation""" + pose = self.arm.GetArmEndPoseMsgs() + factor = 1000.0 + # Extract individual pose values and convert to base units + # Position values are divided by 1000 to convert from SDK units to meters + # Rotation values are divided by 1000 to convert from SDK units to radians + x = pose.end_pose.X_axis / factor / factor # Convert mm to m + y = pose.end_pose.Y_axis / factor / factor # Convert mm to m + z = pose.end_pose.Z_axis / factor / factor # Convert mm to m + rx = pose.end_pose.RX_axis / factor + ry = pose.end_pose.RY_axis / factor + rz = pose.end_pose.RZ_axis / factor + + # Create position vector (already in meters) + position = Vector3(x, y, z) + + orientation = euler_to_quaternion(Vector3(rx, ry, rz), degrees=True) + + return Pose(position, orientation) + + def cmd_gripper_ctrl(self, position, effort=0.25): + """Command end-effector gripper""" + factor = 1000 + position = position * factor * factor # meters + effort = effort * factor # N/m + + self.arm.GripperCtrl(abs(round(position)), abs(round(effort)), 0x01, 0) + logger.debug(f"Commanding gripper position: {position}mm") + + def enable_gripper(self): + """Enable the gripper using the initialization sequence""" + logger.info("Enabling gripper...") + while not self.arm.EnablePiper(): + time.sleep(0.01) + self.arm.GripperCtrl(0, 1000, 0x02, 0) + self.arm.GripperCtrl(0, 1000, 0x01, 0) + logger.info("Gripper enabled") + + def release_gripper(self): + """Release gripper by opening to 100mm (10cm)""" + logger.info("Releasing gripper (opening to 100mm)") + self.cmd_gripper_ctrl(0.1) # 0.1m = 100mm = 10cm + + def get_gripper_feedback(self) -> Tuple[float, float]: + """ + Get current gripper feedback. + + Returns: + Tuple of (angle_degrees, effort) where: + - angle_degrees: Current gripper angle in degrees + - effort: Current gripper effort (0.0 to 1.0 range) + """ + gripper_msg = self.arm.GetArmGripperMsgs() + angle_degrees = ( + gripper_msg.gripper_state.grippers_angle / 1000.0 + ) # Convert from SDK units to degrees + effort = gripper_msg.gripper_state.grippers_effort / 1000.0 # Convert from SDK units to N/m + return angle_degrees, effort + + def close_gripper(self, commanded_effort: float = 0.5) -> None: + """ + Close the gripper. + + Args: + commanded_effort: Effort to use when closing gripper (default 0.25 N/m) + """ + # Command gripper to close (0.0 position) + self.cmd_gripper_ctrl(0.0, effort=commanded_effort) + logger.info("Closing gripper") + + def gripper_object_detected(self, commanded_effort: float = 0.25) -> bool: + """ + Check if an object is detected in the gripper based on effort feedback. 
+ + Args: + commanded_effort: The effort that was used when closing gripper (default 0.25 N/m) + + Returns: + True if object is detected in gripper, False otherwise + """ + # Get gripper feedback + angle_degrees, actual_effort = self.get_gripper_feedback() + + # Check if object is grasped (effort > 80% of commanded effort) + effort_threshold = 0.8 * commanded_effort + object_present = abs(actual_effort) > effort_threshold + + if object_present: + logger.info(f"Object detected in gripper (effort: {actual_effort:.3f} N/m)") + else: + logger.info(f"No object detected (effort: {actual_effort:.3f} N/m)") + + return object_present + + def resetArm(self): + self.arm.MotionCtrl_1(0x02, 0, 0) + self.arm.MotionCtrl_2(0, 0, 0, 0x00) + logger.info("Resetting arm") + + def init_vel_controller(self): + self.chain = kp.build_serial_chain_from_urdf( + open("dimos/hardware/piper_description.urdf"), "gripper_base" + ) + self.J = self.chain.jacobian(np.zeros(6)) + self.J_pinv = np.linalg.pinv(self.J) + self.dt = 0.01 + + def cmd_vel(self, x_dot, y_dot, z_dot, R_dot, P_dot, Y_dot): + joint_state = self.arm.GetArmJointMsgs().joint_state + # print(f"[PiperArm] Current Joints (direct): {joint_state}", type(joint_state)) + joint_angles = np.array( + [ + joint_state.joint_1, + joint_state.joint_2, + joint_state.joint_3, + joint_state.joint_4, + joint_state.joint_5, + joint_state.joint_6, + ] + ) + # print(f"[PiperArm] Current Joints: {joint_angles}", type(joint_angles)) + factor = 57295.7795 # 1000*180/3.1415926 + joint_angles = joint_angles / factor # convert to radians + + q = np.array( + [ + joint_angles[0], + joint_angles[1], + joint_angles[2], + joint_angles[3], + joint_angles[4], + joint_angles[5], + ] + ) + J = self.chain.jacobian(q) + self.J_pinv = np.linalg.pinv(J) + dq = self.J_pinv @ np.array([x_dot, y_dot, z_dot, R_dot, P_dot, Y_dot]) * self.dt + newq = q + dq + + newq = newq * factor + + self.arm.MotionCtrl_2(0x01, 0x01, 100, 0xAD) + self.arm.JointCtrl( + int(round(newq[0])), + int(round(newq[1])), + int(round(newq[2])), + int(round(newq[3])), + int(round(newq[4])), + int(round(newq[5])), + ) + time.sleep(self.dt) + # print(f"[PiperArm] Moving to Joints to : {newq}") + + def cmd_vel_ee(self, x_dot, y_dot, z_dot, RX_dot, PY_dot, YZ_dot): + factor = 1000 + x_dot = x_dot * factor + y_dot = y_dot * factor + z_dot = z_dot * factor + RX_dot = RX_dot * factor + PY_dot = PY_dot * factor + YZ_dot = YZ_dot * factor + + current_pose_msg = self.get_ee_pose() + + # Convert quaternion to euler angles + quat = [ + current_pose_msg.orientation.x, + current_pose_msg.orientation.y, + current_pose_msg.orientation.z, + current_pose_msg.orientation.w, + ] + rotation = R.from_quat(quat) + euler = rotation.as_euler("xyz") # Returns [rx, ry, rz] in radians + + # Create current pose array [x, y, z, rx, ry, rz] + current_pose = np.array( + [ + current_pose_msg.position.x, + current_pose_msg.position.y, + current_pose_msg.position.z, + euler[0], + euler[1], + euler[2], + ] + ) + + # Apply velocity increment + current_pose = ( + current_pose + np.array([x_dot, y_dot, z_dot, RX_dot, PY_dot, YZ_dot]) * self.dt + ) + + self.cmd_ee_pose_values( + current_pose[0], + current_pose[1], + current_pose[2], + current_pose[3], + current_pose[4], + current_pose[5], + ) + time.sleep(self.dt) + + def disable(self): + self.softStop() + + while self.arm.DisablePiper(): + pass + time.sleep(0.01) + self.arm.DisconnectPort() + + +class VelocityController(Module): + cmd_vel: In[Twist] = None + + def __init__(self, arm, period=0.01, *args, **kwargs): 
+        super().__init__(*args, **kwargs)
+        self.arm = arm
+        self.period = period
+        self.latest_cmd = None
+        self.last_cmd_time = None
+
+    @rpc
+    def start(self):
+        self.cmd_vel.subscribe(self.handle_cmd_vel)
+
+        def control_loop():
+            while True:
+                # Check for timeout (1 second)
+                if self.last_cmd_time and (time.time() - self.last_cmd_time) > 1.0:
+                    logger.warning(
+                        "No velocity command received for 1 second, stopping control loop"
+                    )
+                    break
+
+                cmd_vel = self.latest_cmd
+                if cmd_vel is None:
+                    # No command received yet; wait for the first Twist message.
+                    time.sleep(self.period)
+                    continue
+
+                # self.arm is the PiperArm wrapper; the raw SDK interface is self.arm.arm.
+                joint_state = self.arm.arm.GetArmJointMsgs().joint_state
+                # print(f"[PiperArm] Current Joints (direct): {joint_state}", type(joint_state))
+                joint_angles = np.array(
+                    [
+                        joint_state.joint_1,
+                        joint_state.joint_2,
+                        joint_state.joint_3,
+                        joint_state.joint_4,
+                        joint_state.joint_5,
+                        joint_state.joint_6,
+                    ]
+                )
+                factor = 57295.7795  # 1000 * 180 / 3.1415926
+                q = joint_angles / factor  # convert SDK units (0.001 deg) to radians
+
+                # Map the commanded end-effector twist to joint increments via the
+                # pseudo-inverse of the arm's kinematic Jacobian.
+                J = self.arm.chain.jacobian(q)
+                J_pinv = np.linalg.pinv(J)
+                dq = (
+                    J_pinv
+                    @ np.array(
+                        [
+                            cmd_vel.linear.x,
+                            cmd_vel.linear.y,
+                            cmd_vel.linear.z,
+                            cmd_vel.angular.x,
+                            cmd_vel.angular.y,
+                            cmd_vel.angular.z,
+                        ]
+                    )
+                    * self.period
+                )
+                newq = q + dq
+
+                newq = newq * factor  # convert radians back to scaled degree units for joint control
+
+                self.arm.arm.MotionCtrl_2(0x01, 0x01, 100, 0xAD)
+                self.arm.arm.JointCtrl(
+                    int(round(newq[0])),
+                    int(round(newq[1])),
+                    int(round(newq[2])),
+                    int(round(newq[3])),
+                    int(round(newq[4])),
+                    int(round(newq[5])),
+                )
+                time.sleep(self.period)
+
+        thread = threading.Thread(target=control_loop, daemon=True)
+        thread.start()
+
+    def handle_cmd_vel(self, cmd_vel: Twist):
+        self.latest_cmd = cmd_vel
+        self.last_cmd_time = time.time()
+
+
+@pytest.mark.tool
+def run_velocity_controller():
+    lcmservice.autoconf()
+    dimos = core.start(2)
+
+    velocity_controller = dimos.deploy(VelocityController, arm=arm, period=0.01)
+    velocity_controller.cmd_vel.transport = core.LCMTransport("/cmd_vel", Twist)
+
+    velocity_controller.start()
+
+    logger.info("Velocity controller started")
+    while True:
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    arm = PiperArm()
+
+    def get_key(timeout=0.1):
+        """Non-blocking key reader for arrow keys."""
+        fd = sys.stdin.fileno()
+        old_settings = termios.tcgetattr(fd)
+        try:
+            tty.setraw(fd)
+            rlist, _, _ = select.select([fd], [], [], timeout)
+            if rlist:
+                ch1 = sys.stdin.read(1)
+                if ch1 == "\x1b":  # Arrow keys start with ESC
+                    ch2 = sys.stdin.read(1)
+                    if ch2 == "[":
+                        ch3 = sys.stdin.read(1)
+                        return ch1 + ch2 + ch3
+                else:
+                    return ch1
+            return None
+        finally:
+            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+
+    def teleop_linear_vel(arm):
+        print("Use arrow keys to control linear velocity (x/y/z). Press 'q' to quit.")
+        print("Up/Down: +x/-x, Left/Right: +y/-y, 'w'/'s': +z/-z")
+        x_dot, y_dot, z_dot = 0.0, 0.0, 0.0
+        while True:
+            key = get_key(timeout=0.1)
+            if key == "\x1b[A":  # Up arrow
+                x_dot += 0.01
+            elif key == "\x1b[B":  # Down arrow
+                x_dot -= 0.01
+            elif key == "\x1b[C":  # Right arrow
+                y_dot += 0.01
+            elif key == "\x1b[D":  # Left arrow
+                y_dot -= 0.01
+            elif key == "w":
+                z_dot += 0.01
+            elif key == "s":
+                z_dot -= 0.01
+            elif key == "q":
+                logger.info("Exiting teleop")
+                arm.disable()
+                break
+
+            # Optionally, clamp velocities to reasonable limits
+            x_dot = max(min(x_dot, 0.5), -0.5)
+            y_dot = max(min(y_dot, 0.5), -0.5)
+            z_dot = max(min(z_dot, 0.5), -0.5)
+
+            # Only linear velocities, angular set to zero
+            arm.cmd_vel_ee(x_dot, y_dot, z_dot, 0, 0, 0)
+            logger.debug(
+                f"Current linear velocity: x={x_dot:.3f} m/s, y={y_dot:.3f} m/s, z={z_dot:.3f} m/s"
+            )
+
+    run_velocity_controller()
diff --git a/dimos/hardware/piper_description.urdf b/dimos/hardware/piper_description.urdf
new file mode 100755
index 0000000000..21209b6dbb
--- /dev/null
+++ b/dimos/hardware/piper_description.urdf
@@ -0,0 +1,497 @@
+<!-- 497 lines of URDF XML describing the Piper arm (not reproduced here) -->
diff --git a/dimos/hardware/zed_camera.py b/dimos/hardware/zed_camera.py
index b93d2577e6..7ee2aed634 100644
--- a/dimos/hardware/zed_camera.py
+++ b/dimos/hardware/zed_camera.py
@@ -17,6 +17,10 @@
 import open3d as o3d
 from typing import Optional, Tuple, Dict, Any
 import logging
+import time
+import threading
+from reactivex import interval
+from reactivex import operators as ops

 try:
     import pyzed.sl as sl
@@ -25,8 +29,17 @@
     logging.warning("ZED SDK not found. 
Please install pyzed to use ZED camera functionality.") from dimos.hardware.stereo_camera import StereoCamera +from dimos.core import Module, Out, rpc +from dimos.utils.logging_config import setup_logger -logger = logging.getLogger(__name__) +# Import LCM message types +from dimos_lcm.sensor_msgs import Image +from dimos_lcm.sensor_msgs import CameraInfo +from dimos_lcm.geometry_msgs import PoseStamped +from dimos_lcm.std_msgs import Header, Time +from dimos_lcm.geometry_msgs import Pose, Point, Quaternion + +logger = setup_logger(__name__) class ZEDCamera(StereoCamera): @@ -64,6 +77,7 @@ def __init__( self.init_params = sl.InitParameters() self.init_params.camera_resolution = resolution self.init_params.depth_mode = depth_mode + self.init_params.coordinate_system = sl.COORDINATE_SYSTEM.RIGHT_HANDED_Z_UP_X_FWD self.init_params.coordinate_units = sl.UNIT.METER self.init_params.camera_fps = fps @@ -84,6 +98,12 @@ def __init__( self.point_cloud = sl.Mat() self.confidence_map = sl.Mat() + # Positional tracking + self.tracking_enabled = False + self.tracking_params = sl.PositionalTrackingParameters() + self.camera_pose = sl.Pose() + self.sensors_data = sl.SensorsData() + self.is_opened = False def open(self) -> bool: @@ -109,12 +129,160 @@ def open(self) -> bool: logger.error(f"Error opening ZED camera: {e}") return False - def close(self): - """Close the ZED camera.""" - if self.is_opened: - self.zed.close() - self.is_opened = False - logger.info("ZED camera closed") + def enable_positional_tracking( + self, + enable_area_memory: bool = False, + enable_pose_smoothing: bool = True, + enable_imu_fusion: bool = True, + set_floor_as_origin: bool = False, + initial_world_transform: Optional[sl.Transform] = None, + ) -> bool: + """ + Enable positional tracking on the ZED camera. + + Args: + enable_area_memory: Enable area learning to correct tracking drift + enable_pose_smoothing: Enable pose smoothing + enable_imu_fusion: Enable IMU fusion if available + set_floor_as_origin: Set the floor as origin (useful for robotics) + initial_world_transform: Initial world transform + + Returns: + True if tracking enabled successfully + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return False + + try: + # Configure tracking parameters + self.tracking_params.enable_area_memory = enable_area_memory + self.tracking_params.enable_pose_smoothing = enable_pose_smoothing + self.tracking_params.enable_imu_fusion = enable_imu_fusion + self.tracking_params.set_floor_as_origin = set_floor_as_origin + + if initial_world_transform is not None: + self.tracking_params.initial_world_transform = initial_world_transform + + # Enable tracking + err = self.zed.enable_positional_tracking(self.tracking_params) + if err != sl.ERROR_CODE.SUCCESS: + logger.error(f"Failed to enable positional tracking: {err}") + return False + + self.tracking_enabled = True + logger.info("Positional tracking enabled successfully") + return True + + except Exception as e: + logger.error(f"Error enabling positional tracking: {e}") + return False + + def disable_positional_tracking(self): + """Disable positional tracking.""" + if self.tracking_enabled: + self.zed.disable_positional_tracking() + self.tracking_enabled = False + logger.info("Positional tracking disabled") + + def get_pose( + self, reference_frame: sl.REFERENCE_FRAME = sl.REFERENCE_FRAME.WORLD + ) -> Optional[Dict[str, Any]]: + """ + Get the current camera pose. 
+ + Args: + reference_frame: Reference frame (WORLD or CAMERA) + + Returns: + Dictionary containing: + - position: [x, y, z] in meters + - rotation: [x, y, z, w] quaternion + - euler_angles: [roll, pitch, yaw] in radians + - timestamp: Pose timestamp in nanoseconds + - confidence: Tracking confidence (0-100) + - valid: Whether pose is valid + """ + if not self.tracking_enabled: + logger.error("Positional tracking not enabled") + return None + + try: + # Get current pose + tracking_state = self.zed.get_position(self.camera_pose, reference_frame) + + if tracking_state == sl.POSITIONAL_TRACKING_STATE.OK: + # Extract translation + translation = self.camera_pose.get_translation().get() + + # Extract rotation (quaternion) + rotation = self.camera_pose.get_orientation().get() + + # Get Euler angles + euler = self.camera_pose.get_euler_angles() + + return { + "position": translation.tolist(), + "rotation": rotation.tolist(), # [x, y, z, w] + "euler_angles": euler.tolist(), # [roll, pitch, yaw] + "timestamp": self.camera_pose.timestamp.get_nanoseconds(), + "confidence": self.camera_pose.pose_confidence, + "valid": True, + "tracking_state": str(tracking_state), + } + else: + logger.warning(f"Tracking state: {tracking_state}") + return {"valid": False, "tracking_state": str(tracking_state)} + + except Exception as e: + logger.error(f"Error getting pose: {e}") + return None + + def get_imu_data(self) -> Optional[Dict[str, Any]]: + """ + Get IMU sensor data if available. + + Returns: + Dictionary containing: + - orientation: IMU orientation quaternion [x, y, z, w] + - angular_velocity: [x, y, z] in rad/s + - linear_acceleration: [x, y, z] in m/s² + - timestamp: IMU data timestamp + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return None + + try: + # Get sensors data synchronized with images + if ( + self.zed.get_sensors_data(self.sensors_data, sl.TIME_REFERENCE.IMAGE) + == sl.ERROR_CODE.SUCCESS + ): + imu = self.sensors_data.get_imu_data() + + # Get IMU orientation + imu_orientation = imu.get_pose().get_orientation().get() + + # Get angular velocity + angular_vel = imu.get_angular_velocity() + + # Get linear acceleration + linear_accel = imu.get_linear_acceleration() + + return { + "orientation": imu_orientation.tolist(), + "angular_velocity": angular_vel.tolist(), + "linear_acceleration": linear_accel.tolist(), + "timestamp": self.sensors_data.timestamp.get_nanoseconds(), + "temperature": self.sensors_data.temperature.get(sl.SENSOR_LOCATION.IMU), + } + else: + return None + + except Exception as e: + logger.error(f"Error getting IMU data: {e}") + return None def capture_frame( self, @@ -211,6 +379,52 @@ def capture_pointcloud(self) -> Optional[o3d.geometry.PointCloud]: logger.error(f"Error capturing point cloud: {e}") return None + def capture_frame_with_pose( + self, + ) -> Tuple[ + Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], Optional[Dict[str, Any]] + ]: + """ + Capture a frame with synchronized pose data. 
+ + Returns: + Tuple of (left_image, right_image, depth_map, pose_data) + """ + if not self.is_opened: + logger.error("ZED camera not opened") + return None, None, None, None + + try: + # Grab frame + if self.zed.grab(self.runtime_params) == sl.ERROR_CODE.SUCCESS: + # Get images and depth + left_img, right_img, depth = self.capture_frame() + + # Get synchronized pose if tracking is enabled + pose_data = None + if self.tracking_enabled: + pose_data = self.get_pose() + + return left_img, right_img, depth, pose_data + else: + logger.warning("Failed to grab frame from ZED camera") + return None, None, None, None + + except Exception as e: + logger.error(f"Error capturing frame with pose: {e}") + return None, None, None, None + + def close(self): + """Close the ZED camera.""" + if self.is_opened: + # Disable tracking if enabled + if self.tracking_enabled: + self.disable_positional_tracking() + + self.zed.close() + self.is_opened = False + logger.info("ZED camera closed") + def get_camera_info(self) -> Dict[str, Any]: """Get ZED camera information and calibration parameters.""" if not self.is_opened: @@ -311,3 +525,344 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" self.close() + + +class ZEDModule(Module): + """ + Dask module for ZED camera that publishes sensor data via LCM. + + Publishes: + - /zed/color_image: RGB camera images + - /zed/depth_image: Depth images + - /zed/camera_info: Camera calibration information + - /zed/pose: Camera pose (if tracking enabled) + """ + + # Define LCM outputs + color_image: Out[Image] = None + depth_image: Out[Image] = None + camera_info: Out[CameraInfo] = None + pose: Out[PoseStamped] = None + + def __init__( + self, + camera_id: int = 0, + resolution: str = "HD720", + depth_mode: str = "NEURAL", + fps: int = 30, + enable_tracking: bool = True, + enable_imu_fusion: bool = True, + set_floor_as_origin: bool = True, + publish_rate: float = 30.0, + frame_id: str = "zed_camera", + **kwargs, + ): + """ + Initialize ZED Module. 
+ + Args: + camera_id: Camera ID (0 for first ZED) + resolution: Resolution string ("HD720", "HD1080", "HD2K", "VGA") + depth_mode: Depth mode string ("NEURAL", "ULTRA", "QUALITY", "PERFORMANCE") + fps: Camera frame rate + enable_tracking: Enable positional tracking + enable_imu_fusion: Enable IMU fusion for tracking + set_floor_as_origin: Set floor as origin for tracking + publish_rate: Rate to publish messages (Hz) + frame_id: TF frame ID for messages + """ + super().__init__(**kwargs) + + self.camera_id = camera_id + self.fps = fps + self.enable_tracking = enable_tracking + self.enable_imu_fusion = enable_imu_fusion + self.set_floor_as_origin = set_floor_as_origin + self.publish_rate = publish_rate + self.frame_id = frame_id + + # Convert string parameters to ZED enums + self.resolution = getattr(sl.RESOLUTION, resolution, sl.RESOLUTION.HD720) + self.depth_mode = getattr(sl.DEPTH_MODE, depth_mode, sl.DEPTH_MODE.NEURAL) + + # Internal state + self.zed_camera = None + self._running = False + self._subscription = None + self._sequence = 0 + + logger.info(f"ZEDModule initialized for camera {camera_id}") + + @rpc + def start(self): + """Start the ZED module and begin publishing data.""" + if self._running: + logger.warning("ZED module already running") + return + + try: + # Initialize ZED camera + self.zed_camera = ZEDCamera( + camera_id=self.camera_id, + resolution=self.resolution, + depth_mode=self.depth_mode, + fps=self.fps, + ) + + # Open camera + if not self.zed_camera.open(): + logger.error("Failed to open ZED camera") + return + + # Enable tracking if requested + if self.enable_tracking: + success = self.zed_camera.enable_positional_tracking( + enable_imu_fusion=self.enable_imu_fusion, + set_floor_as_origin=self.set_floor_as_origin, + enable_pose_smoothing=True, + enable_area_memory=True, + ) + if not success: + logger.warning("Failed to enable positional tracking") + self.enable_tracking = False + + # Publish camera info once at startup + self._publish_camera_info() + + # Start periodic frame capture and publishing + self._running = True + publish_interval = 1.0 / self.publish_rate + + self._subscription = interval(publish_interval).subscribe( + lambda _: self._capture_and_publish() + ) + + logger.info(f"ZED module started, publishing at {self.publish_rate} Hz") + + except Exception as e: + logger.error(f"Error starting ZED module: {e}") + self._running = False + + @rpc + def stop(self): + """Stop the ZED module.""" + if not self._running: + return + + self._running = False + + # Stop subscription + if self._subscription: + self._subscription.dispose() + self._subscription = None + + # Close camera + if self.zed_camera: + self.zed_camera.close() + self.zed_camera = None + + logger.info("ZED module stopped") + + def _capture_and_publish(self): + """Capture frame and publish all data.""" + if not self._running or not self.zed_camera: + return + + try: + # Capture frame with pose + left_img, _, depth, pose_data = self.zed_camera.capture_frame_with_pose() + + if left_img is None or depth is None: + return + + # Get timestamp + timestamp_ns = time.time_ns() + timestamp = Time(sec=timestamp_ns // 1_000_000_000, nsec=timestamp_ns % 1_000_000_000) + + # Create header + header = Header(seq=self._sequence, stamp=timestamp, frame_id=self.frame_id) + self._sequence += 1 + + # Publish color image + self._publish_color_image(left_img, header) + + # Publish depth image + self._publish_depth_image(depth, header) + + # Publish camera info periodically + self._publish_camera_info() + + # Publish pose 
if tracking enabled and valid + if self.enable_tracking and pose_data and pose_data.get("valid", False): + self._publish_pose(pose_data, header) + + except Exception as e: + logger.error(f"Error in capture and publish: {e}") + + def _publish_color_image(self, image: np.ndarray, header: Header): + """Publish color image as LCM message.""" + try: + # Convert BGR to RGB if needed + if len(image.shape) == 3 and image.shape[2] == 3: + image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + else: + image_rgb = image + + # Create LCM Image message + height, width = image_rgb.shape[:2] + encoding = "rgb8" if len(image_rgb.shape) == 3 else "mono8" + step = width * (3 if len(image_rgb.shape) == 3 else 1) + data = image_rgb.tobytes() + + msg = Image( + data_length=len(data), + header=header, + height=height, + width=width, + encoding=encoding, + is_bigendian=0, + step=step, + data=data, + ) + + self.color_image.publish(msg) + + except Exception as e: + logger.error(f"Error publishing color image: {e}") + + def _publish_depth_image(self, depth: np.ndarray, header: Header): + """Publish depth image as LCM message.""" + try: + # Depth is float32 in meters + height, width = depth.shape[:2] + encoding = "32FC1" # 32-bit float, single channel + step = width * 4 # 4 bytes per float + data = depth.astype(np.float32).tobytes() + + msg = Image( + data_length=len(data), + header=header, + height=height, + width=width, + encoding=encoding, + is_bigendian=0, + step=step, + data=data, + ) + + self.depth_image.publish(msg) + + except Exception as e: + logger.error(f"Error publishing depth image: {e}") + + def _publish_camera_info(self): + """Publish camera calibration information.""" + try: + info = self.zed_camera.get_camera_info() + if not info: + return + + # Get calibration parameters + left_cam = info.get("left_cam", {}) + resolution = info.get("resolution", {}) + + # Create CameraInfo message + header = Header(seq=0, stamp=Time(sec=int(time.time()), nsec=0), frame_id=self.frame_id) + + # Create camera matrix K (3x3) + K = [ + left_cam.get("fx", 0), + 0, + left_cam.get("cx", 0), + 0, + left_cam.get("fy", 0), + left_cam.get("cy", 0), + 0, + 0, + 1, + ] + + # Distortion coefficients + D = [ + left_cam.get("k1", 0), + left_cam.get("k2", 0), + left_cam.get("p1", 0), + left_cam.get("p2", 0), + left_cam.get("k3", 0), + ] + + # Identity rotation matrix + R = [1, 0, 0, 0, 1, 0, 0, 0, 1] + + # Projection matrix P (3x4) + P = [ + left_cam.get("fx", 0), + 0, + left_cam.get("cx", 0), + 0, + 0, + left_cam.get("fy", 0), + left_cam.get("cy", 0), + 0, + 0, + 0, + 1, + 0, + ] + + msg = CameraInfo( + D_length=len(D), + header=header, + height=resolution.get("height", 0), + width=resolution.get("width", 0), + distortion_model="plumb_bob", + D=D, + K=K, + R=R, + P=P, + binning_x=0, + binning_y=0, + ) + + self.camera_info.publish(msg) + + except Exception as e: + logger.error(f"Error publishing camera info: {e}") + + def _publish_pose(self, pose_data: Dict[str, Any], header: Header): + """Publish camera pose as PoseStamped message.""" + try: + position = pose_data.get("position", [0, 0, 0]) + rotation = pose_data.get("rotation", [0, 0, 0, 1]) # quaternion [x,y,z,w] + + # Create Pose message + pose = Pose( + position=Point(x=position[0], y=position[1], z=position[2]), + orientation=Quaternion(x=rotation[0], y=rotation[1], z=rotation[2], w=rotation[3]), + ) + + # Create PoseStamped message + msg = PoseStamped(header=header, pose=pose) + + self.pose.publish(msg) + + except Exception as e: + logger.error(f"Error publishing pose: {e}") + 
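+    # Decoding sketch for subscribers (assumes the LCM Image fields used above):
+    #   rgb   = np.frombuffer(msg.data, dtype=np.uint8).reshape((msg.height, msg.width, 3))
+    #   depth = np.frombuffer(msg.data, dtype=np.float32).reshape((msg.height, msg.width))
+    # ManipulationModule._on_rgb_image / _on_depth_image in this change decode frames the same way.
+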
+ @rpc + def get_camera_info(self) -> Dict[str, Any]: + """Get camera information and calibration parameters.""" + if self.zed_camera: + return self.zed_camera.get_camera_info() + return {} + + @rpc + def get_pose(self) -> Optional[Dict[str, Any]]: + """Get current camera pose if tracking is enabled.""" + if self.zed_camera and self.enable_tracking: + return self.zed_camera.get_pose() + return None + + def cleanup(self): + """Clean up resources on module destruction.""" + self.stop() diff --git a/dimos/perception/manip_aio_pipeline.py b/dimos/manipulation/manip_aio_pipeline.py similarity index 100% rename from dimos/perception/manip_aio_pipeline.py rename to dimos/manipulation/manip_aio_pipeline.py diff --git a/dimos/perception/manip_aio_processer.py b/dimos/manipulation/manip_aio_processer.py similarity index 100% rename from dimos/perception/manip_aio_processer.py rename to dimos/manipulation/manip_aio_processer.py diff --git a/dimos/manipulation/visual_servoing/detection3d.py b/dimos/manipulation/visual_servoing/detection3d.py new file mode 100644 index 0000000000..887fd023ab --- /dev/null +++ b/dimos/manipulation/visual_servoing/detection3d.py @@ -0,0 +1,300 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Real-time 3D object detection processor that extracts object poses from RGB-D data. +""" + +from typing import List, Optional, Tuple +import numpy as np +import cv2 + +from dimos.utils.logging_config import setup_logger +from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter +from dimos.perception.pointcloud.utils import extract_centroids_from_masks +from dimos.perception.detection2d.utils import calculate_object_size_from_bbox +from dimos.perception.common.utils import bbox2d_to_corners + +from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point +from dimos_lcm.vision_msgs import ( + Detection3D, + Detection3DArray, + BoundingBox3D, + ObjectHypothesisWithPose, + ObjectHypothesis, + Detection2D, + Detection2DArray, + BoundingBox2D, + Pose2D, + Point2D, +) +from dimos_lcm.std_msgs import Header +from dimos.manipulation.visual_servoing.utils import ( + estimate_object_depth, + visualize_detections_3d, + transform_pose, +) + +logger = setup_logger("dimos.manipulation.visual_servoing.detection3d") + + +class Detection3DProcessor: + """ + Real-time 3D detection processor optimized for speed. + + Uses Sam (FastSAM) for segmentation and mask generation, then extracts + 3D centroids from depth data. + """ + + def __init__( + self, + camera_intrinsics: List[float], # [fx, fy, cx, cy] + min_confidence: float = 0.6, + min_points: int = 30, + max_depth: float = 1.0, + max_object_size: float = 0.15, + ): + """ + Initialize the real-time 3D detection processor. 
+ + Args: + camera_intrinsics: [fx, fy, cx, cy] camera parameters + min_confidence: Minimum detection confidence threshold + min_points: Minimum 3D points required for valid detection + max_depth: Maximum valid depth in meters + """ + self.camera_intrinsics = camera_intrinsics + self.min_points = min_points + self.max_depth = max_depth + self.max_object_size = max_object_size + + # Initialize Sam segmenter with tracking enabled but analysis disabled + self.detector = Sam2DSegmenter( + use_tracker=False, + use_analyzer=False, + use_filtering=True, + ) + + self.min_confidence = min_confidence + + logger.info( + f"Initialized Detection3DProcessor with Sam segmenter, confidence={min_confidence}, " + f"min_points={min_points}, max_depth={max_depth}m, max_object_size={max_object_size}m" + ) + + def process_frame( + self, rgb_image: np.ndarray, depth_image: np.ndarray, transform: Optional[np.ndarray] = None + ) -> Tuple[Detection3DArray, Detection2DArray]: + """ + Process a single RGB-D frame to extract 3D object detections. + + Args: + rgb_image: RGB image (H, W, 3) + depth_image: Depth image (H, W) in meters + transform: Optional 4x4 transformation matrix to transform objects from camera frame to desired frame + + Returns: + Tuple of (Detection3DArray, Detection2DArray) with 3D and 2D information + """ + + # Convert RGB to BGR for Sam (OpenCV format) + bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + # Run Sam segmentation with tracking + masks, bboxes, track_ids, probs, names = self.detector.process_image(bgr_image) + + if not masks or len(masks) == 0: + return Detection3DArray( + detections_length=0, header=Header(), detections=[] + ), Detection2DArray(detections_length=0, header=Header(), detections=[]) + + # Convert CUDA tensors to numpy arrays if needed + numpy_masks = [] + for mask in masks: + if hasattr(mask, "cpu"): # PyTorch tensor + numpy_masks.append(mask.cpu().numpy()) + else: # Already numpy array + numpy_masks.append(mask) + + # Extract 3D centroids from masks + poses = extract_centroids_from_masks( + rgb_image=rgb_image, + depth_image=depth_image, + masks=numpy_masks, + camera_intrinsics=self.camera_intrinsics, + ) + + detections_3d = [] + detections_2d = [] + pose_dict = {p["mask_idx"]: p for p in poses if p["centroid"][2] < self.max_depth} + + for i, (bbox, name, prob, track_id) in enumerate(zip(bboxes, names, probs, track_ids)): + if i not in pose_dict: + continue + + pose = pose_dict[i] + obj_cam_pos = pose["centroid"] + + if obj_cam_pos[2] > self.max_depth: + continue + + # Calculate object size from bbox and depth + width_m, height_m = calculate_object_size_from_bbox( + bbox, obj_cam_pos[2], self.camera_intrinsics + ) + + # Calculate depth dimension using segmentation mask + depth_m = estimate_object_depth( + depth_image, numpy_masks[i] if i < len(numpy_masks) else None, bbox + ) + + size_x = max(width_m, 0.01) # Minimum 1cm width + size_y = max(height_m, 0.01) # Minimum 1cm height + size_z = max(depth_m, 0.01) # Minimum 1cm depth + + if min(size_x, size_y, size_z) > self.max_object_size: + continue + + # Transform to desired frame if transform matrix is provided + if transform is not None: + # Get orientation as euler angles, default to no rotation if not available + obj_cam_orientation = pose.get( + "rotation", np.array([0.0, 0.0, 0.0]) + ) # Default to no rotation + transformed_pose = transform_pose( + obj_cam_pos, obj_cam_orientation, transform, to_robot=True + ) + center_pose = transformed_pose + else: + # If no transform, use camera coordinates + center_pose 
= Pose( + Point(obj_cam_pos[0], obj_cam_pos[1], obj_cam_pos[2]), + Quaternion(0.0, 0.0, 0.0, 1.0), # Default orientation + ) + + # Create Detection3D object + detection = Detection3D( + results_length=1, + header=Header(), # Empty header + results=[ + ObjectHypothesisWithPose( + hypothesis=ObjectHypothesis(class_id=name, score=float(prob)) + ) + ], + bbox=BoundingBox3D(center=center_pose, size=Vector3(size_x, size_y, size_z)), + id=str(track_id), + ) + + detections_3d.append(detection) + + # Create corresponding Detection2D + x1, y1, x2, y2 = bbox + center_x = (x1 + x2) / 2.0 + center_y = (y1 + y2) / 2.0 + width = x2 - x1 + height = y2 - y1 + + detection_2d = Detection2D( + results_length=1, + header=Header(), + results=[ + ObjectHypothesisWithPose( + hypothesis=ObjectHypothesis(class_id=name, score=float(prob)) + ) + ], + bbox=BoundingBox2D( + center=Pose2D(position=Point2D(center_x, center_y), theta=0.0), + size_x=float(width), + size_y=float(height), + ), + id=str(track_id), + ) + detections_2d.append(detection_2d) + + # Create and return both arrays + return ( + Detection3DArray( + detections_length=len(detections_3d), header=Header(), detections=detections_3d + ), + Detection2DArray( + detections_length=len(detections_2d), header=Header(), detections=detections_2d + ), + ) + + def visualize_detections( + self, + rgb_image: np.ndarray, + detections_3d: List[Detection3D], + detections_2d: List[Detection2D], + show_coordinates: bool = True, + ) -> np.ndarray: + """ + Visualize detections with 3D position overlay next to bounding boxes. + + Args: + rgb_image: Original RGB image + detections_3d: List of Detection3D objects + detections_2d: List of Detection2D objects (must be 1:1 correspondence) + show_coordinates: Whether to show 3D coordinates + + Returns: + Visualization image + """ + # Extract 2D bboxes from Detection2D objects + + bboxes_2d = [] + for det_2d in detections_2d: + if det_2d.bbox: + x1, y1, x2, y2 = bbox2d_to_corners(det_2d.bbox) + bboxes_2d.append([x1, y1, x2, y2]) + + return visualize_detections_3d(rgb_image, detections_3d, show_coordinates, bboxes_2d) + + def get_closest_detection( + self, detections: List[Detection3D], class_filter: Optional[str] = None + ) -> Optional[Detection3D]: + """ + Get the closest detection with valid 3D data. + + Args: + detections: List of Detection3D objects + class_filter: Optional class name to filter by + + Returns: + Closest Detection3D or None + """ + valid_detections = [] + for d in detections: + # Check if has valid bbox center position + if d.bbox and d.bbox.center and d.bbox.center.position: + # Check class filter if specified + if class_filter is None or ( + d.results_length > 0 and d.results[0].hypothesis.class_id == class_filter + ): + valid_detections.append(d) + + if not valid_detections: + return None + + # Sort by depth (Z coordinate) + def get_z_coord(d): + return abs(d.bbox.center.position.z) + + return min(valid_detections, key=get_z_coord) + + def cleanup(self): + """Clean up resources.""" + if hasattr(self.detector, "cleanup"): + self.detector.cleanup() + logger.info("Detection3DProcessor cleaned up") diff --git a/dimos/manipulation/visual_servoing/manipulation_module.py b/dimos/manipulation/visual_servoing/manipulation_module.py new file mode 100644 index 0000000000..b2724fb59a --- /dev/null +++ b/dimos/manipulation/visual_servoing/manipulation_module.py @@ -0,0 +1,961 @@ +# Copyright 2025 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Manipulation module for robotic grasping with visual servoing. +Handles grasping logic, state machine, and hardware coordination as a Dimos module. +""" + +import cv2 +import time +import threading +from typing import Optional, Tuple, Any, Dict +from enum import Enum +from collections import deque + +import numpy as np + +from dimos.core import Module, In, Out, rpc +from dimos_lcm.sensor_msgs import Image, CameraInfo +from dimos_lcm.geometry_msgs import Vector3, Pose, Point, Quaternion +from dimos_lcm.vision_msgs import Detection3DArray, Detection2DArray + +from dimos.hardware.piper_arm import PiperArm +from dimos.manipulation.visual_servoing.detection3d import Detection3DProcessor +from dimos.manipulation.visual_servoing.pbvs import PBVS +from dimos.perception.common.utils import find_clicked_detection +from dimos.manipulation.visual_servoing.utils import ( + create_manipulation_visualization, + select_points_from_depth, + transform_points_3d, + update_target_grasp_pose, + apply_grasp_distance, + is_target_reached, +) +from dimos.utils.transform_utils import ( + pose_to_matrix, + matrix_to_pose, + create_transform_from_6dof, + compose_transforms, +) +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("dimos.manipulation.visual_servoing.manipulation_module") + + +class GraspStage(Enum): + """Enum for different grasp stages.""" + + IDLE = "idle" + PRE_GRASP = "pre_grasp" + GRASP = "grasp" + CLOSE_AND_RETRACT = "close_and_retract" + PLACE = "place" + RETRACT = "retract" + + +class Feedback: + """Feedback data containing state information about the manipulation process.""" + + def __init__( + self, + grasp_stage: GraspStage, + target_tracked: bool, + current_executed_pose: Optional[Pose] = None, + current_ee_pose: Optional[Pose] = None, + current_camera_pose: Optional[Pose] = None, + target_pose: Optional[Pose] = None, + waiting_for_reach: bool = False, + success: Optional[bool] = None, + ): + self.grasp_stage = grasp_stage + self.target_tracked = target_tracked + self.current_executed_pose = current_executed_pose + self.current_ee_pose = current_ee_pose + self.current_camera_pose = current_camera_pose + self.target_pose = target_pose + self.waiting_for_reach = waiting_for_reach + self.success = success + + +class ManipulationModule(Module): + """ + Manipulation module for visual servoing and grasping. + + Subscribes to: + - ZED RGB images + - ZED depth images + - ZED camera info + + Publishes: + - Visualization images + + RPC methods: + - handle_keyboard_command: Process keyboard input + - pick_and_place: Execute pick and place task + """ + + # LCM inputs + rgb_image: In[Image] = None + depth_image: In[Image] = None + camera_info: In[CameraInfo] = None + + # LCM outputs + viz_image: Out[Image] = None + + def __init__( + self, + ee_to_camera_6dof: Optional[list] = None, + **kwargs, + ): + """ + Initialize manipulation module. 
+ + Args: + ee_to_camera_6dof: EE to camera transform [x, y, z, rx, ry, rz] in meters and radians + workspace_min_radius: Minimum workspace radius in meters + workspace_max_radius: Maximum workspace radius in meters + min_grasp_pitch_degrees: Minimum grasp pitch angle (at max radius) + max_grasp_pitch_degrees: Maximum grasp pitch angle (at min radius) + """ + super().__init__(**kwargs) + + self.arm = PiperArm() + + if ee_to_camera_6dof is None: + ee_to_camera_6dof = [-0.065, 0.03, -0.095, 0.0, -1.57, 0.0] + pos = Vector3(ee_to_camera_6dof[0], ee_to_camera_6dof[1], ee_to_camera_6dof[2]) + rot = Vector3(ee_to_camera_6dof[3], ee_to_camera_6dof[4], ee_to_camera_6dof[5]) + self.T_ee_to_camera = create_transform_from_6dof(pos, rot) + + self.camera_intrinsics = None + self.detector = None + self.pbvs = None + + # Control state + self.last_valid_target = None + self.waiting_for_reach = False + self.current_executed_pose = None # Track the actual pose sent to arm + self.target_updated = False + self.waiting_start_time = None + self.reach_pose_timeout = 20.0 + + # Grasp parameters + self.grasp_width_offset = 0.03 + self.pregrasp_distance = 0.25 + self.grasp_distance_range = 0.03 + self.grasp_close_delay = 2.0 + self.grasp_reached_time = None + self.gripper_max_opening = 0.07 + + # Workspace limits and dynamic pitch parameters + self.workspace_min_radius = 0.2 + self.workspace_max_radius = 0.75 + self.min_grasp_pitch_degrees = 5.0 + self.max_grasp_pitch_degrees = 60.0 + + # Grasp stage tracking + self.grasp_stage = GraspStage.IDLE + + # Pose stabilization tracking + self.pose_history_size = 4 + self.pose_stabilization_threshold = 0.01 + self.stabilization_timeout = 25.0 + self.stabilization_start_time = None + self.reached_poses = deque(maxlen=self.pose_history_size) + self.adjustment_count = 0 + + # Pose reachability tracking + self.ee_pose_history = deque(maxlen=20) # Keep history of EE poses + self.stuck_pose_threshold = 0.001 # 1mm movement threshold + self.stuck_pose_adjustment_degrees = 5.0 + self.stuck_count = 0 + self.max_stuck_reattempts = 7 + + # State for visualization + self.current_visualization = None + self.last_detection_3d_array = None + self.last_detection_2d_array = None + + # Grasp result and task tracking + self.pick_success = None + self.final_pregrasp_pose = None + self.task_failed = False + self.overall_success = None + + # Task control + self.task_running = False + self.task_thread = None + self.stop_event = threading.Event() + + # Latest sensor data + self.latest_rgb = None + self.latest_depth = None + self.latest_camera_info = None + + # Target selection + self.target_click = None + + # Place target position and object info + self.home_pose = Pose(Point(0.0, 0.0, 0.0), Quaternion(0.0, 0.0, 0.0, 1.0)) + self.place_target_position = None + self.target_object_height = None + self.retract_distance = 0.12 + self.place_pose = None + self.retract_pose = None + self.arm.gotoObserve() + + @rpc + def start(self): + """Start the manipulation module.""" + # Subscribe to camera data + self.rgb_image.subscribe(self._on_rgb_image) + self.depth_image.subscribe(self._on_depth_image) + self.camera_info.subscribe(self._on_camera_info) + + logger.info("Manipulation module started") + + @rpc + def stop(self): + """Stop the manipulation module.""" + # Stop any running task + self.stop_event.set() + if self.task_thread and self.task_thread.is_alive(): + self.task_thread.join(timeout=5.0) + + self.reset_to_idle() + logger.info("Manipulation module stopped") + + def _on_rgb_image(self, msg: Image): 
+ """Handle RGB image messages.""" + try: + data = np.frombuffer(msg.data, dtype=np.uint8) + if msg.encoding == "rgb8": + self.latest_rgb = data.reshape((msg.height, msg.width, 3)) + else: + logger.warning(f"Unsupported RGB encoding: {msg.encoding}") + except Exception as e: + logger.error(f"Error processing RGB image: {e}") + + def _on_depth_image(self, msg: Image): + """Handle depth image messages.""" + try: + if msg.encoding == "32FC1": + data = np.frombuffer(msg.data, dtype=np.float32) + self.latest_depth = data.reshape((msg.height, msg.width)) + else: + logger.warning(f"Unsupported depth encoding: {msg.encoding}") + except Exception as e: + logger.error(f"Error processing depth image: {e}") + + def _on_camera_info(self, msg: CameraInfo): + """Handle camera info messages.""" + try: + self.camera_intrinsics = [msg.K[0], msg.K[4], msg.K[2], msg.K[5]] + + if self.detector is None: + self.detector = Detection3DProcessor(self.camera_intrinsics) + self.pbvs = PBVS() + logger.info("Initialized detection and PBVS processors") + + self.latest_camera_info = msg + except Exception as e: + logger.error(f"Error processing camera info: {e}") + + @rpc + def get_single_rgb_frame(self) -> Optional[np.ndarray]: + """ + get the latest rgb frame from the camera + """ + return self.latest_rgb + + @rpc + def handle_keyboard_command(self, key: str) -> str: + """ + Handle keyboard commands for robot control. + + Args: + key: Keyboard key as string + + Returns: + Action taken as string, or empty string if no action + """ + key_code = ord(key) if len(key) == 1 else int(key) + + if key_code == ord("r"): + self.stop_event.set() + self.task_running = False + self.reset_to_idle() + return "reset" + elif key_code == ord("s"): + logger.info("SOFT STOP - Emergency stopping robot!") + self.arm.softStop() + self.stop_event.set() + self.task_running = False + return "stop" + elif key_code == ord(" ") and self.pbvs and self.pbvs.target_grasp_pose: + if self.grasp_stage == GraspStage.PRE_GRASP: + self.set_grasp_stage(GraspStage.GRASP) + logger.info("Executing target pose") + return "execute" + elif key_code == ord("g"): + logger.info("Opening gripper") + self.arm.release_gripper() + return "release" + + return "" + + @rpc + def pick_and_place( + self, target_x: int = None, target_y: int = None, place_x: int = None, place_y: int = None + ) -> Dict[str, Any]: + """ + Start a pick and place task. 
+ + Args: + target_x: Optional X coordinate of target object + target_y: Optional Y coordinate of target object + place_x: Optional X coordinate of place location + place_y: Optional Y coordinate of place location + + Returns: + Dict with status and message + """ + if self.task_running: + return {"status": "error", "message": "Task already running"} + + if self.camera_intrinsics is None: + return {"status": "error", "message": "Camera not initialized"} + + if target_x is not None and target_y is not None: + self.target_click = (target_x, target_y) + if place_x is not None and self.latest_depth is not None: + points_3d_camera = select_points_from_depth( + self.latest_depth, + (place_x, place_y), + self.camera_intrinsics, + radius=10, + ) + + if points_3d_camera.size > 0: + ee_pose = self.arm.get_ee_pose() + ee_transform = pose_to_matrix(ee_pose) + camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera) + + points_3d_world = transform_points_3d( + points_3d_camera, + camera_transform, + to_robot=True, + ) + + place_position = np.mean(points_3d_world, axis=0) + self.place_target_position = place_position + logger.info( + f"Place target set at position: ({place_position[0]:.3f}, {place_position[1]:.3f}, {place_position[2]:.3f})" + ) + else: + logger.warning("No valid depth points found at place location") + self.place_target_position = None + else: + self.place_target_position = None + + self.task_failed = False + self.stop_event.clear() + + if self.task_thread and self.task_thread.is_alive(): + self.stop_event.set() + self.task_thread.join(timeout=1.0) + self.task_thread = threading.Thread(target=self._run_pick_and_place, daemon=True) + self.task_thread.start() + + return {"status": "started", "message": "Pick and place task started"} + + def _run_pick_and_place(self): + """Run the pick and place task loop.""" + self.task_running = True + logger.info("Starting pick and place task") + + try: + while not self.stop_event.is_set(): + if self.task_failed: + logger.error("Task failed, terminating pick and place") + self.stop_event.set() + break + + feedback = self.update() + if feedback is None: + time.sleep(0.01) + continue + + if feedback.success is not None: + if feedback.success: + logger.info("Pick and place completed successfully!") + else: + logger.warning("Pick and place failed") + self.reset_to_idle() + self.stop_event.set() + break + + time.sleep(0.01) + + except Exception as e: + logger.error(f"Error in pick and place task: {e}") + self.task_failed = True + finally: + self.task_running = False + logger.info("Pick and place task ended") + + def set_grasp_stage(self, stage: GraspStage): + """Set the grasp stage.""" + self.grasp_stage = stage + logger.info(f"Grasp stage: {stage.value}") + + def calculate_dynamic_grasp_pitch(self, target_pose: Pose) -> float: + """ + Calculate grasp pitch dynamically based on distance from robot base. + Maps workspace radius to grasp pitch angle. 
+
+        Args:
+            target_pose: Target pose
+
+        Returns:
+            Grasp pitch angle in degrees
+        """
+        # Calculate 3D distance from robot base (assumes robot at origin)
+        position = target_pose.position
+        distance = np.sqrt(position.x**2 + position.y**2 + position.z**2)
+
+        # Clamp distance to workspace limits
+        distance = np.clip(distance, self.workspace_min_radius, self.workspace_max_radius)
+
+        # Linear interpolation: min_radius -> max_pitch, max_radius -> min_pitch
+        # Normalized distance (0 to 1)
+        normalized_dist = (distance - self.workspace_min_radius) / (
+            self.workspace_max_radius - self.workspace_min_radius
+        )
+
+        # Inverse mapping: closer objects need higher pitch
+        pitch_degrees = self.max_grasp_pitch_degrees - (
+            normalized_dist * (self.max_grasp_pitch_degrees - self.min_grasp_pitch_degrees)
+        )
+
+        return pitch_degrees
+
+    def check_within_workspace(self, target_pose: Pose) -> bool:
+        """
+        Check if pose is within workspace limits and log error if not.
+
+        Args:
+            target_pose: Target pose to validate
+
+        Returns:
+            True if within workspace, False otherwise
+        """
+        # Calculate 3D distance from robot base
+        position = target_pose.position
+        distance = np.sqrt(position.x**2 + position.y**2 + position.z**2)
+
+        if not (self.workspace_min_radius <= distance <= self.workspace_max_radius):
+            logger.error(
+                f"Target outside workspace limits: distance {distance:.3f}m not in [{self.workspace_min_radius:.2f}, {self.workspace_max_radius:.2f}]"
+            )
+            return False
+
+        return True
+
+    def _check_reach_timeout(self) -> Tuple[bool, float]:
+        """Check if robot has exceeded timeout while reaching pose.
+
+        Returns:
+            Tuple of (timed_out, time_elapsed)
+        """
+        if self.waiting_start_time:
+            time_elapsed = time.time() - self.waiting_start_time
+            if time_elapsed > self.reach_pose_timeout:
+                logger.warning(
+                    f"Robot failed to reach pose within {self.reach_pose_timeout}s timeout"
+                )
+                self.task_failed = True
+                self.reset_to_idle()
+                return True, time_elapsed
+            return False, time_elapsed
+        return False, 0.0
+
+    def _check_if_stuck(self) -> bool:
+        """
+        Check if robot is stuck by analyzing pose history.
+
+        Returns:
+            True if the pose history is full and end-effector motion is below the stuck threshold, False otherwise
+        """
+        if len(self.ee_pose_history) < self.ee_pose_history.maxlen:
+            return False
+
+        # Extract positions from pose history
+        positions = np.array(
+            [[p.position.x, p.position.y, p.position.z] for p in self.ee_pose_history]
+        )
+
+        # Calculate standard deviation of positions
+        std_devs = np.std(positions, axis=0)
+        # Check if all standard deviations are below stuck threshold
+        is_stuck = np.all(std_devs < self.stuck_pose_threshold)
+
+        return is_stuck
+
+    def check_reach_and_adjust(self) -> bool:
+        """
+        Check if robot has reached the current executed pose while waiting.
+        Handles timeout internally by failing the task.
+        Also detects if the robot is stuck (not moving towards target).
+ + Returns: + True if reached, False if still waiting or not in waiting state + """ + if not self.waiting_for_reach or not self.current_executed_pose: + return False + + # Get current end-effector pose + ee_pose = self.arm.get_ee_pose() + target_pose = self.current_executed_pose + + # Check for timeout - this will fail task and reset if timeout occurred + timed_out, time_elapsed = self._check_reach_timeout() + if timed_out: + return False + + self.ee_pose_history.append(ee_pose) + + # Check if robot is stuck + is_stuck = self._check_if_stuck() + if is_stuck: + if self.grasp_stage == GraspStage.RETRACT or self.grasp_stage == GraspStage.PLACE: + self.waiting_for_reach = False + self.waiting_start_time = None + self.stuck_count = 0 + self.ee_pose_history.clear() + return True + self.stuck_count += 1 + pitch_degrees = self.calculate_dynamic_grasp_pitch(target_pose) + if self.stuck_count % 2 == 0: + pitch_degrees += self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2) + else: + pitch_degrees -= self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2) + + pitch_degrees = max( + self.min_grasp_pitch_degrees, min(self.max_grasp_pitch_degrees, pitch_degrees) + ) + updated_target_pose = update_target_grasp_pose(target_pose, ee_pose, 0.0, pitch_degrees) + self.arm.cmd_ee_pose(updated_target_pose) + self.current_executed_pose = updated_target_pose + self.ee_pose_history.clear() + self.waiting_for_reach = True + self.waiting_start_time = time.time() + return False + + if self.stuck_count >= self.max_stuck_reattempts: + self.task_failed = True + self.reset_to_idle() + return False + + if is_target_reached(target_pose, ee_pose, self.pbvs.target_tolerance): + self.waiting_for_reach = False + self.waiting_start_time = None + self.stuck_count = 0 + self.ee_pose_history.clear() + return True + return False + + def _update_tracking(self, detection_3d_array: Optional[Detection3DArray]) -> bool: + """Update tracking with new detections.""" + if not detection_3d_array or not self.pbvs: + return False + + target_tracked = self.pbvs.update_tracking(detection_3d_array) + if target_tracked: + self.target_updated = True + self.last_valid_target = self.pbvs.get_current_target() + return target_tracked + + def reset_to_idle(self): + """Reset the manipulation system to IDLE state.""" + if self.pbvs: + self.pbvs.clear_target() + self.grasp_stage = GraspStage.IDLE + self.reached_poses.clear() + self.ee_pose_history.clear() + self.adjustment_count = 0 + self.waiting_for_reach = False + self.current_executed_pose = None + self.target_updated = False + self.stabilization_start_time = None + self.grasp_reached_time = None + self.waiting_start_time = None + self.pick_success = None + self.final_pregrasp_pose = None + self.overall_success = None + self.place_pose = None + self.retract_pose = None + self.stuck_count = 0 + + self.arm.gotoObserve() + + def execute_idle(self): + """Execute idle stage.""" + pass + + def execute_pre_grasp(self): + """Execute pre-grasp stage: visual servoing to pre-grasp position.""" + if self.waiting_for_reach: + if self.check_reach_and_adjust(): + self.reached_poses.append(self.current_executed_pose) + self.target_updated = False + time.sleep(0.2) + return + if ( + self.stabilization_start_time + and (time.time() - self.stabilization_start_time) > self.stabilization_timeout + ): + logger.warning( + f"Failed to get stable grasp after {self.stabilization_timeout} seconds, resetting" + ) + self.task_failed = True + self.reset_to_idle() + return + + ee_pose = self.arm.get_ee_pose() 
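+        # Pitch is derived from the target's distance to the base (closer targets get a more
+        # top-down approach); PBVS then combines that pitch with the pregrasp standoff to
+        # produce the end-effector pose commanded in the next servoing step.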
+ dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.pbvs.current_target.bbox.center) + + _, _, _, has_target, target_pose = self.pbvs.compute_control( + ee_pose, self.pregrasp_distance, dynamic_pitch + ) + if target_pose and has_target: + # Validate target pose is within workspace + if not self.check_within_workspace(target_pose): + self.task_failed = True + self.reset_to_idle() + return + + if self.check_target_stabilized(): + logger.info("Target stabilized, transitioning to GRASP") + self.final_pregrasp_pose = self.current_executed_pose + self.grasp_stage = GraspStage.GRASP + self.adjustment_count = 0 + self.waiting_for_reach = False + elif not self.waiting_for_reach and self.target_updated: + self.arm.cmd_ee_pose(target_pose) + self.current_executed_pose = target_pose + self.waiting_for_reach = True + self.waiting_start_time = time.time() + self.target_updated = False + self.adjustment_count += 1 + time.sleep(0.2) + + def execute_grasp(self): + """Execute grasp stage: move to final grasp position.""" + if self.waiting_for_reach: + if self.check_reach_and_adjust() and not self.grasp_reached_time: + self.grasp_reached_time = time.time() + return + + if self.grasp_reached_time: + if (time.time() - self.grasp_reached_time) >= self.grasp_close_delay: + logger.info("Grasp delay completed, closing gripper") + self.grasp_stage = GraspStage.CLOSE_AND_RETRACT + return + + if self.last_valid_target: + # Calculate dynamic pitch for current target + dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.last_valid_target.bbox.center) + normalized_pitch = dynamic_pitch / 90.0 + grasp_distance = -self.grasp_distance_range + ( + 2 * self.grasp_distance_range * normalized_pitch + ) + + ee_pose = self.arm.get_ee_pose() + _, _, _, has_target, target_pose = self.pbvs.compute_control( + ee_pose, grasp_distance, dynamic_pitch + ) + + if target_pose and has_target: + # Validate grasp pose is within workspace + if not self.check_within_workspace(target_pose): + self.task_failed = True + self.reset_to_idle() + return + + object_width = self.last_valid_target.bbox.size.x + gripper_opening = max( + 0.005, min(object_width + self.grasp_width_offset, self.gripper_max_opening) + ) + + logger.info(f"Executing grasp: gripper={gripper_opening * 1000:.1f}mm") + self.arm.cmd_gripper_ctrl(gripper_opening) + self.arm.cmd_ee_pose(target_pose, line_mode=True) + self.current_executed_pose = target_pose + self.waiting_for_reach = True + self.waiting_start_time = time.time() + + def execute_close_and_retract(self): + """Execute the retraction sequence after gripper has been closed.""" + if self.waiting_for_reach and self.final_pregrasp_pose: + if self.check_reach_and_adjust(): + logger.info("Reached pre-grasp retraction position") + self.pick_success = self.arm.gripper_object_detected() + if self.pick_success: + logger.info("Object successfully grasped!") + if self.place_target_position is not None: + logger.info("Transitioning to PLACE stage") + self.grasp_stage = GraspStage.PLACE + else: + self.overall_success = True + else: + logger.warning("No object detected in gripper") + self.task_failed = True + self.overall_success = False + return + if not self.waiting_for_reach: + logger.info("Retracting to pre-grasp position") + self.arm.cmd_ee_pose(self.final_pregrasp_pose, line_mode=True) + self.current_executed_pose = self.final_pregrasp_pose + self.arm.close_gripper() + self.waiting_for_reach = True + self.waiting_start_time = time.time() + + def execute_place(self): + """Execute place stage: move to place position and 
release object.""" + if self.waiting_for_reach: + # Use the already executed pose instead of recalculating + if self.check_reach_and_adjust(): + logger.info("Reached place position, releasing gripper") + self.arm.release_gripper() + time.sleep(1.0) + self.place_pose = self.current_executed_pose + logger.info("Transitioning to RETRACT stage") + self.grasp_stage = GraspStage.RETRACT + return + + if not self.waiting_for_reach: + place_pose = self.get_place_target_pose() + if place_pose: + logger.info("Moving to place position") + self.arm.cmd_ee_pose(place_pose, line_mode=True) + self.current_executed_pose = place_pose + self.waiting_for_reach = True + self.waiting_start_time = time.time() + else: + logger.error("Failed to get place target pose") + self.task_failed = True + self.overall_success = False + + def execute_retract(self): + """Execute retract stage: retract from place position.""" + if self.waiting_for_reach and self.retract_pose: + if self.check_reach_and_adjust(): + logger.info("Reached retract position") + logger.info("Returning to observe position") + self.arm.gotoObserve() + self.arm.close_gripper() + self.overall_success = True + logger.info("Pick and place completed successfully!") + return + + if not self.waiting_for_reach: + if self.place_pose: + pose_pitch = self.calculate_dynamic_grasp_pitch(self.place_pose) + self.retract_pose = update_target_grasp_pose( + self.place_pose, self.home_pose, self.retract_distance, pose_pitch + ) + logger.info("Retracting from place position") + self.arm.cmd_ee_pose(self.retract_pose, line_mode=True) + self.current_executed_pose = self.retract_pose + self.waiting_for_reach = True + self.waiting_start_time = time.time() + else: + logger.error("No place pose stored for retraction") + self.task_failed = True + self.overall_success = False + + def capture_and_process( + self, + ) -> Tuple[ + Optional[np.ndarray], Optional[Detection3DArray], Optional[Detection2DArray], Optional[Pose] + ]: + """Capture frame from camera data and process detections.""" + if self.latest_rgb is None or self.latest_depth is None or self.detector is None: + return None, None, None, None + + ee_pose = self.arm.get_ee_pose() + ee_transform = pose_to_matrix(ee_pose) + camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera) + camera_pose = matrix_to_pose(camera_transform) + detection_3d_array, detection_2d_array = self.detector.process_frame( + self.latest_rgb, self.latest_depth, camera_transform + ) + + return self.latest_rgb, detection_3d_array, detection_2d_array, camera_pose + + def pick_target(self, x: int, y: int) -> bool: + """Select a target object at the given pixel coordinates.""" + if not self.last_detection_2d_array or not self.last_detection_3d_array: + logger.warning("No detections available for target selection") + return False + + clicked_3d = find_clicked_detection( + (x, y), self.last_detection_2d_array.detections, self.last_detection_3d_array.detections + ) + if clicked_3d and self.pbvs: + # Validate workspace + if not self.check_within_workspace(clicked_3d.bbox.center): + self.task_failed = True + return False + + self.pbvs.set_target(clicked_3d) + + if clicked_3d.bbox and clicked_3d.bbox.size: + self.target_object_height = clicked_3d.bbox.size.z + logger.info(f"Target object height: {self.target_object_height:.3f}m") + + position = clicked_3d.bbox.center.position + logger.info( + f"Target selected: ID={clicked_3d.id}, pos=({position.x:.3f}, {position.y:.3f}, {position.z:.3f})" + ) + self.grasp_stage = GraspStage.PRE_GRASP + 
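+        # Clear state left over from any previous attempt so the PRE_GRASP loop starts fresh.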
self.reached_poses.clear() + self.adjustment_count = 0 + self.waiting_for_reach = False + self.current_executed_pose = None + self.stabilization_start_time = time.time() + return True + return False + + def update(self) -> Optional[Dict[str, Any]]: + """Main update function that handles capture, processing, control, and visualization.""" + rgb, detection_3d_array, detection_2d_array, camera_pose = self.capture_and_process() + if rgb is None: + return None + + self.last_detection_3d_array = detection_3d_array + self.last_detection_2d_array = detection_2d_array + if self.target_click: + x, y = self.target_click + if self.pick_target(x, y): + self.target_click = None + + if ( + detection_3d_array + and self.grasp_stage in [GraspStage.PRE_GRASP, GraspStage.GRASP] + and not self.waiting_for_reach + ): + self._update_tracking(detection_3d_array) + stage_handlers = { + GraspStage.IDLE: self.execute_idle, + GraspStage.PRE_GRASP: self.execute_pre_grasp, + GraspStage.GRASP: self.execute_grasp, + GraspStage.CLOSE_AND_RETRACT: self.execute_close_and_retract, + GraspStage.PLACE: self.execute_place, + GraspStage.RETRACT: self.execute_retract, + } + if self.grasp_stage in stage_handlers: + stage_handlers[self.grasp_stage]() + + target_tracked = self.pbvs.get_current_target() is not None if self.pbvs else False + ee_pose = self.arm.get_ee_pose() + feedback = Feedback( + grasp_stage=self.grasp_stage, + target_tracked=target_tracked, + current_executed_pose=self.current_executed_pose, + current_ee_pose=ee_pose, + current_camera_pose=camera_pose, + target_pose=self.pbvs.target_grasp_pose if self.pbvs else None, + waiting_for_reach=self.waiting_for_reach, + success=self.overall_success, + ) + + if self.task_running: + self.current_visualization = create_manipulation_visualization( + rgb, feedback, detection_3d_array, detection_2d_array + ) + + if self.current_visualization is not None: + self._publish_visualization(self.current_visualization) + + return feedback + + def _publish_visualization(self, viz_image: np.ndarray): + """Publish visualization image to LCM.""" + try: + viz_rgb = cv2.cvtColor(viz_image, cv2.COLOR_BGR2RGB) + height, width = viz_rgb.shape[:2] + data = viz_rgb.tobytes() + + msg = Image( + data_length=len(data), + height=height, + width=width, + encoding="rgb8", + is_bigendian=0, + step=width * 3, + data=data, + ) + + self.viz_image.publish(msg) + except Exception as e: + logger.error(f"Error publishing visualization: {e}") + + def check_target_stabilized(self) -> bool: + """Check if the commanded poses have stabilized.""" + if len(self.reached_poses) < self.reached_poses.maxlen: + return False + + positions = np.array( + [[p.position.x, p.position.y, p.position.z] for p in self.reached_poses] + ) + std_devs = np.std(positions, axis=0) + return np.all(std_devs < self.pose_stabilization_threshold) + + def get_place_target_pose(self) -> Optional[Pose]: + """Get the place target pose with z-offset applied based on object height.""" + if self.place_target_position is None: + return None + + place_pos = self.place_target_position.copy() + if self.target_object_height is not None: + z_offset = self.target_object_height / 2.0 + place_pos[2] += z_offset + 0.1 + + place_center_pose = Pose( + Point(place_pos[0], place_pos[1], place_pos[2]), Quaternion(0.0, 0.0, 0.0, 1.0) + ) + + ee_pose = self.arm.get_ee_pose() + + # Calculate dynamic pitch for place position + dynamic_pitch = self.calculate_dynamic_grasp_pitch(place_center_pose) + + place_pose = update_target_grasp_pose( + place_center_pose, + 
ee_pose, + grasp_distance=0.0, + grasp_pitch_degrees=dynamic_pitch, + ) + + return place_pose + + @rpc + def cleanup(self): + """Clean up resources on module destruction.""" + if self.detector and hasattr(self.detector, "cleanup"): + self.detector.cleanup() + self.arm.disable() diff --git a/dimos/manipulation/visual_servoing/pbvs.py b/dimos/manipulation/visual_servoing/pbvs.py new file mode 100644 index 0000000000..e34ec94557 --- /dev/null +++ b/dimos/manipulation/visual_servoing/pbvs.py @@ -0,0 +1,490 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Position-Based Visual Servoing (PBVS) system for robotic manipulation. +Supports both eye-in-hand and eye-to-hand configurations. +""" + +import numpy as np +from typing import Optional, Tuple, List +from collections import deque +from scipy.spatial.transform import Rotation as R +from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point +from dimos_lcm.vision_msgs import Detection3D, Detection3DArray +from dimos.utils.logging_config import setup_logger +from dimos.manipulation.visual_servoing.utils import ( + update_target_grasp_pose, + find_best_object_match, + create_pbvs_visualization, + is_target_reached, +) + +logger = setup_logger("dimos.manipulation.pbvs") + + +class PBVS: + """ + High-level Position-Based Visual Servoing orchestrator. + + Handles: + - Object tracking and target management + - Pregrasp distance computation + - Grasp pose generation + - Coordination with low-level controller + + Note: This class is agnostic to camera mounting (eye-in-hand vs eye-to-hand). + The caller is responsible for providing appropriate camera and EE poses. + """ + + def __init__( + self, + position_gain: float = 0.5, + rotation_gain: float = 0.3, + max_velocity: float = 0.1, # m/s + max_angular_velocity: float = 0.5, # rad/s + target_tolerance: float = 0.01, # 1cm + max_tracking_distance_threshold: float = 0.12, # Max distance for target tracking (m) + min_size_similarity: float = 0.6, # Min size similarity threshold (0.0-1.0) + direct_ee_control: bool = True, # If True, output target poses instead of velocities + ): + """ + Initialize PBVS system. 
+
+        Args:
+            position_gain: Proportional gain for position control
+            rotation_gain: Proportional gain for rotation control
+            max_velocity: Maximum linear velocity command magnitude (m/s)
+            max_angular_velocity: Maximum angular velocity command magnitude (rad/s)
+            target_tolerance: Distance threshold for considering target reached (m)
+            max_tracking_distance_threshold: Maximum distance for valid target tracking (m)
+            min_size_similarity: Minimum size similarity for valid target tracking (0.0-1.0)
+            direct_ee_control: If True, output target poses instead of velocity commands
+        """
+        # Initialize low-level controller only if not in direct control mode
+        if not direct_ee_control:
+            self.controller = PBVSController(
+                position_gain=position_gain,
+                rotation_gain=rotation_gain,
+                max_velocity=max_velocity,
+                max_angular_velocity=max_angular_velocity,
+                target_tolerance=target_tolerance,
+            )
+        else:
+            self.controller = None
+
+        # Store parameters for direct mode error computation
+        self.target_tolerance = target_tolerance
+
+        # Target tracking parameters
+        self.max_tracking_distance_threshold = max_tracking_distance_threshold
+        self.min_size_similarity = min_size_similarity
+        self.direct_ee_control = direct_ee_control
+
+        # Target state
+        self.current_target = None
+        self.target_grasp_pose = None
+
+        # Detection history for robust tracking
+        self.detection_history_size = 3
+        self.detection_history = deque(maxlen=self.detection_history_size)
+
+        # For direct control mode visualization
+        self.last_position_error = None
+        self.last_target_reached = False
+
+        logger.info(
+            f"Initialized PBVS system with controller gains: pos={position_gain}, rot={rotation_gain}, "
+            f"tracking_thresholds: distance={max_tracking_distance_threshold}m, size={min_size_similarity:.2f}"
+        )
+
+    def set_target(self, target_object: Detection3D) -> bool:
+        """
+        Set a new target object for servoing.
+
+        Args:
+            target_object: Detection3D object
+
+        Returns:
+            True if target was set successfully
+        """
+        if target_object and target_object.bbox and target_object.bbox.center:
+            self.current_target = target_object
+            self.target_grasp_pose = None  # Will be computed when needed
+            logger.info(f"New target set: ID {target_object.id}")
+            return True
+        return False
+
+    def clear_target(self):
+        """Clear the current target."""
+        self.current_target = None
+        self.target_grasp_pose = None
+        self.last_position_error = None
+        self.last_target_reached = False
+        self.detection_history.clear()
+        if self.controller:
+            self.controller.clear_state()
+        logger.info("Target cleared")
+
+    def get_current_target(self) -> Optional[Detection3D]:
+        """
+        Get the current target object.
+
+        Returns:
+            Current target Detection3D or None if no target selected
+        """
+        return self.current_target
+
+    def update_tracking(self, new_detections: Optional[Detection3DArray] = None) -> bool:
+        """
+        Update target tracking with new detections using a rolling window.
+        If tracking is lost, keeps the old target pose.
+ + Args: + new_detections: Optional new detections for target tracking + + Returns: + True if target was successfully tracked, False if lost (but target is kept) + """ + # Check if we have a current target + if ( + not self.current_target + or not self.current_target.bbox + or not self.current_target.bbox.center + ): + return False + + # Add new detections to history if provided + if new_detections is not None and new_detections.detections_length > 0: + self.detection_history.append(new_detections) + + # If no detection history, can't track + if not self.detection_history: + logger.debug("No detection history for target tracking - using last known pose") + return False + + # Collect all candidates from detection history + all_candidates = [] + for detection_array in self.detection_history: + all_candidates.extend(detection_array.detections) + + if not all_candidates: + logger.debug("No candidates in detection history") + return False + + # Use stage-dependent distance threshold + max_distance = self.max_tracking_distance_threshold + + # Find best match across all recent detections + match_result = find_best_object_match( + target_obj=self.current_target, + candidates=all_candidates, + max_distance=max_distance, + min_size_similarity=self.min_size_similarity, + ) + + if match_result.is_valid_match: + self.current_target = match_result.matched_object + self.target_grasp_pose = None # Recompute grasp pose + logger.debug( + f"Target tracking successful: distance={match_result.distance:.3f}m, " + f"size_similarity={match_result.size_similarity:.2f}, " + f"confidence={match_result.confidence:.2f}" + ) + return True + + logger.debug( + f"Target tracking lost across {len(self.detection_history)} frames: " + f"distance={match_result.distance:.3f}m, " + f"size_similarity={match_result.size_similarity:.2f}, " + f"thresholds: distance={max_distance:.3f}m, size={self.min_size_similarity:.2f}" + ) + return False + + def compute_control( + self, + ee_pose: Pose, + grasp_distance: float = 0.15, + grasp_pitch_degrees: float = 45.0, + ) -> Tuple[Optional[Vector3], Optional[Vector3], bool, bool, Optional[Pose]]: + """ + Compute PBVS control with position and orientation servoing. 
+ + Args: + ee_pose: Current end-effector pose + grasp_distance: Distance to maintain from target (meters) + + Returns: + Tuple of (velocity_command, angular_velocity_command, target_reached, has_target, target_pose) + - velocity_command: Linear velocity vector or None if no target (None in direct_ee_control mode) + - angular_velocity_command: Angular velocity vector or None if no target (None in direct_ee_control mode) + - target_reached: True if within target tolerance + - has_target: True if currently tracking a target + - target_pose: Target EE pose (only in direct_ee_control mode, otherwise None) + """ + # Check if we have a target + if not self.current_target: + return None, None, False, False, None + + # Update target grasp pose with provided distance and pitch + self.target_grasp_pose = update_target_grasp_pose( + self.current_target.bbox.center, ee_pose, grasp_distance, grasp_pitch_degrees + ) + + if self.target_grasp_pose is None: + logger.warning("Failed to compute grasp pose") + return None, None, False, False, None + + # Compute errors for visualization before checking if reached (in case pose gets cleared) + if self.direct_ee_control and self.target_grasp_pose: + self.last_position_error = Vector3( + self.target_grasp_pose.position.x - ee_pose.position.x, + self.target_grasp_pose.position.y - ee_pose.position.y, + self.target_grasp_pose.position.z - ee_pose.position.z, + ) + + # Check if target reached using our separate function + target_reached = is_target_reached(self.target_grasp_pose, ee_pose, self.target_tolerance) + + # Return appropriate values based on control mode + if self.direct_ee_control: + # Direct control mode + if self.target_grasp_pose: + self.last_target_reached = target_reached + # Return has_target=True since we have a target + return None, None, target_reached, True, self.target_grasp_pose + else: + return None, None, False, True, None + else: + # Velocity control mode - use controller + velocity_cmd, angular_velocity_cmd, controller_reached = ( + self.controller.compute_control(ee_pose, self.target_grasp_pose) + ) + # Return has_target=True since we have a target, regardless of tracking status + return velocity_cmd, angular_velocity_cmd, target_reached, True, None + + def create_status_overlay( + self, + image: np.ndarray, + grasp_stage=None, + ) -> np.ndarray: + """ + Create PBVS status overlay on image. + + Args: + image: Input image + grasp_stage: Current grasp stage (optional) + + Returns: + Image with PBVS status overlay + """ + stage_value = grasp_stage.value if grasp_stage else "idle" + return create_pbvs_visualization( + image, + self.current_target, + self.last_position_error, + self.last_target_reached, + stage_value, + ) + + +class PBVSController: + """ + Low-level Position-Based Visual Servoing controller. + Pure control logic that computes velocity commands from poses. + + Handles: + - Position and orientation error computation + - Velocity command generation with gain control + - Target reached detection + """ + + def __init__( + self, + position_gain: float = 0.5, + rotation_gain: float = 0.3, + max_velocity: float = 0.1, # m/s + max_angular_velocity: float = 0.5, # rad/s + target_tolerance: float = 0.01, # 1cm + ): + """ + Initialize PBVS controller. 
+ + Args: + position_gain: Proportional gain for position control + rotation_gain: Proportional gain for rotation control + max_velocity: Maximum linear velocity command magnitude (m/s) + max_angular_velocity: Maximum angular velocity command magnitude (rad/s) + target_tolerance: Distance threshold for considering target reached (m) + """ + self.position_gain = position_gain + self.rotation_gain = rotation_gain + self.max_velocity = max_velocity + self.max_angular_velocity = max_angular_velocity + self.target_tolerance = target_tolerance + + self.last_position_error = None + self.last_rotation_error = None + self.last_velocity_cmd = None + self.last_angular_velocity_cmd = None + self.last_target_reached = False + + logger.info( + f"Initialized PBVS controller: pos_gain={position_gain}, rot_gain={rotation_gain}, " + f"max_vel={max_velocity}m/s, max_ang_vel={max_angular_velocity}rad/s, " + f"target_tolerance={target_tolerance}m" + ) + + def clear_state(self): + """Clear controller state.""" + self.last_position_error = None + self.last_rotation_error = None + self.last_velocity_cmd = None + self.last_angular_velocity_cmd = None + self.last_target_reached = False + + def compute_control( + self, ee_pose: Pose, grasp_pose: Pose + ) -> Tuple[Optional[Vector3], Optional[Vector3], bool]: + """ + Compute PBVS control with position and orientation servoing. + + Args: + ee_pose: Current end-effector pose + grasp_pose: Target grasp pose + + Returns: + Tuple of (velocity_command, angular_velocity_command, target_reached) + - velocity_command: Linear velocity vector + - angular_velocity_command: Angular velocity vector + - target_reached: True if within target tolerance + """ + # Calculate position error (target - EE position) + error = Vector3( + grasp_pose.position.x - ee_pose.position.x, + grasp_pose.position.y - ee_pose.position.y, + grasp_pose.position.z - ee_pose.position.z, + ) + self.last_position_error = error + + # Compute velocity command with proportional control + velocity_cmd = Vector3( + error.x * self.position_gain, + error.y * self.position_gain, + error.z * self.position_gain, + ) + + # Limit velocity magnitude + vel_magnitude = np.linalg.norm([velocity_cmd.x, velocity_cmd.y, velocity_cmd.z]) + if vel_magnitude > self.max_velocity: + scale = self.max_velocity / vel_magnitude + velocity_cmd = Vector3( + float(velocity_cmd.x * scale), + float(velocity_cmd.y * scale), + float(velocity_cmd.z * scale), + ) + + self.last_velocity_cmd = velocity_cmd + + # Compute angular velocity for orientation control + angular_velocity_cmd = self._compute_angular_velocity(grasp_pose.orientation, ee_pose) + + # Check if target reached + error_magnitude = np.linalg.norm([error.x, error.y, error.z]) + target_reached = bool(error_magnitude < self.target_tolerance) + self.last_target_reached = target_reached + + return velocity_cmd, angular_velocity_cmd, target_reached + + def _compute_angular_velocity(self, target_rot: Quaternion, current_pose: Pose) -> Vector3: + """ + Compute angular velocity commands for orientation control. + Uses quaternion error computation for better numerical stability. 
+ + Args: + target_rot: Target orientation (quaternion) + current_pose: Current EE pose + + Returns: + Angular velocity command as Vector3 + """ + # Use quaternion error for better numerical stability + + # Convert to scipy Rotation objects + target_rot_scipy = R.from_quat([target_rot.x, target_rot.y, target_rot.z, target_rot.w]) + current_rot_scipy = R.from_quat( + [ + current_pose.orientation.x, + current_pose.orientation.y, + current_pose.orientation.z, + current_pose.orientation.w, + ] + ) + + # Compute rotation error: error = target * current^(-1) + error_rot = target_rot_scipy * current_rot_scipy.inv() + + # Convert to axis-angle representation for control + error_axis_angle = error_rot.as_rotvec() + + # Use axis-angle directly as angular velocity error (small angle approximation) + roll_error = error_axis_angle[0] + pitch_error = error_axis_angle[1] + yaw_error = error_axis_angle[2] + + self.last_rotation_error = Vector3(roll_error, pitch_error, yaw_error) + + # Apply proportional control + angular_velocity = Vector3( + roll_error * self.rotation_gain, + pitch_error * self.rotation_gain, + yaw_error * self.rotation_gain, + ) + + # Limit angular velocity magnitude + ang_vel_magnitude = np.sqrt( + angular_velocity.x**2 + angular_velocity.y**2 + angular_velocity.z**2 + ) + if ang_vel_magnitude > self.max_angular_velocity: + scale = self.max_angular_velocity / ang_vel_magnitude + angular_velocity = Vector3( + angular_velocity.x * scale, angular_velocity.y * scale, angular_velocity.z * scale + ) + + self.last_angular_velocity_cmd = angular_velocity + + return angular_velocity + + def create_status_overlay( + self, + image: np.ndarray, + current_target: Optional[Detection3D] = None, + ) -> np.ndarray: + """ + Create PBVS status overlay on image. + + Args: + image: Input image + current_target: Current target object Detection3D (for display) + + Returns: + Image with PBVS status overlay + """ + return create_pbvs_visualization( + image, + current_target, + self.last_position_error, + self.last_target_reached, + "velocity_control", + ) diff --git a/dimos/manipulation/visual_servoing/utils.py b/dimos/manipulation/visual_servoing/utils.py new file mode 100644 index 0000000000..4546326ef6 --- /dev/null +++ b/dimos/manipulation/visual_servoing/utils.py @@ -0,0 +1,828 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
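+
+"""Shared helpers for the visual servoing pipeline: frame and point transforms,
+grasp-pose construction, detection matching, depth sampling, and visualization overlays."""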
+ +import numpy as np +from typing import Dict, Any, Optional, List, Tuple, Union +from dataclasses import dataclass + +from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point +from dimos_lcm.vision_msgs import Detection3D, Detection2D +import cv2 +from dimos.perception.detection2d.utils import plot_results +from dimos.perception.common.utils import project_2d_points_to_3d +from dimos.utils.transform_utils import ( + optical_to_robot_frame, + robot_to_optical_frame, + pose_to_matrix, + matrix_to_pose, + euler_to_quaternion, + compose_transforms, + yaw_towards_point, + get_distance, +) + + +def match_detection_by_id( + detection_3d: Detection3D, detections_3d: List[Detection3D], detections_2d: List[Detection2D] +) -> Optional[Detection2D]: + """ + Find the corresponding Detection2D for a given Detection3D. + + Args: + detection_3d: The Detection3D to match + detections_3d: List of all Detection3D objects + detections_2d: List of all Detection2D objects (must be 1:1 correspondence) + + Returns: + Corresponding Detection2D if found, None otherwise + """ + for i, det_3d in enumerate(detections_3d): + if det_3d.id == detection_3d.id and i < len(detections_2d): + return detections_2d[i] + return None + + +def transform_pose( + obj_pos: np.ndarray, + obj_orientation: np.ndarray, + transform_matrix: np.ndarray, + to_optical: bool = False, + to_robot: bool = False, +) -> Pose: + """ + Transform object pose with optional frame convention conversion. + + Args: + obj_pos: Object position [x, y, z] + obj_orientation: Object orientation [roll, pitch, yaw] in radians + transform_matrix: 4x4 transformation matrix from camera frame to desired frame + to_optical: If True, input is in robot frame → convert result to optical frame + to_robot: If True, input is in optical frame → convert to robot frame first + + Returns: + Object pose in desired frame as Pose + """ + # Convert euler angles to quaternion using utility function + euler_vector = Vector3(obj_orientation[0], obj_orientation[1], obj_orientation[2]) + obj_orientation_quat = euler_to_quaternion(euler_vector) + + input_pose = Pose(Point(obj_pos[0], obj_pos[1], obj_pos[2]), obj_orientation_quat) + + # Apply input frame conversion based on flags + if to_robot: + # Input is in optical frame → convert to robot frame first + pose_for_transform = optical_to_robot_frame(input_pose) + else: + # Default or to_optical: use input pose as-is + pose_for_transform = input_pose + + # Create transformation matrix from pose (relative to camera) + T_camera_object = pose_to_matrix(pose_for_transform) + + # Use compose_transforms to combine transformations + T_desired_object = compose_transforms(transform_matrix, T_camera_object) + + # Convert back to pose + result_pose = matrix_to_pose(T_desired_object) + + # Apply output frame conversion based on flags + if to_optical: + # Input was robot frame → convert result to optical frame + desired_pose = robot_to_optical_frame(result_pose) + else: + # Default or to_robot: use result as-is + desired_pose = result_pose + + return desired_pose + + +def transform_points_3d( + points_3d: np.ndarray, + transform_matrix: np.ndarray, + to_optical: bool = False, + to_robot: bool = False, +) -> np.ndarray: + """ + Transform 3D points with optional frame convention conversion. + Applies the same transformation pipeline as transform_pose but for multiple points. 
+ + Args: + points_3d: Nx3 array of 3D points [x, y, z] + transform_matrix: 4x4 transformation matrix from camera frame to desired frame + to_optical: If True, input is in robot frame → convert result to optical frame + to_robot: If True, input is in optical frame → convert to robot frame first + + Returns: + Nx3 array of transformed 3D points in desired frame + """ + if points_3d.size == 0: + return np.zeros((0, 3), dtype=np.float32) + + points_3d = np.asarray(points_3d) + if points_3d.ndim == 1: + points_3d = points_3d.reshape(1, -1) + + transformed_points = [] + + for point in points_3d: + input_point_pose = Pose( + Point(point[0], point[1], point[2]), + Quaternion(0.0, 0.0, 0.0, 1.0), # Identity quaternion + ) + + # Apply input frame conversion based on flags + if to_robot: + # Input is in optical frame → convert to robot frame first + pose_for_transform = optical_to_robot_frame(input_point_pose) + else: + # Default or to_optical: use input pose as-is + pose_for_transform = input_point_pose + + # Create transformation matrix from point pose (relative to camera) + T_camera_point = pose_to_matrix(pose_for_transform) + + # Use compose_transforms to combine transformations + T_desired_point = compose_transforms(transform_matrix, T_camera_point) + + # Convert back to pose + result_pose = matrix_to_pose(T_desired_point) + + # Apply output frame conversion based on flags + if to_optical: + # Input was robot frame → convert result to optical frame + desired_pose = robot_to_optical_frame(result_pose) + else: + # Default or to_robot: use result as-is + desired_pose = result_pose + + transformed_point = [ + desired_pose.position.x, + desired_pose.position.y, + desired_pose.position.z, + ] + transformed_points.append(transformed_point) + + return np.array(transformed_points, dtype=np.float32) + + +def select_points_from_depth( + depth_image: np.ndarray, + target_point: Tuple[int, int], + camera_intrinsics: Union[List[float], np.ndarray], + radius: int = 5, +) -> np.ndarray: + """ + Select points around a target point within a bounding box and project them to 3D. 
+
+    Args:
+        depth_image: Depth image in meters (H, W)
+        target_point: (x, y) target point coordinates
+        camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix
+        radius: Half-width of the bounding box (so bbox size is radius*2 x radius*2)
+
+    Returns:
+        Nx3 array of 3D points (X, Y, Z) in camera frame
+    """
+    x_target, y_target = target_point
+    height, width = depth_image.shape
+
+    x_min = max(0, x_target - radius)
+    x_max = min(width, x_target + radius)
+    y_min = max(0, y_target - radius)
+    y_max = min(height, y_target + radius)
+
+    # Create coordinate grids for the bounding box (vectorized)
+    y_coords, x_coords = np.meshgrid(range(y_min, y_max), range(x_min, x_max), indexing="ij")
+
+    # Flatten to get all coordinate pairs
+    x_flat = x_coords.flatten()
+    y_flat = y_coords.flatten()
+
+    # Extract corresponding depth values using advanced indexing
+    depth_flat = depth_image[y_flat, x_flat]
+
+    valid_mask = (depth_flat > 0) & np.isfinite(depth_flat)
+
+    if not np.any(valid_mask):
+        return np.zeros((0, 3), dtype=np.float32)
+
+    points_2d = np.column_stack([x_flat[valid_mask], y_flat[valid_mask]]).astype(np.float32)
+    depth_values = depth_flat[valid_mask].astype(np.float32)
+
+    points_3d = project_2d_points_to_3d(points_2d, depth_values, camera_intrinsics)
+
+    return points_3d
+
+
+def update_target_grasp_pose(
+    target_pose: Pose, ee_pose: Pose, grasp_distance: float = 0.0, grasp_pitch_degrees: float = 45.0
+) -> Optional[Pose]:
+    """
+    Update target grasp pose based on current target pose and EE pose.
+
+    Args:
+        target_pose: Target pose to grasp
+        ee_pose: Current end-effector pose
+        grasp_distance: Distance to maintain from target (pregrasp or grasp distance)
+        grasp_pitch_degrees: Grasp pitch angle in degrees (90° is top-down; defaults to 45°)
+
+    Returns:
+        Target grasp pose or None if target is invalid
+    """
+
+    target_pos = target_pose.position
+
+    # Calculate orientation pointing from target towards EE
+    yaw_to_ee = yaw_towards_point(target_pos, ee_pose.position)
+
+    # Create target pose with proper orientation
+    # Convert grasp pitch from degrees to radians with mapping:
+    # 0° (level) -> π/2 (1.57 rad), 90° (top-down) -> π (3.14 rad)
+    pitch_radians = 1.57 + np.radians(grasp_pitch_degrees)
+
+    # Convert euler angles to quaternion using utility function
+    euler = Vector3(0.0, pitch_radians, yaw_to_ee)  # roll=0, pitch=mapped, yaw=calculated
+    target_orientation = euler_to_quaternion(euler)
+
+    updated_pose = Pose(target_pos, target_orientation)
+
+    if grasp_distance > 0.0:
+        return apply_grasp_distance(updated_pose, grasp_distance)
+    else:
+        return updated_pose
+
+
+def apply_grasp_distance(target_pose: Pose, distance: float) -> Pose:
+    """
+    Apply grasp distance offset to target pose along its approach direction.
+ + Args: + target_pose: Target grasp pose + distance: Distance to offset along the approach direction (meters) + + Returns: + Target pose offset by the specified distance along its approach direction + """ + # Convert pose to transformation matrix to extract rotation + T_target = pose_to_matrix(target_pose) + rotation_matrix = T_target[:3, :3] + + # Define the approach vector based on the target pose orientation + # Assuming the gripper approaches along its local -z axis (common for downward grasps) + # You can change this to [1, 0, 0] for x-axis or [0, 1, 0] for y-axis based on your gripper + approach_vector_local = np.array([0, 0, -1]) + + # Transform approach vector to world coordinates + approach_vector_world = rotation_matrix @ approach_vector_local + + # Apply offset along the approach direction + offset_position = Point( + target_pose.position.x + distance * approach_vector_world[0], + target_pose.position.y + distance * approach_vector_world[1], + target_pose.position.z + distance * approach_vector_world[2], + ) + + return Pose(offset_position, target_pose.orientation) + + +def is_target_reached(target_pose: Pose, current_pose: Pose, tolerance: float = 0.01) -> bool: + """ + Check if the target pose has been reached within tolerance. + + Args: + target_pose: Target pose to reach + current_pose: Current pose (e.g., end-effector pose) + tolerance: Distance threshold for considering target reached (meters, default 0.01 = 1cm) + + Returns: + True if target is reached within tolerance, False otherwise + """ + # Calculate position error using distance utility + error_magnitude = get_distance(target_pose, current_pose) + return error_magnitude < tolerance + + +@dataclass +class ObjectMatchResult: + """Result of object matching with confidence metrics.""" + + matched_object: Optional[Detection3D] + confidence: float + distance: float + size_similarity: float + is_valid_match: bool + + +def calculate_object_similarity( + target_obj: Detection3D, + candidate_obj: Detection3D, + distance_weight: float = 0.6, + size_weight: float = 0.4, +) -> Tuple[float, float, float]: + """ + Calculate comprehensive similarity between two objects. 
+ + Args: + target_obj: Target Detection3D object + candidate_obj: Candidate Detection3D object + distance_weight: Weight for distance component (0-1) + size_weight: Weight for size component (0-1) + + Returns: + Tuple of (total_similarity, distance_m, size_similarity) + """ + # Extract positions + target_pos = target_obj.bbox.center.position + candidate_pos = candidate_obj.bbox.center.position + + target_xyz = np.array([target_pos.x, target_pos.y, target_pos.z]) + candidate_xyz = np.array([candidate_pos.x, candidate_pos.y, candidate_pos.z]) + + # Calculate Euclidean distance + distance = np.linalg.norm(target_xyz - candidate_xyz) + distance_similarity = 1.0 / (1.0 + distance) # Exponential decay + + # Calculate size similarity by comparing each dimension individually + size_similarity = 1.0 # Default if no size info + target_size = target_obj.bbox.size + candidate_size = candidate_obj.bbox.size + + if target_size and candidate_size: + # Extract dimensions + target_dims = [target_size.x, target_size.y, target_size.z] + candidate_dims = [candidate_size.x, candidate_size.y, candidate_size.z] + + # Calculate similarity for each dimension pair + dim_similarities = [] + for target_dim, candidate_dim in zip(target_dims, candidate_dims): + if target_dim == 0.0 and candidate_dim == 0.0: + dim_similarities.append(1.0) # Both dimensions are zero + elif target_dim == 0.0 or candidate_dim == 0.0: + dim_similarities.append(0.0) # One dimension is zero, other is not + else: + # Calculate similarity as min/max ratio + max_dim = max(target_dim, candidate_dim) + min_dim = min(target_dim, candidate_dim) + dim_similarity = min_dim / max_dim if max_dim > 0 else 0.0 + dim_similarities.append(dim_similarity) + + # Return average similarity across all dimensions + size_similarity = np.mean(dim_similarities) if dim_similarities else 0.0 + + # Weighted combination + total_similarity = distance_weight * distance_similarity + size_weight * size_similarity + + return total_similarity, distance, size_similarity + + +def find_best_object_match( + target_obj: Detection3D, + candidates: List[Detection3D], + max_distance: float = 0.1, + min_size_similarity: float = 0.4, + distance_weight: float = 0.7, + size_weight: float = 0.3, +) -> ObjectMatchResult: + """ + Find the best matching object from candidates using distance and size criteria. 
+ + Args: + target_obj: Target Detection3D to match against + candidates: List of candidate Detection3D objects + max_distance: Maximum allowed distance for valid match (meters) + min_size_similarity: Minimum size similarity for valid match (0-1) + distance_weight: Weight for distance in similarity calculation + size_weight: Weight for size in similarity calculation + + Returns: + ObjectMatchResult with best match and confidence metrics + """ + if not candidates or not target_obj.bbox or not target_obj.bbox.center: + return ObjectMatchResult(None, 0.0, float("inf"), 0.0, False) + + best_match = None + best_confidence = 0.0 + best_distance = float("inf") + best_size_sim = 0.0 + + for candidate in candidates: + if not candidate.bbox or not candidate.bbox.center: + continue + + similarity, distance, size_sim = calculate_object_similarity( + target_obj, candidate, distance_weight, size_weight + ) + + # Check validity constraints + is_valid = distance <= max_distance and size_sim >= min_size_similarity + + if is_valid and similarity > best_confidence: + best_match = candidate + best_confidence = similarity + best_distance = distance + best_size_sim = size_sim + + return ObjectMatchResult( + matched_object=best_match, + confidence=best_confidence, + distance=best_distance, + size_similarity=best_size_sim, + is_valid_match=best_match is not None, + ) + + +def parse_zed_pose(zed_pose_data: Dict[str, Any]) -> Optional[Pose]: + """ + Parse ZED pose data dictionary into a Pose object. + + Args: + zed_pose_data: Dictionary from ZEDCamera.get_pose() containing: + - position: [x, y, z] in meters + - rotation: [x, y, z, w] quaternion + - euler_angles: [roll, pitch, yaw] in radians + - valid: Whether pose is valid + + Returns: + Pose object with position and orientation, or None if invalid + """ + if not zed_pose_data or not zed_pose_data.get("valid", False): + return None + + # Extract position + position = zed_pose_data.get("position", [0, 0, 0]) + pos_vector = Point(position[0], position[1], position[2]) + + quat = zed_pose_data["rotation"] + orientation = Quaternion(quat[0], quat[1], quat[2], quat[3]) + return Pose(pos_vector, orientation) + + +def estimate_object_depth( + depth_image: np.ndarray, segmentation_mask: Optional[np.ndarray], bbox: List[float] +) -> float: + """ + Estimate object depth dimension using segmentation mask and depth data. + Optimized for real-time performance. 
+ + Args: + depth_image: Depth image in meters + segmentation_mask: Binary segmentation mask for the object + bbox: Bounding box [x1, y1, x2, y2] + + Returns: + Estimated object depth in meters + """ + x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) + + # Extract depth ROI once + roi_depth = depth_image[y1:y2, x1:x2] + + if segmentation_mask is not None and segmentation_mask.size > 0: + # Extract mask ROI efficiently + mask_roi = ( + segmentation_mask[y1:y2, x1:x2] + if segmentation_mask.shape != roi_depth.shape + else segmentation_mask + ) + + # Fast mask application using boolean indexing + valid_mask = mask_roi > 0 + if np.sum(valid_mask) > 10: # Early exit if not enough points + masked_depths = roi_depth[valid_mask] + + # Fast percentile calculation using numpy's optimized functions + depth_90 = np.percentile(masked_depths, 90) + depth_10 = np.percentile(masked_depths, 10) + depth_range = depth_90 - depth_10 + + # Clamp to reasonable bounds with single operation + return np.clip(depth_range, 0.02, 0.5) + + # Fast fallback using area calculation + bbox_area = (x2 - x1) * (y2 - y1) + + # Vectorized area-based estimation + if bbox_area > 10000: + return 0.15 + elif bbox_area > 5000: + return 0.10 + else: + return 0.05 + + +# ============= Visualization Functions ============= + + +def create_manipulation_visualization( + rgb_image: np.ndarray, + feedback, + detection_3d_array=None, + detection_2d_array=None, +) -> np.ndarray: + """ + Create simple visualization for manipulation class using feedback. + + Args: + rgb_image: RGB image array + feedback: Feedback object containing all state information + detection_3d_array: Optional 3D detections for object visualization + detection_2d_array: Optional 2D detections for object visualization + + Returns: + BGR image with visualization overlays + """ + # Convert to BGR for OpenCV + viz = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) + + # Draw detections if available + if detection_3d_array and detection_2d_array: + # Extract 2D bboxes + bboxes_2d = [] + for det_2d in detection_2d_array.detections: + if det_2d.bbox: + x1 = det_2d.bbox.center.position.x - det_2d.bbox.size_x / 2 + y1 = det_2d.bbox.center.position.y - det_2d.bbox.size_y / 2 + x2 = det_2d.bbox.center.position.x + det_2d.bbox.size_x / 2 + y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2 + bboxes_2d.append([x1, y1, x2, y2]) + + # Draw basic detections + rgb_with_detections = visualize_detections_3d( + rgb_image, detection_3d_array.detections, show_coordinates=True, bboxes_2d=bboxes_2d + ) + viz = cv2.cvtColor(rgb_with_detections, cv2.COLOR_RGB2BGR) + + # Add manipulation status overlay + status_y = 30 + cv2.putText( + viz, + "Eye-in-Hand Visual Servoing", + (10, status_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 255), + 2, + ) + + # Stage information + stage_text = f"Stage: {feedback.grasp_stage.value.upper()}" + stage_color = { + "idle": (100, 100, 100), + "pre_grasp": (0, 255, 255), + "grasp": (0, 255, 0), + "close_and_retract": (255, 0, 255), + "place": (0, 150, 255), + "retract": (255, 150, 0), + }.get(feedback.grasp_stage.value, (255, 255, 255)) + + cv2.putText( + viz, + stage_text, + (10, status_y + 25), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + stage_color, + 1, + ) + + # Target tracking status + if feedback.target_tracked: + cv2.putText( + viz, + "Target: TRACKED", + (10, status_y + 45), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 255, 0), + 1, + ) + elif feedback.grasp_stage.value != "idle": + cv2.putText( + viz, + "Target: LOST", + (10, status_y 
+ 45), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (0, 0, 255), + 1, + ) + + # Waiting status + if feedback.waiting_for_reach: + cv2.putText( + viz, + "Status: WAITING FOR ROBOT", + (10, status_y + 65), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 0), + 1, + ) + + # Overall result + if feedback.success is not None: + result_text = "Pick & Place: SUCCESS" if feedback.success else "Pick & Place: FAILED" + result_color = (0, 255, 0) if feedback.success else (0, 0, 255) + cv2.putText( + viz, + result_text, + (10, status_y + 85), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + result_color, + 2, + ) + + # Control hints (bottom of image) + hint_text = "Click object to grasp | s=STOP | r=RESET | g=RELEASE" + cv2.putText( + viz, + hint_text, + (10, viz.shape[0] - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + (200, 200, 200), + 1, + ) + + return viz + + +def create_pbvs_visualization( + image: np.ndarray, + current_target=None, + position_error=None, + target_reached=False, + grasp_stage="idle", +) -> np.ndarray: + """ + Create simple PBVS visualization overlay. + + Args: + image: Input image (RGB or BGR) + current_target: Current target Detection3D + position_error: Position error Vector3 + target_reached: Whether target is reached + grasp_stage: Current grasp stage string + + Returns: + Image with PBVS overlay + """ + viz = image.copy() + + # Only show PBVS info if we have a target + if current_target is None: + return viz + + # Create status panel at bottom + height, width = viz.shape[:2] + panel_height = 100 + panel_y = height - panel_height + + # Semi-transparent overlay + overlay = viz.copy() + cv2.rectangle(overlay, (0, panel_y), (width, height), (0, 0, 0), -1) + viz = cv2.addWeighted(viz, 0.7, overlay, 0.3, 0) + + # PBVS Status + y_offset = panel_y + 20 + cv2.putText( + viz, + "PBVS Control", + (10, y_offset), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 255), + 2, + ) + + # Position error + if position_error: + error_mag = np.linalg.norm([position_error.x, position_error.y, position_error.z]) + error_text = f"Error: {error_mag * 100:.1f}cm" + error_color = (0, 255, 0) if target_reached else (0, 255, 255) + cv2.putText( + viz, + error_text, + (10, y_offset + 25), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + error_color, + 1, + ) + + # Stage + cv2.putText( + viz, + f"Stage: {grasp_stage}", + (10, y_offset + 45), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 150, 255), + 1, + ) + + # Target reached indicator + if target_reached: + cv2.putText( + viz, + "TARGET REACHED", + (width - 150, y_offset + 25), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 0), + 2, + ) + + return viz + + +def visualize_detections_3d( + rgb_image: np.ndarray, + detections: List[Detection3D], + show_coordinates: bool = True, + bboxes_2d: Optional[List[List[float]]] = None, +) -> np.ndarray: + """ + Visualize detections with 3D position overlay next to bounding boxes. 
+ + Args: + rgb_image: Original RGB image + detections: List of Detection3D objects + show_coordinates: Whether to show 3D coordinates next to bounding boxes + bboxes_2d: Optional list of 2D bounding boxes corresponding to detections + + Returns: + Visualization image + """ + if not detections: + return rgb_image.copy() + + # If no 2D bboxes provided, skip visualization + if bboxes_2d is None: + return rgb_image.copy() + + # Extract data for plot_results function + bboxes = bboxes_2d + track_ids = [int(det.id) if det.id.isdigit() else i for i, det in enumerate(detections)] + class_ids = [i for i in range(len(detections))] + confidences = [ + det.results[0].hypothesis.score if det.results_length > 0 else 0.0 for det in detections + ] + names = [ + det.results[0].hypothesis.class_id if det.results_length > 0 else "unknown" + for det in detections + ] + + # Use plot_results for basic visualization + viz = plot_results(rgb_image, bboxes, track_ids, class_ids, confidences, names) + + # Add 3D position coordinates if requested + if show_coordinates and bboxes_2d is not None: + for i, det in enumerate(detections): + if det.bbox and det.bbox.center and i < len(bboxes_2d): + position = det.bbox.center.position + bbox = bboxes_2d[i] + + pos_xyz = np.array([position.x, position.y, position.z]) + + # Get bounding box coordinates + x1, y1, x2, y2 = map(int, bbox) + + # Add position text next to bounding box (top-right corner) + pos_text = f"({pos_xyz[0]:.2f}, {pos_xyz[1]:.2f}, {pos_xyz[2]:.2f})" + text_x = x2 + 5 # Right edge of bbox + small offset + text_y = y1 + 15 # Top edge of bbox + small offset + + # Add background rectangle for better readability + text_size = cv2.getTextSize(pos_text, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)[0] + cv2.rectangle( + viz, + (text_x - 2, text_y - text_size[1] - 2), + (text_x + text_size[0] + 2, text_y + 2), + (0, 0, 0), + -1, + ) + + cv2.putText( + viz, + pos_text, + (text_x, text_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + (255, 255, 255), + 1, + ) + + return viz diff --git a/dimos/msgs/geometry_msgs/Quaternion.py b/dimos/msgs/geometry_msgs/Quaternion.py index 9879e1e263..a7bb5543c1 100644 --- a/dimos/msgs/geometry_msgs/Quaternion.py +++ b/dimos/msgs/geometry_msgs/Quaternion.py @@ -22,6 +22,7 @@ import numpy as np from dimos_lcm.geometry_msgs import Quaternion as LCMQuaternion from plum import dispatch +from scipy.spatial.transform import Rotation as R from dimos.msgs.geometry_msgs.Vector3 import Vector3 @@ -117,27 +118,12 @@ def to_euler(self) -> Vector3: Returns: Vector3: Euler angles as (roll, pitch, yaw) in radians """ - # Convert quaternion to Euler angles using ZYX convention (yaw, pitch, roll) - # Source: https://en.wikipedia.org/wiki/Conversion_between_quaternions_and_Euler_angles - - # Roll (x-axis rotation) - sinr_cosp = 2 * (self.w * self.x + self.y * self.z) - cosr_cosp = 1 - 2 * (self.x * self.x + self.y * self.y) - roll = np.arctan2(sinr_cosp, cosr_cosp) - - # Pitch (y-axis rotation) - sinp = 2 * (self.w * self.y - self.z * self.x) - if abs(sinp) >= 1: - pitch = np.copysign(np.pi / 2, sinp) # Use 90 degrees if out of range - else: - pitch = np.arcsin(sinp) - - # Yaw (z-axis rotation) - siny_cosp = 2 * (self.w * self.z + self.x * self.y) - cosy_cosp = 1 - 2 * (self.y * self.y + self.z * self.z) - yaw = np.arctan2(siny_cosp, cosy_cosp) + # Use scipy for accurate quaternion to euler conversion + quat = [self.x, self.y, self.z, self.w] + rotation = R.from_quat(quat) + euler_angles = rotation.as_euler("xyz") # roll, pitch, yaw - return Vector3(roll, pitch, yaw) + 
return Vector3(euler_angles[0], euler_angles[1], euler_angles[2]) def __getitem__(self, idx: int) -> float: """Allow indexing into quaternion components: 0=x, 1=y, 2=z, 3=w.""" diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py index da9cc58fe0..10d05d9b4d 100644 --- a/dimos/perception/common/utils.py +++ b/dimos/perception/common/utils.py @@ -14,15 +14,113 @@ import cv2 import numpy as np -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Any, Union from dimos.types.manipulation import ObjectData from dimos.types.vector import Vector from dimos.utils.logging_config import setup_logger +from dimos_lcm.vision_msgs import Detection3D, Detection2D, BoundingBox2D import torch logger = setup_logger("dimos.perception.common.utils") +def project_3d_points_to_2d( + points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] +) -> np.ndarray: + """ + Project 3D points to 2D image coordinates using camera intrinsics. + + Args: + points_3d: Nx3 array of 3D points (X, Y, Z) + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + + Returns: + Nx2 array of 2D image coordinates (u, v) + """ + if len(points_3d) == 0: + return np.zeros((0, 2), dtype=np.int32) + + # Filter out points with zero or negative depth + valid_mask = points_3d[:, 2] > 0 + if not np.any(valid_mask): + return np.zeros((0, 2), dtype=np.int32) + + valid_points = points_3d[valid_mask] + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + camera_matrix = np.array(camera_intrinsics) + fx = camera_matrix[0, 0] + fy = camera_matrix[1, 1] + cx = camera_matrix[0, 2] + cy = camera_matrix[1, 2] + + # Project to image coordinates + u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx + v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy + + # Round to integer pixel coordinates + points_2d = np.column_stack([u, v]).astype(np.int32) + + return points_2d + + +def project_2d_points_to_3d( + points_2d: np.ndarray, + depth_values: np.ndarray, + camera_intrinsics: Union[List[float], np.ndarray], +) -> np.ndarray: + """ + Project 2D image points to 3D coordinates using depth values and camera intrinsics. 
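+
+    Example (worked with illustrative numbers, not taken from a real camera):
+        with camera_intrinsics = [600.0, 600.0, 320.0, 240.0], the pixel
+        (470, 390) at a depth of 1.2 m back-projects to
+        X = (470 - 320) * 1.2 / 600 = 0.30, Y = (390 - 240) * 1.2 / 600 = 0.30,
+        Z = 1.20, giving the camera-frame point (0.30, 0.30, 1.20).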
+ + Args: + points_2d: Nx2 array of 2D image coordinates (u, v) + depth_values: N-length array of depth values (Z coordinates) for each point + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix + + Returns: + Nx3 array of 3D points (X, Y, Z) + """ + if len(points_2d) == 0: + return np.zeros((0, 3), dtype=np.float32) + + # Ensure depth_values is a numpy array + depth_values = np.asarray(depth_values) + + # Filter out points with zero or negative depth + valid_mask = depth_values > 0 + if not np.any(valid_mask): + return np.zeros((0, 3), dtype=np.float32) + + valid_points_2d = points_2d[valid_mask] + valid_depths = depth_values[valid_mask] + + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + camera_matrix = np.array(camera_intrinsics) + fx = camera_matrix[0, 0] + fy = camera_matrix[1, 1] + cx = camera_matrix[0, 2] + cy = camera_matrix[1, 2] + + # Back-project to 3D coordinates + # X = (u - cx) * Z / fx + # Y = (v - cy) * Z / fy + # Z = depth + X = (valid_points_2d[:, 0] - cx) * valid_depths / fx + Y = (valid_points_2d[:, 1] - cy) * valid_depths / fy + Z = valid_depths + + # Stack into 3D points + points_3d = np.column_stack([X, Y, Z]).astype(np.float32) + + return points_3d + + def colorize_depth(depth_img: np.ndarray, max_depth: float = 5.0) -> Optional[np.ndarray]: """ Normalize and colorize depth image using COLORMAP_JET. @@ -329,3 +427,68 @@ def combine_object_data( combined.append(obj_copy) return combined + + +def point_in_bbox(point: Tuple[int, int], bbox: List[float]) -> bool: + """ + Check if a point is inside a bounding box. + + Args: + point: (x, y) coordinates + bbox: Bounding box [x1, y1, x2, y2] + + Returns: + True if point is inside bbox + """ + x, y = point + x1, y1, x2, y2 = bbox + return x1 <= x <= x2 and y1 <= y <= y2 + + +def bbox2d_to_corners(bbox_2d: BoundingBox2D) -> Tuple[float, float, float, float]: + """ + Convert BoundingBox2D from center format to corner format. + + Args: + bbox_2d: BoundingBox2D with center and size + + Returns: + Tuple of (x1, y1, x2, y2) corner coordinates + """ + center_x = bbox_2d.center.position.x + center_y = bbox_2d.center.position.y + half_width = bbox_2d.size_x / 2.0 + half_height = bbox_2d.size_y / 2.0 + + x1 = center_x - half_width + y1 = center_y - half_height + x2 = center_x + half_width + y2 = center_y + half_height + + return x1, y1, x2, y2 + + +def find_clicked_detection( + click_pos: Tuple[int, int], detections_2d: List[Detection2D], detections_3d: List[Detection3D] +) -> Optional[Detection3D]: + """ + Find which detection was clicked based on 2D bounding boxes. 
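+
+    Example (sketch; the click position is illustrative, and the two lists must
+    describe the same objects in the same order, as noted below):
+        det_3d = find_clicked_detection((350, 210), detections_2d, detections_3d)
+        if det_3d is not None:
+            target_position = det_3d.bbox.center.position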
+ + Args: + click_pos: (x, y) click position + detections_2d: List of Detection2D objects + detections_3d: List of Detection3D objects (must be 1:1 correspondence) + + Returns: + Corresponding Detection3D object if found, None otherwise + """ + click_x, click_y = click_pos + + for i, det_2d in enumerate(detections_2d): + if det_2d.bbox and i < len(detections_3d): + x1, y1, x2, y2 = bbox2d_to_corners(det_2d.bbox) + + if x1 <= click_x <= x2 and y1 <= click_y <= y2: + return detections_3d[i] + + return None diff --git a/dimos/perception/detection2d/detic_2d_det.py b/dimos/perception/detection2d/detic_2d_det.py index fc81526ad2..8bc4f9c4b0 100644 --- a/dimos/perception/detection2d/detic_2d_det.py +++ b/dimos/perception/detection2d/detic_2d_det.py @@ -15,6 +15,7 @@ import numpy as np import os import sys +from dimos.perception.detection2d.utils import plot_results # Add Detic to Python path detic_path = os.path.join(os.path.dirname(__file__), "..", "..", "models", "Detic") @@ -404,7 +405,6 @@ def visualize_results(self, image, bboxes, track_ids, class_ids, confidences, na Returns: Image with visualized detections """ - from dimos.perception.detection2d.utils import plot_results return plot_results(image, bboxes, track_ids, class_ids, confidences, names) diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py index 94377363f2..ab0cfd0d15 100644 --- a/dimos/perception/grasp_generation/utils.py +++ b/dimos/perception/grasp_generation/utils.py @@ -18,49 +18,7 @@ import open3d as o3d import cv2 from typing import List, Dict, Tuple, Optional, Union - - -def project_3d_points_to_2d( - points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] -) -> np.ndarray: - """ - Project 3D points to 2D image coordinates using camera intrinsics. 
- - Args: - points_3d: Nx3 array of 3D points (X, Y, Z) - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - - Returns: - Nx2 array of 2D image coordinates (u, v) - """ - if len(points_3d) == 0: - return np.zeros((0, 2), dtype=np.int32) - - # Filter out points with zero or negative depth - valid_mask = points_3d[:, 2] > 0 - if not np.any(valid_mask): - return np.zeros((0, 2), dtype=np.int32) - - valid_points = points_3d[valid_mask] - - # Extract camera parameters - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - else: - camera_matrix = np.array(camera_intrinsics) - fx = camera_matrix[0, 0] - fy = camera_matrix[1, 1] - cx = camera_matrix[0, 2] - cy = camera_matrix[1, 2] - - # Project to image coordinates - u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx - v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy - - # Round to integer pixel coordinates - points_2d = np.column_stack([u, v]).astype(np.int32) - - return points_2d +from dimos.perception.common.utils import project_3d_points_to_2d, project_2d_points_to_3d def create_gripper_geometry( diff --git a/dimos/perception/pointcloud/utils.py b/dimos/perception/pointcloud/utils.py index 0813c2ca0e..b3c395bfa3 100644 --- a/dimos/perception/pointcloud/utils.py +++ b/dimos/perception/pointcloud/utils.py @@ -24,8 +24,9 @@ import os import cv2 import open3d as o3d -from typing import List, Optional, Tuple, Union, Dict +from typing import List, Optional, Tuple, Union, Dict, Any from scipy.spatial import cKDTree +from dimos.perception.common.utils import project_3d_points_to_2d def load_camera_matrix_from_yaml( @@ -304,48 +305,6 @@ def filter_point_cloud_radius( return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius) -def project_3d_points_to_2d( - points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray] -) -> np.ndarray: - """ - Project 3D points to 2D image coordinates using camera intrinsics. - - Args: - points_3d: Nx3 array of 3D points (X, Y, Z) - camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix - - Returns: - Nx2 array of 2D image coordinates (u, v) - """ - if len(points_3d) == 0: - return np.zeros((0, 2), dtype=np.int32) - - # Filter out points with zero or negative depth - valid_mask = points_3d[:, 2] > 0 - if not np.any(valid_mask): - return np.zeros((0, 2), dtype=np.int32) - - valid_points = points_3d[valid_mask] - - # Extract camera parameters - if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: - fx, fy, cx, cy = camera_intrinsics - else: - fx = camera_intrinsics[0, 0] - fy = camera_intrinsics[1, 1] - cx = camera_intrinsics[0, 2] - cy = camera_intrinsics[1, 2] - - # Project to image coordinates - u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx - v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy - - # Round to integer pixel coordinates - points_2d = np.column_stack([u, v]).astype(np.int32) - - return points_2d - - def overlay_point_clouds_on_image( base_image: np.ndarray, point_clouds: List[o3d.geometry.PointCloud], @@ -1080,3 +1039,73 @@ def combine_object_pointclouds( combined_pcd.colors = o3d.utility.Vector3dVector(np.vstack(all_colors)) return combined_pcd + + +def extract_centroids_from_masks( + rgb_image: np.ndarray, + depth_image: np.ndarray, + masks: List[np.ndarray], + camera_intrinsics: Union[List[float], np.ndarray], +) -> List[Dict[str, Any]]: + """ + Extract 3D centroids and orientations from segmentation masks. 
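+
+    Example (sketch with synthetic inputs; image size and intrinsics are assumed):
+        rgb = np.zeros((480, 640, 3), dtype=np.uint8)
+        depth = np.ones((480, 640), dtype=np.float32)   # flat scene 1 m away
+        mask = np.zeros((480, 640), dtype=bool)
+        mask[200:280, 280:360] = True                    # one segmented region
+        results = extract_centroids_from_masks(rgb, depth, [mask], [600.0, 600.0, 320.0, 240.0])
+        # results[0]["centroid"] is the mean 3D point of the mask in the camera frame;
+        # results[0]["orientation"] is that centroid scaled to unit length.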
+ + Args: + rgb_image: RGB image (H, W, 3) + depth_image: Depth image (H, W) in meters + masks: List of boolean masks (H, W) + camera_intrinsics: Camera parameters as [fx, fy, cx, cy] or 3x3 matrix + + Returns: + List of dictionaries containing: + - centroid: 3D centroid position [x, y, z] in camera frame + - orientation: Normalized direction vector from camera to centroid + - num_points: Number of valid 3D points + - mask_idx: Index of the mask in the input list + """ + # Extract camera parameters + if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4: + fx, fy, cx, cy = camera_intrinsics + else: + fx = camera_intrinsics[0, 0] + fy = camera_intrinsics[1, 1] + cx = camera_intrinsics[0, 2] + cy = camera_intrinsics[1, 2] + + results = [] + + for mask_idx, mask in enumerate(masks): + if mask is None or mask.sum() == 0: + continue + + # Get pixel coordinates where mask is True + y_coords, x_coords = np.where(mask) + + # Get depth values at mask locations + depths = depth_image[y_coords, x_coords] + + # Convert to 3D points in camera frame + X = (x_coords - cx) * depths / fx + Y = (y_coords - cy) * depths / fy + Z = depths + + # Calculate centroid + centroid_x = np.mean(X) + centroid_y = np.mean(Y) + centroid_z = np.mean(Z) + centroid = np.array([centroid_x, centroid_y, centroid_z]) + + # Calculate orientation as normalized direction from camera origin to centroid + # Camera origin is at (0, 0, 0) + orientation = centroid / np.linalg.norm(centroid) + + results.append( + { + "centroid": centroid, + "orientation": orientation, + "num_points": int(mask.sum()), + "mask_idx": mask_idx, + } + ) + + return results diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py index fcf27584e6..cb2acaf076 100644 --- a/dimos/perception/segmentation/sam_2d_seg.py +++ b/dimos/perception/segmentation/sam_2d_seg.py @@ -42,13 +42,12 @@ def __init__( self, model_path="models_fastsam", model_name="FastSAM-s.onnx", - device="cpu", min_analysis_interval=5.0, use_tracker=True, use_analyzer=True, use_rich_labeling=False, + use_filtering=True, ): - self.device = device if is_cuda_available(): logger.info("Using CUDA for SAM 2d segmenter") if hasattr(onnxruntime, "preload_dlls"): # Handles CUDA 11 / onnxruntime-gpu<=1.18 @@ -62,6 +61,7 @@ def __init__( self.use_tracker = use_tracker self.use_analyzer = use_analyzer self.use_rich_labeling = use_rich_labeling + self.use_filtering = use_filtering module_dir = os.path.dirname(__file__) self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml") @@ -98,11 +98,10 @@ def process_image(self, image): source=image, device=self.device, retina_masks=True, - conf=0.6, - iou=0.9, + conf=0.3, + iou=0.5, persist=True, verbose=False, - tracker=self.tracker_config, ) if len(results) > 0: @@ -112,14 +111,25 @@ def process_image(self, image): ) # Filter results - ( - filtered_masks, - filtered_bboxes, - filtered_track_ids, - filtered_probs, - filtered_names, - filtered_texture_values, - ) = filter_segmentation_results(image, masks, bboxes, track_ids, probs, names, areas) + if self.use_filtering: + ( + filtered_masks, + filtered_bboxes, + filtered_track_ids, + filtered_probs, + filtered_names, + filtered_texture_values, + ) = filter_segmentation_results( + image, masks, bboxes, track_ids, probs, names, areas + ) + else: + # Use original results without filtering + filtered_masks = masks + filtered_bboxes = bboxes + filtered_track_ids = track_ids + filtered_probs = probs + filtered_names = names + 
filtered_texture_values = [] if self.use_tracker: # Update tracker with filtered results diff --git a/dimos/robot/agilex/README.md b/dimos/robot/agilex/README.md new file mode 100644 index 0000000000..1e678cae65 --- /dev/null +++ b/dimos/robot/agilex/README.md @@ -0,0 +1,371 @@ +# DIMOS Manipulator Robot Development Guide + +This guide explains how to create robot classes, integrate agents, and use the DIMOS module system with LCM transport. + +## Table of Contents +1. [Robot Class Architecture](#robot-class-architecture) +2. [Module System & LCM Transport](#module-system--lcm-transport) +3. [Agent Integration](#agent-integration) +4. [Complete Example](#complete-example) + +## Robot Class Architecture + +### Basic Robot Class Structure + +A DIMOS robot class should follow this pattern: + +```python +from typing import Optional, List +from dimos import core +from dimos.types.robot_capabilities import RobotCapability + +class YourRobot: + """Your robot implementation.""" + + def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): + # Core components + self.dimos = None + self.modules = {} + self.skill_library = SkillLibrary() + + # Define capabilities + self.capabilities = robot_capabilities or [ + RobotCapability.VISION, + RobotCapability.MANIPULATION, + ] + + async def start(self): + """Start the robot modules.""" + # Initialize DIMOS with worker count + self.dimos = core.start(2) # Number of workers needed + + # Deploy modules + # ... (see Module System section) + + def stop(self): + """Stop all modules and clean up.""" + # Stop modules + # Close DIMOS + if self.dimos: + self.dimos.close() +``` + +### Key Components Explained + +1. **Initialization**: Store references to modules, skills, and capabilities +2. **Async Start**: Modules must be deployed asynchronously +3. **Proper Cleanup**: Always stop modules before closing DIMOS + +## Module System & LCM Transport + +### Understanding DIMOS Modules + +Modules are the building blocks of DIMOS robots. 
They: +- Process data streams (inputs) +- Produce outputs +- Can be connected together +- Communicate via LCM (Lightweight Communications and Marshalling) + +### Deploying a Module + +```python +# Deploy a camera module +self.camera = self.dimos.deploy( + ZEDModule, # Module class + camera_id=0, # Module parameters + resolution="HD720", + depth_mode="NEURAL", + fps=30, + publish_rate=30.0, + frame_id="camera_frame" +) +``` + +### Setting Up LCM Transport + +LCM transport enables inter-module communication: + +```python +# Enable LCM auto-configuration +from dimos.protocol import pubsub +pubsub.lcm.autoconf() + +# Configure output transport +self.camera.color_image.transport = core.LCMTransport( + "/camera/color_image", # Topic name + Image # Message type +) +self.camera.depth_image.transport = core.LCMTransport( + "/camera/depth_image", + Image +) +``` + +### Connecting Modules + +Connect module outputs to inputs: + +```python +# Connect manipulation module to camera outputs +self.manipulation.rgb_image.connect(self.camera.color_image) +self.manipulation.depth_image.connect(self.camera.depth_image) +self.manipulation.camera_info.connect(self.camera.camera_info) +``` + +### Module Communication Pattern + +``` +┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐ +│ Camera │────────▶│ Manipulation │────────▶│ Visualization│ +│ Module │ Messages│ Module │ Messages│ Output │ +└──────────────┘ └────────────────┘ └──────────────┘ + ▲ ▲ + │ │ + └──────────────────────────┘ + Direct Connection via RPC call +``` + +## Agent Integration + +### Setting Up Agent with Robot + +The run file pattern for agent integration: + +```python +#!/usr/bin/env python3 +import asyncio +import reactivex as rx +from dimos.agents.claude_agent import ClaudeAgent +from dimos.web.robot_web_interface import RobotWebInterface + +def main(): + # 1. Create and start robot + robot = YourRobot() + asyncio.run(robot.start()) + + # 2. Set up skills + skills = robot.get_skills() + skills.add(YourSkill) + skills.create_instance("YourSkill", robot=robot) + + # 3. Set up reactive streams + agent_response_subject = rx.subject.Subject() + agent_response_stream = agent_response_subject.pipe(ops.share()) + + # 4. Create web interface + web_interface = RobotWebInterface( + port=5555, + text_streams={"agent_responses": agent_response_stream}, + audio_subject=rx.subject.Subject() + ) + + # 5. Create agent + agent = ClaudeAgent( + dev_name="your_agent", + input_query_stream=web_interface.query_stream, + skills=skills, + system_query="Your system prompt here", + model_name="claude-3-5-haiku-latest" + ) + + # 6. Connect agent responses + agent.get_response_observable().subscribe( + lambda x: agent_response_subject.on_next(x) + ) + + # 7. Run interface + web_interface.run() +``` + +### Key Integration Points + +1. **Reactive Streams**: Use RxPy for event-driven communication +2. **Web Interface**: Provides user input/output +3. **Agent**: Processes natural language and executes skills +4. 
**Skills**: Define robot capabilities as executable actions + +## Complete Example + +### Step 1: Create Robot Class (`my_robot.py`) + +```python +import asyncio +from typing import Optional, List +from dimos import core +from dimos.hardware.camera import CameraModule +from dimos.manipulation.module import ManipulationModule +from dimos.skills.skills import SkillLibrary +from dimos.types.robot_capabilities import RobotCapability +from dimos_lcm.sensor_msgs import Image, CameraInfo +from dimos.protocol import pubsub + +class MyRobot: + def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): + self.dimos = None + self.camera = None + self.manipulation = None + self.skill_library = SkillLibrary() + + self.capabilities = robot_capabilities or [ + RobotCapability.VISION, + RobotCapability.MANIPULATION, + ] + + async def start(self): + # Start DIMOS + self.dimos = core.start(2) + + # Enable LCM + pubsub.lcm.autoconf() + + # Deploy camera + self.camera = self.dimos.deploy( + CameraModule, + camera_id=0, + fps=30 + ) + + # Configure camera LCM + self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image) + self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image) + self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo) + + # Deploy manipulation + self.manipulation = self.dimos.deploy(ManipulationModule) + + # Connect modules + self.manipulation.rgb_image.connect(self.camera.color_image) + self.manipulation.depth_image.connect(self.camera.depth_image) + self.manipulation.camera_info.connect(self.camera.camera_info) + + # Configure manipulation output + self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image) + + # Start modules + self.camera.start() + self.manipulation.start() + + await asyncio.sleep(2) # Allow initialization + + def get_skills(self): + return self.skill_library + + def stop(self): + if self.manipulation: + self.manipulation.stop() + if self.camera: + self.camera.stop() + if self.dimos: + self.dimos.close() +``` + +### Step 2: Create Run Script (`run.py`) + +```python +#!/usr/bin/env python3 +import asyncio +import os +from my_robot import MyRobot +from dimos.agents.claude_agent import ClaudeAgent +from dimos.skills.basic import BasicSkill +from dimos.web.robot_web_interface import RobotWebInterface +import reactivex as rx +import reactivex.operators as ops + +SYSTEM_PROMPT = """You are a helpful robot assistant.""" + +def main(): + # Check API key + if not os.getenv("ANTHROPIC_API_KEY"): + print("Please set ANTHROPIC_API_KEY") + return + + # Create robot + robot = MyRobot() + + try: + # Start robot + asyncio.run(robot.start()) + + # Set up skills + skills = robot.get_skills() + skills.add(BasicSkill) + skills.create_instance("BasicSkill", robot=robot) + + # Set up streams + agent_response_subject = rx.subject.Subject() + agent_response_stream = agent_response_subject.pipe(ops.share()) + + # Create web interface + web_interface = RobotWebInterface( + port=5555, + text_streams={"agent_responses": agent_response_stream} + ) + + # Create agent + agent = ClaudeAgent( + dev_name="my_agent", + input_query_stream=web_interface.query_stream, + skills=skills, + system_query=SYSTEM_PROMPT + ) + + # Connect responses + agent.get_response_observable().subscribe( + lambda x: agent_response_subject.on_next(x) + ) + + print("Robot ready at http://localhost:5555") + + # Run + web_interface.run() + + finally: + robot.stop() + +if __name__ == "__main__": + main() +``` + +### Step 3: Define 
Skills (`skills.py`) + +```python +from dimos.skills import Skill, skill + +@skill( + description="Perform a basic action", + parameters={ + "action": "The action to perform" + } +) +class BasicSkill(Skill): + def __init__(self, robot): + self.robot = robot + + def run(self, action: str): + # Implement skill logic + return f"Performed: {action}" +``` + +## Best Practices + +1. **Module Lifecycle**: Always start DIMOS before deploying modules +2. **LCM Topics**: Use descriptive topic names with namespaces +3. **Error Handling**: Wrap module operations in try-except blocks +4. **Resource Cleanup**: Ensure proper cleanup in stop() methods +5. **Async Operations**: Use asyncio for non-blocking operations +6. **Stream Management**: Use RxPy for reactive programming patterns + +## Debugging Tips + +1. **Check Module Status**: Print module.io().result() to see connections +2. **Monitor LCM**: Use Foxglove to visualize LCM messages +3. **Log Everything**: Use dimos.utils.logging_config.setup_logger() +4. **Test Modules Independently**: Deploy and test one module at a time + +## Common Issues + +1. **"Module not started"**: Ensure start() is called after deployment +2. **"No data received"**: Check LCM transport configuration +3. **"Connection failed"**: Verify input/output types match +4. **"Cleanup errors"**: Stop modules before closing DIMOS \ No newline at end of file diff --git a/dimos/robot/agilex/README_CN.md b/dimos/robot/agilex/README_CN.md new file mode 100644 index 0000000000..482a09dd6d --- /dev/null +++ b/dimos/robot/agilex/README_CN.md @@ -0,0 +1,465 @@ +# DIMOS 机械臂机器人开发指南 + +本指南介绍如何创建机器人类、集成智能体(Agent)以及使用 DIMOS 模块系统和 LCM 传输。 + +## 目录 +1. [机器人类架构](#机器人类架构) +2. [模块系统与 LCM 传输](#模块系统与-lcm-传输) +3. [智能体集成](#智能体集成) +4. [完整示例](#完整示例) + +## 机器人类架构 + +### 基本机器人类结构 + +DIMOS 机器人类应遵循以下模式: + +```python +from typing import Optional, List +from dimos import core +from dimos.types.robot_capabilities import RobotCapability + +class YourRobot: + """您的机器人实现。""" + + def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): + # 核心组件 + self.dimos = None + self.modules = {} + self.skill_library = SkillLibrary() + + # 定义能力 + self.capabilities = robot_capabilities or [ + RobotCapability.VISION, + RobotCapability.MANIPULATION, + ] + + async def start(self): + """启动机器人模块。""" + # 初始化 DIMOS,指定工作线程数 + self.dimos = core.start(2) # 需要的工作线程数 + + # 部署模块 + # ... (参见模块系统章节) + + def stop(self): + """停止所有模块并清理资源。""" + # 停止模块 + # 关闭 DIMOS + if self.dimos: + self.dimos.close() +``` + +### 关键组件说明 + +1. **初始化**:存储模块、技能和能力的引用 +2. **异步启动**:模块必须异步部署 +3. 
**正确清理**:在关闭 DIMOS 之前始终停止模块 + +## 模块系统与 LCM 传输 + +### 理解 DIMOS 模块 + +模块是 DIMOS 机器人的构建块。它们: +- 处理数据流(输入) +- 产生输出 +- 可以相互连接 +- 通过 LCM(轻量级通信和编组)进行通信 + +### 部署模块 + +```python +# 部署相机模块 +self.camera = self.dimos.deploy( + ZEDModule, # 模块类 + camera_id=0, # 模块参数 + resolution="HD720", + depth_mode="NEURAL", + fps=30, + publish_rate=30.0, + frame_id="camera_frame" +) +``` + +### 设置 LCM 传输 + +LCM 传输实现模块间通信: + +```python +# 启用 LCM 自动配置 +from dimos.protocol import pubsub +pubsub.lcm.autoconf() + +# 配置输出传输 +self.camera.color_image.transport = core.LCMTransport( + "/camera/color_image", # 主题名称 + Image # 消息类型 +) +self.camera.depth_image.transport = core.LCMTransport( + "/camera/depth_image", + Image +) +``` + +### 连接模块 + +将模块输出连接到输入: + +```python +# 将操作模块连接到相机输出 +self.manipulation.rgb_image.connect(self.camera.color_image) # ROS set_callback +self.manipulation.depth_image.connect(self.camera.depth_image) +self.manipulation.camera_info.connect(self.camera.camera_info) +``` + +### 模块通信模式 + +``` +┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐ +│ 相机模块 │────────▶│ 操作模块 │────────▶│ 可视化输出 │ +│ │ 消息 │ │ 消息 │ │ +└──────────────┘ └────────────────┘ └──────────────┘ + ▲ ▲ + │ │ + └──────────────────────────┘ + 直接连接(RPC指令) +``` + +## 智能体集成 + +### 设置智能体与机器人 + +运行文件的智能体集成模式: + +```python +#!/usr/bin/env python3 +import asyncio +import reactivex as rx +from dimos.agents.claude_agent import ClaudeAgent +from dimos.web.robot_web_interface import RobotWebInterface + +def main(): + # 1. 创建并启动机器人 + robot = YourRobot() + asyncio.run(robot.start()) + + # 2. 设置技能 + skills = robot.get_skills() + skills.add(YourSkill) + skills.create_instance("YourSkill", robot=robot) + + # 3. 设置响应式流 + agent_response_subject = rx.subject.Subject() + agent_response_stream = agent_response_subject.pipe(ops.share()) + + # 4. 创建 Web 界面 + web_interface = RobotWebInterface( + port=5555, + text_streams={"agent_responses": agent_response_stream}, + audio_subject=rx.subject.Subject() + ) + + # 5. 创建智能体 + agent = ClaudeAgent( + dev_name="your_agent", + input_query_stream=web_interface.query_stream, + skills=skills, + system_query="您的系统提示词", + model_name="claude-3-5-haiku-latest" + ) + + # 6. 连接智能体响应 + agent.get_response_observable().subscribe( + lambda x: agent_response_subject.on_next(x) + ) + + # 7. 运行界面 + web_interface.run() +``` + +### 关键集成点 + +1. **响应式流**:使用 RxPy 进行事件驱动通信 +2. **Web 界面**:提供用户输入/输出 +3. **智能体**:处理自然语言并执行技能 +4. 
**技能**:将机器人能力定义为可执行动作 + +## 完整示例 + +### 步骤 1:创建机器人类(`my_robot.py`) + +```python +import asyncio +from typing import Optional, List +from dimos import core +from dimos.hardware.camera import CameraModule +from dimos.manipulation.module import ManipulationModule +from dimos.skills.skills import SkillLibrary +from dimos.types.robot_capabilities import RobotCapability +from dimos_lcm.sensor_msgs import Image, CameraInfo +from dimos.protocol import pubsub + +class MyRobot: + def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): + self.dimos = None + self.camera = None + self.manipulation = None + self.skill_library = SkillLibrary() + + self.capabilities = robot_capabilities or [ + RobotCapability.VISION, + RobotCapability.MANIPULATION, + ] + + async def start(self): + # 启动 DIMOS + self.dimos = core.start(2) + + # 启用 LCM + pubsub.lcm.autoconf() + + # 部署相机 + self.camera = self.dimos.deploy( + CameraModule, + camera_id=0, + fps=30 + ) + + # 配置相机 LCM + self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image) + self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image) + self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo) + + # 部署操作模块 + self.manipulation = self.dimos.deploy(ManipulationModule) + + # 连接模块 + self.manipulation.rgb_image.connect(self.camera.color_image) + self.manipulation.depth_image.connect(self.camera.depth_image) + self.manipulation.camera_info.connect(self.camera.camera_info) + + # 配置操作输出 + self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image) + + # 启动模块 + self.camera.start() + self.manipulation.start() + + await asyncio.sleep(2) # 允许初始化 + + def get_skills(self): + return self.skill_library + + def stop(self): + if self.manipulation: + self.manipulation.stop() + if self.camera: + self.camera.stop() + if self.dimos: + self.dimos.close() +``` + +### 步骤 2:创建运行脚本(`run.py`) + +```python +#!/usr/bin/env python3 +import asyncio +import os +from my_robot import MyRobot +from dimos.agents.claude_agent import ClaudeAgent +from dimos.skills.basic import BasicSkill +from dimos.web.robot_web_interface import RobotWebInterface +import reactivex as rx +import reactivex.operators as ops + +SYSTEM_PROMPT = """您是一个有用的机器人助手。""" + +def main(): + # 检查 API 密钥 + if not os.getenv("ANTHROPIC_API_KEY"): + print("请设置 ANTHROPIC_API_KEY") + return + + # 创建机器人 + robot = MyRobot() + + try: + # 启动机器人 + asyncio.run(robot.start()) + + # 设置技能 + skills = robot.get_skills() + skills.add(BasicSkill) + skills.create_instance("BasicSkill", robot=robot) + + # 设置流 + agent_response_subject = rx.subject.Subject() + agent_response_stream = agent_response_subject.pipe(ops.share()) + + # 创建 Web 界面 + web_interface = RobotWebInterface( + port=5555, + text_streams={"agent_responses": agent_response_stream} + ) + + # 创建智能体 + agent = ClaudeAgent( + dev_name="my_agent", + input_query_stream=web_interface.query_stream, + skills=skills, + system_query=SYSTEM_PROMPT + ) + + # 连接响应 + agent.get_response_observable().subscribe( + lambda x: agent_response_subject.on_next(x) + ) + + print("机器人就绪,访问 http://localhost:5555") + + # 运行 + web_interface.run() + + finally: + robot.stop() + +if __name__ == "__main__": + main() +``` + +### 步骤 3:定义技能(`skills.py`) + +```python +from dimos.skills import Skill, skill + +@skill( + description="执行一个基本动作", + parameters={ + "action": "要执行的动作" + } +) +class BasicSkill(Skill): + def __init__(self, robot): + self.robot = robot + + def run(self, action: str): + # 实现技能逻辑 + return f"已执行:{action}" 
+``` + +## 最佳实践 + +1. **模块生命周期**:在部署模块之前始终先启动 DIMOS +2. **LCM 主题**:使用带命名空间的描述性主题名称 +3. **错误处理**:用 try-except 块包装模块操作 +4. **资源清理**:确保在 stop() 方法中正确清理 +5. **异步操作**:使用 asyncio 进行非阻塞操作 +6. **流管理**:使用 RxPy 进行响应式编程模式 + +## 调试技巧 + +1. **检查模块状态**:打印 module.io().result() 查看连接 +2. **监控 LCM**:使用 Foxglove 可视化 LCM 消息 +3. **记录一切**:使用 dimos.utils.logging_config.setup_logger() +4. **独立测试模块**:一次部署和测试一个模块 + +## 常见问题 + +1. **"模块未启动"**:确保在部署后调用 start() +2. **"未收到数据"**:检查 LCM 传输配置 +3. **"连接失败"**:验证输入/输出类型是否匹配 +4. **"清理错误"**:在关闭 DIMOS 之前停止模块 + +## 高级主题 + +### 自定义模块开发 + +创建自定义模块的基本结构: + +```python +from dimos.core import Module, In, Out, rpc + +class CustomModule(Module): + # 定义输入 + input_data: In[DataType] = None + + # 定义输出 + output_data: Out[DataType] = None + + def __init__(self, param1, param2, **kwargs): + super().__init__(**kwargs) + self.param1 = param1 + self.param2 = param2 + + @rpc + def start(self): + """启动模块处理。""" + self.input_data.subscribe(self._process_data) + + def _process_data(self, data): + """处理输入数据。""" + # 处理逻辑 + result = self.process(data) + # 发布输出 + self.output_data.publish(result) + + @rpc + def stop(self): + """停止模块。""" + # 清理资源 + pass +``` + +### 技能开发指南 + +技能是机器人可执行的高级动作: + +```python +from dimos.skills import Skill, skill +from typing import Optional + +@skill( + description="复杂操作技能", + parameters={ + "target": "目标对象", + "location": "目标位置" + } +) +class ComplexSkill(Skill): + def __init__(self, robot, **kwargs): + super().__init__(**kwargs) + self.robot = robot + + def run(self, target: str, location: Optional[str] = None): + """执行技能逻辑。""" + try: + # 1. 感知阶段 + object_info = self.robot.detect_object(target) + + # 2. 规划阶段 + if location: + plan = self.robot.plan_movement(object_info, location) + + # 3. 执行阶段 + result = self.robot.execute_plan(plan) + + return { + "success": True, + "message": f"成功移动 {target} 到 {location}" + } + + except Exception as e: + return { + "success": False, + "error": str(e) + } +``` + +### 性能优化 + +1. **并行处理**:使用多个工作线程处理不同模块 +2. **数据缓冲**:为高频数据流实现缓冲机制 +3. **延迟加载**:仅在需要时初始化重型模块 +4. **资源池化**:重用昂贵的资源(如神经网络模型) + +希望本指南能帮助您快速上手 DIMOS 机器人开发! \ No newline at end of file diff --git a/dimos/robot/agilex/piper_arm.py b/dimos/robot/agilex/piper_arm.py new file mode 100644 index 0000000000..63dc419a78 --- /dev/null +++ b/dimos/robot/agilex/piper_arm.py @@ -0,0 +1,201 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import logging +from typing import Optional, List + +from dimos import core +from dimos.hardware.zed_camera import ZEDModule +from dimos.manipulation.visual_servoing.manipulation_module import ManipulationModule +from dimos_lcm.sensor_msgs import Image +from dimos.protocol import pubsub +from dimos.skills.skills import SkillLibrary +from dimos.types.robot_capabilities import RobotCapability +from dimos.robot.foxglove_bridge import FoxgloveBridge +from dimos.utils.logging_config import setup_logger + +# Import LCM message types +from dimos_lcm.sensor_msgs import CameraInfo + +logger = setup_logger("dimos.robot.agilex.piper_arm") + + +class PiperArmRobot: + """Piper Arm robot with ZED camera and manipulation capabilities.""" + + def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None): + self.dimos = None + self.stereo_camera = None + self.manipulation_interface = None + self.skill_library = SkillLibrary() + + # Initialize capabilities + self.capabilities = robot_capabilities or [ + RobotCapability.VISION, + RobotCapability.MANIPULATION, + ] + + async def start(self): + """Start the robot modules.""" + # Start Dimos + self.dimos = core.start(2) # Need 2 workers for ZED and manipulation modules + self.foxglove_bridge = FoxgloveBridge() + + # Enable LCM auto-configuration + pubsub.lcm.autoconf() + + # Deploy ZED module + logger.info("Deploying ZED module...") + self.stereo_camera = self.dimos.deploy( + ZEDModule, + camera_id=0, + resolution="HD720", + depth_mode="NEURAL", + fps=30, + enable_tracking=False, # We don't need tracking for manipulation + publish_rate=30.0, + frame_id="zed_camera", + ) + + # Configure ZED LCM transports + self.stereo_camera.color_image.transport = core.LCMTransport("/zed/color_image", Image) + self.stereo_camera.depth_image.transport = core.LCMTransport("/zed/depth_image", Image) + self.stereo_camera.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo) + + # Deploy manipulation module + logger.info("Deploying manipulation module...") + self.manipulation_interface = self.dimos.deploy(ManipulationModule) + + # Connect manipulation inputs to ZED outputs + self.manipulation_interface.rgb_image.connect(self.stereo_camera.color_image) + self.manipulation_interface.depth_image.connect(self.stereo_camera.depth_image) + self.manipulation_interface.camera_info.connect(self.stereo_camera.camera_info) + + # Configure manipulation output + self.manipulation_interface.viz_image.transport = core.LCMTransport( + "/manipulation/viz", Image + ) + + # Print module info + logger.info("Modules configured:") + print("\nZED Module:") + print(self.stereo_camera.io().result()) + print("\nManipulation Module:") + print(self.manipulation_interface.io().result()) + + # Start modules + logger.info("Starting modules...") + self.foxglove_bridge.start() + self.stereo_camera.start() + self.manipulation_interface.start() + + # Give modules time to initialize + await asyncio.sleep(2) + + logger.info("PiperArmRobot initialized and started") + + def get_skills(self): + """Get the robot's skill library. + + Returns: + The robot's skill library for adding/managing skills + """ + return self.skill_library + + def pick_and_place( + self, pick_x: int, pick_y: int, place_x: Optional[int] = None, place_y: Optional[int] = None + ): + """Execute pick and place task. 
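+
+        Example (pixel coordinates are illustrative):
+            robot.pick_and_place(320, 240)              # pick only
+            robot.pick_and_place(320, 240, 500, 180)    # pick, then place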
+ + Args: + pick_x: X coordinate for pick location + pick_y: Y coordinate for pick location + place_x: X coordinate for place location (optional) + place_y: Y coordinate for place location (optional) + + Returns: + Result of the pick and place operation + """ + if self.manipulation_interface: + return self.manipulation_interface.pick_and_place(pick_x, pick_y, place_x, place_y) + else: + logger.error("Manipulation module not initialized") + return False + + def handle_keyboard_command(self, key: str): + """Pass keyboard commands to manipulation module. + + Args: + key: Keyboard key pressed + + Returns: + Action taken or None + """ + if self.manipulation_interface: + return self.manipulation_interface.handle_keyboard_command(key) + else: + logger.error("Manipulation module not initialized") + return None + + def has_capability(self, capability: RobotCapability) -> bool: + """Check if the robot has a specific capability. + + Args: + capability: The capability to check for + + Returns: + bool: True if the robot has the capability + """ + return capability in self.capabilities + + def stop(self): + """Stop all modules and clean up.""" + logger.info("Stopping PiperArmRobot...") + + try: + if self.manipulation_interface: + self.manipulation_interface.stop() + self.manipulation_interface.cleanup() + + if self.stereo_camera: + self.stereo_camera.stop() + except Exception as e: + logger.warning(f"Error stopping modules: {e}") + + # Close dimos last to ensure workers are available for cleanup + if self.dimos: + self.dimos.close() + + logger.info("PiperArmRobot stopped") + + +async def run_piper_arm(): + """Run the Piper Arm robot.""" + robot = PiperArmRobot() + + await robot.start() + + # Keep the robot running + try: + while True: + await asyncio.sleep(1) + except KeyboardInterrupt: + logger.info("Keyboard interrupt received") + finally: + await robot.stop() + + +if __name__ == "__main__": + asyncio.run(run_piper_arm()) diff --git a/dimos/robot/agilex/run.py b/dimos/robot/agilex/run.py new file mode 100644 index 0000000000..c9e5a036d8 --- /dev/null +++ b/dimos/robot/agilex/run.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run script for Piper Arm robot with Claude agent integration. +Provides manipulation capabilities with natural language interface. 
+""" + +import asyncio +import os +import sys +import time +from dotenv import load_dotenv + +import reactivex as rx +import reactivex.operators as ops + +from dimos.robot.agilex.piper_arm import PiperArmRobot +from dimos.agents.claude_agent import ClaudeAgent +from dimos.skills.manipulation.pick_and_place import PickAndPlace +from dimos.skills.kill_skill import KillSkill +from dimos.skills.observe import Observe +from dimos.web.robot_web_interface import RobotWebInterface +from dimos.stream.audio.pipelines import stt, tts +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("dimos.robot.agilex.run") + +# Load environment variables +load_dotenv() + +# System prompt for the Piper Arm manipulation agent +SYSTEM_PROMPT = """You are an intelligent robotic assistant controlling a Piper Arm robot with advanced manipulation capabilities. Your primary role is to help users with pick and place tasks using natural language understanding. + +## Your Capabilities: +1. **Visual Perception**: You have access to a ZED stereo camera that provides RGB and depth information +2. **Object Manipulation**: You can pick up and place objects using a 6-DOF robotic arm with a gripper +3. **Language Understanding**: You use the Qwen vision-language model to identify objects and locations from natural language descriptions + +## Available Skills: +- **PickAndPlace**: Execute pick and place operations based on object and location descriptions + - Pick only: "Pick up the red mug" + - Pick and place: "Move the book to the shelf" +- **Observe**: Capture and analyze the current camera view +- **KillSkill**: Stop any currently running skill + +## Guidelines: +1. **Safety First**: Always ensure safe operation. If unsure about an object's graspability or a placement location's stability, ask for clarification +2. **Clear Communication**: Explain what you're doing and ask for confirmation when needed +3. **Error Handling**: If a task fails, explain why and suggest alternatives +4. **Precision**: When users give specific object descriptions, use them exactly as provided to the vision model + +## Interaction Examples: +- User: "Pick up the coffee mug" + You: "I'll pick up the coffee mug for you." [Execute PickAndPlace with object_query="coffee mug"] + +- User: "Put the toy on the table" + You: "I'll place the toy on the table." [Execute PickAndPlace with object_query="toy", target_query="on the table"] + +- User: "What do you see?" + You: "Let me take a look at the current view." [Execute Observe] + +Remember: You're here to assist with manipulation tasks. 
Be helpful, precise, and always prioritize safe operation of the robot.""" + + +def main(): + """Main entry point.""" + print("\n" + "=" * 60) + print("Piper Arm Robot with Claude Agent") + print("=" * 60) + print("\nThis system integrates:") + print(" - Piper Arm 6-DOF robot") + print(" - ZED stereo camera") + print(" - Claude AI for natural language understanding") + print(" - Qwen VLM for visual object detection") + print(" - Web interface with text and voice input") + print(" - Foxglove visualization via LCM") + print("\nStarting system...\n") + + # Check for API key + if not os.getenv("ANTHROPIC_API_KEY"): + print("WARNING: ANTHROPIC_API_KEY not found in environment") + print("Please set your API key in .env file or environment") + sys.exit(1) + + logger.info("Starting Piper Arm Robot with Agent") + + # Create robot instance + robot = PiperArmRobot() + + try: + # Start the robot (this is async, so we need asyncio.run) + logger.info("Initializing robot...") + asyncio.run(robot.start()) + logger.info("Robot initialized successfully") + + # Set up skill library + skills = robot.get_skills() + skills.add(PickAndPlace) + skills.add(Observe) + skills.add(KillSkill) + + # Create skill instances + skills.create_instance("PickAndPlace", robot=robot) + skills.create_instance("Observe", robot=robot) + skills.create_instance("KillSkill", robot=robot, skill_library=skills) + + logger.info(f"Skills registered: {[skill.__name__ for skill in skills.get_class_skills()]}") + + # Set up streams for agent and web interface + agent_response_subject = rx.subject.Subject() + agent_response_stream = agent_response_subject.pipe(ops.share()) + audio_subject = rx.subject.Subject() + + # Set up streams for web interface + streams = {} + + text_streams = { + "agent_responses": agent_response_stream, + } + + # Create web interface first (needed for agent) + try: + web_interface = RobotWebInterface( + port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams + ) + logger.info("Web interface created successfully") + except Exception as e: + logger.error(f"Failed to create web interface: {e}") + raise + + # Set up speech-to-text + stt_node = stt() + stt_node.consume_audio(audio_subject.pipe(ops.share())) + + # Create Claude agent + agent = ClaudeAgent( + dev_name="piper_arm_agent", + input_query_stream=web_interface.query_stream, # Use text input from web interface + # input_query_stream=stt_node.emit_text(), # Uncomment to use voice input + skills=skills, + system_query=SYSTEM_PROMPT, + model_name="claude-3-5-haiku-latest", + thinking_budget_tokens=0, + max_output_tokens_per_request=4096, + ) + + # Subscribe to agent responses + agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x)) + + # Set up text-to-speech for agent responses + tts_node = tts() + tts_node.consume_text(agent.get_response_observable()) + + logger.info("=" * 60) + logger.info("Piper Arm Agent Ready!") + logger.info(f"Web interface available at: http://localhost:5555") + logger.info("Foxglove visualization available at: ws://localhost:8765") + logger.info("You can:") + logger.info(" - Type commands in the web interface") + logger.info(" - Use voice commands") + logger.info(" - Ask the robot to pick up objects") + logger.info(" - Ask the robot to move objects to locations") + logger.info("=" * 60) + + # Run web interface (this blocks) + web_interface.run() + + except KeyboardInterrupt: + logger.info("Keyboard interrupt received") + except Exception as e: + logger.error(f"Error running robot: {e}") + 
import traceback + + traceback.print_exc() + finally: + logger.info("Shutting down...") + # Stop the robot (this is also async) + robot.stop() + logger.info("Robot stopped") + + +if __name__ == "__main__": + main() diff --git a/dimos/robot/frontier_exploration/__init__.py b/dimos/robot/frontier_exploration/__init__.py index 2b69011a9f..8b13789179 100644 --- a/dimos/robot/frontier_exploration/__init__.py +++ b/dimos/robot/frontier_exploration/__init__.py @@ -1 +1 @@ -from utils import * + diff --git a/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py index c9b75b28d8..cd344dd0b4 100644 --- a/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py +++ b/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py @@ -25,7 +25,7 @@ ) from dimos.robot.unitree_webrtc.type.lidar import LidarMessage from dimos.robot.unitree_webrtc.type.map import Map -from dimos.types.vector import Vector +from dimos.msgs.geometry_msgs import Vector3 as Vector from dimos.utils.testing import SensorReplay @@ -130,13 +130,6 @@ def test_exploration_goal_selection(): assert isinstance(goal, Vector), "Goal should be a Vector" print(f"Selected exploration goal: ({goal.x:.2f}, {goal.y:.2f})") - # Verify goal is at reasonable distance from robot - distance = np.sqrt((goal.x - robot_pose.x) ** 2 + (goal.y - robot_pose.y) ** 2) - print(f"Goal distance from robot: {distance:.2f}m") - assert distance >= explorer.min_distance_from_robot, ( - "Goal should respect minimum distance from robot" - ) - # Test that goal gets marked as explored assert len(explorer.explored_goals) == 1, "Goal should be marked as explored" assert explorer.explored_goals[0] == goal, "Explored goal should match selected goal" diff --git a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py index 1aca32fc93..454a70e803 100644 --- a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py +++ b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py @@ -82,11 +82,9 @@ class WavefrontFrontierExplorer: def __init__( self, - min_frontier_size: int = 10, + min_frontier_size: int = 8, occupancy_threshold: int = 65, - subsample_resolution: int = 2, - min_distance_from_robot: float = 0.5, - explored_area_buffer: float = 0.5, + subsample_resolution: int = 3, min_distance_from_obstacles: float = 0.6, info_gain_threshold: float = 0.03, num_no_gain_attempts: int = 4, @@ -101,8 +99,6 @@ def __init__( min_frontier_size: Minimum number of points to consider a valid frontier occupancy_threshold: Cost threshold above which a cell is considered occupied (0-255) subsample_resolution: Factor by which to subsample the costmap for faster processing (1=no subsampling, 2=half resolution, 4=quarter resolution) - min_distance_from_robot: Minimum distance frontier must be from robot (meters) - explored_area_buffer: Buffer distance around free areas to consider as explored (meters) min_distance_from_obstacles: Minimum distance frontier must be from obstacles (meters) info_gain_threshold: Minimum percentage increase in costmap information required to continue exploration (0.05 = 5%) num_no_gain_attempts: Maximum number of consecutive attempts with no information gain @@ -113,8 +109,6 @@ def __init__( self.min_frontier_size = min_frontier_size self.occupancy_threshold = occupancy_threshold self.subsample_resolution = subsample_resolution - 
self.min_distance_from_robot = min_distance_from_robot - self.explored_area_buffer = explored_area_buffer self.min_distance_from_obstacles = min_distance_from_obstacles self.info_gain_threshold = info_gain_threshold self.num_no_gain_attempts = num_no_gain_attempts @@ -513,15 +507,6 @@ def _rank_frontiers( valid_frontiers = [] for i, frontier in enumerate(frontier_centroids): - robot_distance = np.sqrt( - (frontier.x - robot_pose.x) ** 2 + (frontier.y - robot_pose.y) ** 2 - ) - - # Filter 1: Skip frontiers too close to robot - if robot_distance < self.min_distance_from_robot: - continue - - # Filter 2: Skip frontiers too close to obstacles obstacle_distance = self._compute_distance_to_obstacles(frontier, costmap) if obstacle_distance < self.min_distance_from_obstacles: continue diff --git a/dimos/robot/unitree_webrtc/connection.py b/dimos/robot/unitree_webrtc/connection.py index 9bc1874cbe..6119cba860 100644 --- a/dimos/robot/unitree_webrtc/connection.py +++ b/dimos/robot/unitree_webrtc/connection.py @@ -239,7 +239,7 @@ async def accept_track(track: MediaStreamTrack) -> VideoMessage: if stop_event.is_set(): return frame = await track.recv() - subject.on_next(Image.from_numpy(frame.to_ndarray(format="bgr24"))) + subject.on_next(Image.from_numpy(frame.to_ndarray(format="rgb24"))) self.conn.video.add_track_callback(accept_track) diff --git a/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py b/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py index f2b701fc63..0532be8320 100644 --- a/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py +++ b/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py @@ -94,7 +94,9 @@ def odom_stream(self): @functools.cache def video_stream(self): print("video stream start") - video_store = TimedSensorReplay("unitree_office_walk/video", autocast=Image.from_numpy) + video_store = TimedSensorReplay( + "unitree_office_walk/video", autocast=lambda x: Image.from_numpy(x).to_rgb() + ) return video_store.stream() def move(self, vector: Vector): diff --git a/dimos/skills/manipulation/pick_and_place.py b/dimos/skills/manipulation/pick_and_place.py new file mode 100644 index 0000000000..4306975d8d --- /dev/null +++ b/dimos/skills/manipulation/pick_and_place.py @@ -0,0 +1,439 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pick and place skill for Piper Arm robot. + +This module provides a skill that uses Qwen VLM to identify pick and place +locations based on natural language queries, then executes the manipulation. 
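+
+Typical wiring (sketch; mirrors the run script in this change, with `robot` being
+a PiperArmRobot instance):
+
+    skills = robot.get_skills()
+    skills.add(PickAndPlace)
+    skills.create_instance("PickAndPlace", robot=robot)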
+""" + +import json +import cv2 +import os +from typing import Optional, Tuple, Dict, Any +import numpy as np +from pydantic import Field + +from dimos.skills.manipulation.abstract_manipulation_skill import AbstractManipulationSkill +from dimos.models.qwen.video_query import query_single_frame +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("dimos.skills.manipulation.pick_and_place") + + +def parse_qwen_points_response(response: str) -> Optional[Tuple[Tuple[int, int], Tuple[int, int]]]: + """ + Parse Qwen's response containing two points. + + Args: + response: Qwen's response containing JSON with two points + + Returns: + Tuple of (pick_point, place_point) where each point is (x, y), or None if parsing fails + """ + try: + # Try to extract JSON from the response + start_idx = response.find("{") + end_idx = response.rfind("}") + 1 + + if start_idx >= 0 and end_idx > start_idx: + json_str = response[start_idx:end_idx] + result = json.loads(json_str) + + # Extract pick and place points + if "pick_point" in result and "place_point" in result: + pick = result["pick_point"] + place = result["place_point"] + + # Validate points have x,y coordinates + if ( + isinstance(pick, (list, tuple)) + and len(pick) >= 2 + and isinstance(place, (list, tuple)) + and len(place) >= 2 + ): + return (int(pick[0]), int(pick[1])), (int(place[0]), int(place[1])) + + except Exception as e: + logger.error(f"Error parsing Qwen points response: {e}") + logger.debug(f"Raw response: {response}") + + return None + + +def save_debug_image_with_points( + image: np.ndarray, + pick_point: Optional[Tuple[int, int]] = None, + place_point: Optional[Tuple[int, int]] = None, + filename_prefix: str = "qwen_debug", +) -> str: + """ + Save debug image with crosshairs marking pick and/or place points. + + Args: + image: RGB image array + pick_point: (x, y) coordinates for pick location + place_point: (x, y) coordinates for place location + filename_prefix: Prefix for the saved filename + + Returns: + Path to the saved image + """ + # Create a copy to avoid modifying original + debug_image = image.copy() + + # Draw pick point crosshair (green) + if pick_point: + x, y = pick_point + # Draw crosshair + cv2.drawMarker(debug_image, (x, y), (0, 255, 0), cv2.MARKER_CROSS, 30, 2) + # Draw circle + cv2.circle(debug_image, (x, y), 5, (0, 255, 0), -1) + # Add label + cv2.putText( + debug_image, "PICK", (x + 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2 + ) + + # Draw place point crosshair (cyan) + if place_point: + x, y = place_point + # Draw crosshair + cv2.drawMarker(debug_image, (x, y), (255, 255, 0), cv2.MARKER_CROSS, 30, 2) + # Draw circle + cv2.circle(debug_image, (x, y), 5, (255, 255, 0), -1) + # Add label + cv2.putText( + debug_image, "PLACE", (x + 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2 + ) + + # Draw arrow from pick to place if both exist + if pick_point and place_point: + cv2.arrowedLine(debug_image, pick_point, place_point, (255, 0, 255), 2, tipLength=0.03) + + # Generate filename with timestamp + filename = f"{filename_prefix}.png" + filepath = os.path.join(os.getcwd(), filename) + + # Save image + cv2.imwrite(filepath, debug_image) + logger.info(f"Debug image saved to: {filepath}") + + return filepath + + +def parse_qwen_single_point_response(response: str) -> Optional[Tuple[int, int]]: + """ + Parse Qwen's response containing a single point. 
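+
+    Example (illustrative response strings):
+        parse_qwen_single_point_response('{"point": [412, 233]}')  returns (412, 233)
+        parse_qwen_single_point_response("no json here")           returns None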
+ + Args: + response: Qwen's response containing JSON with a point + + Returns: + Tuple of (x, y) or None if parsing fails + """ + try: + # Try to extract JSON from the response + start_idx = response.find("{") + end_idx = response.rfind("}") + 1 + + if start_idx >= 0 and end_idx > start_idx: + json_str = response[start_idx:end_idx] + result = json.loads(json_str) + + # Try different possible keys + point = None + for key in ["point", "location", "position", "coordinates"]: + if key in result: + point = result[key] + break + + # Validate point has x,y coordinates + if point and isinstance(point, (list, tuple)) and len(point) >= 2: + return int(point[0]), int(point[1]) + + except Exception as e: + logger.error(f"Error parsing Qwen single point response: {e}") + logger.debug(f"Raw response: {response}") + + return None + + +class PickAndPlace(AbstractManipulationSkill): + """ + A skill that performs pick and place operations using vision-language guidance. + + This skill uses Qwen VLM to identify objects and locations based on natural + language queries, then executes pick and place operations using the robot's + manipulation interface. + + Example usage: + # Just pick the object + skill = PickAndPlace(robot=robot, object_query="red mug") + + # Pick and place the object + skill = PickAndPlace(robot=robot, object_query="red mug", target_query="on the coaster") + + The skill uses the robot's stereo camera to capture RGB images and its manipulation + interface to execute the pick and place operation. It automatically handles coordinate + transformation from 2D pixel coordinates to 3D world coordinates. + """ + + object_query: str = Field( + "mug", + description="Natural language description of the object to pick (e.g., 'red mug', 'small box')", + ) + + target_query: Optional[str] = Field( + None, + description="Natural language description of where to place the object (e.g., 'on the table', 'in the basket'). If not provided, only pick operation will be performed.", + ) + + model_name: str = Field( + "qwen2.5-vl-72b-instruct", description="Qwen model to use for visual queries" + ) + + def __init__(self, robot=None, **data): + """ + Initialize the PickAndPlace skill. + + Args: + robot: The PiperArmRobot instance + **data: Additional configuration data + """ + super().__init__(robot=robot, **data) + + def _get_camera_frame(self) -> Optional[np.ndarray]: + """ + Get a single RGB frame from the robot's camera. + + Returns: + RGB image as numpy array or None if capture fails + """ + if not self._robot or not self._robot.manipulation_interface: + logger.error("Robot or stereo camera not available") + return None + + try: + # Use the RPC call to get a single RGB frame + rgb_frame = self._robot.manipulation_interface.get_single_rgb_frame() + if rgb_frame is None: + logger.error("Failed to capture RGB frame from camera") + return rgb_frame + except Exception as e: + logger.error(f"Error getting camera frame: {e}") + return None + + def _query_pick_and_place_points( + self, frame: np.ndarray + ) -> Optional[Tuple[Tuple[int, int], Tuple[int, int]]]: + """ + Query Qwen to get both pick and place points in a single query. + + Args: + frame: RGB image array + + Returns: + Tuple of (pick_point, place_point) or None if query fails + """ + # This method is only called when both object and target are specified + prompt = ( + f"Look at this image carefully. I need you to identify two specific locations:\n" + f"1. Find the {self.object_query} - this is the object I want to pick up\n" + f"2. 
Identify where to place it {self.target_query}\n\n" + "Instructions:\n" + "- The pick_point should be at the center or graspable part of the object\n" + "- The place_point should be a stable, flat surface at the target location\n" + "- Consider the object's size when choosing the placement point\n\n" + "Return ONLY a JSON object with this exact format:\n" + "{'pick_point': [x, y], 'place_point': [x, y]}\n" + "where [x, y] are pixel coordinates in the image." + ) + + try: + response = query_single_frame(frame, prompt, model_name=self.model_name) + return parse_qwen_points_response(response) + except Exception as e: + logger.error(f"Error querying Qwen for pick and place points: {e}") + return None + + def _query_single_point( + self, frame: np.ndarray, query: str, point_type: str + ) -> Optional[Tuple[int, int]]: + """ + Query Qwen to get a single point location. + + Args: + frame: RGB image array + query: Natural language description of what to find + point_type: Type of point ('pick' or 'place') for context + + Returns: + Tuple of (x, y) pixel coordinates or None if query fails + """ + if point_type == "pick": + prompt = ( + f"Look at this image carefully and find the {query}.\n\n" + "Instructions:\n" + "- Identify the exact object matching the description\n" + "- Choose the center point or the most graspable location on the object\n" + "- If multiple matching objects exist, choose the most prominent or accessible one\n" + "- Consider the object's shape and material when selecting the grasp point\n\n" + "Return ONLY a JSON object with this exact format:\n" + "{'point': [x, y]}\n" + "where [x, y] are the pixel coordinates of the optimal grasping point on the object." + ) + else: # place + prompt = ( + f"Look at this image and identify where to place an object {query}.\n\n" + "Instructions:\n" + "- Find a stable, flat surface at the specified location\n" + "- Ensure the placement spot is clear of obstacles\n" + "- Consider the size of the object being placed\n" + "- If the query specifies a container or specific spot, center the placement there\n" + "- Otherwise, find the most appropriate nearby surface\n\n" + "Return ONLY a JSON object with this exact format:\n" + "{'point': [x, y]}\n" + "where [x, y] are the pixel coordinates of the optimal placement location." + ) + + try: + response = query_single_frame(frame, prompt, model_name=self.model_name) + return parse_qwen_single_point_response(response) + except Exception as e: + logger.error(f"Error querying Qwen for {point_type} point: {e}") + return None + + def __call__(self) -> Dict[str, Any]: + """ + Execute the pick and place operation. 
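+
+        Captures an RGB frame, queries the VLM for the pick (and optionally place)
+        pixel coordinates, saves a debug image with the selected points, and then
+        delegates execution to the robot's pick_and_place interface.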
+ + Returns: + Dictionary with operation results + """ + super().__call__() + + if not self._robot: + error_msg = "No robot instance provided to PickAndPlace skill" + logger.error(error_msg) + return {"success": False, "error": error_msg} + + # Register skill as running + skill_library = self._robot.get_skills() + self.register_as_running("PickAndPlace", skill_library) + + # Get camera frame + frame = self._get_camera_frame() + if frame is None: + return {"success": False, "error": "Failed to capture camera frame"} + + # Convert RGB to BGR for OpenCV if needed + if len(frame.shape) == 3 and frame.shape[2] == 3: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + + # Get pick and place points from Qwen + pick_point = None + place_point = None + + # Determine mode based on whether target_query is provided + if self.target_query is None: + # Pick only mode + logger.info("Pick-only mode (no target specified)") + + # Query for pick point + pick_point = self._query_single_point(frame, self.object_query, "pick") + if not pick_point: + return {"success": False, "error": f"Failed to find {self.object_query}"} + + # No place point needed for pick-only + place_point = None + else: + # Pick and place mode - can use either single or dual query + logger.info("Pick and place mode (target specified)") + + # Try single query first for efficiency + points = self._query_pick_and_place_points(frame) + pick_point, place_point = points + + logger.info(f"Pick point: {pick_point}, Place point: {place_point}") + + # Save debug image with marked points + if pick_point or place_point: + save_debug_image_with_points(frame, pick_point, place_point) + + # Execute pick (and optionally place) using the robot's interface + try: + if place_point: + # Pick and place + result = self._robot.pick_and_place( + pick_x=pick_point[0], + pick_y=pick_point[1], + place_x=place_point[0], + place_y=place_point[1], + ) + else: + # Pick only + result = self._robot.pick_and_place( + pick_x=pick_point[0], pick_y=pick_point[1], place_x=None, place_y=None + ) + + if result: + if self.target_query: + message = ( + f"Successfully picked {self.object_query} and placed it {self.target_query}" + ) + else: + message = f"Successfully picked {self.object_query}" + + return { + "success": True, + "pick_point": pick_point, + "place_point": place_point, + "object": self.object_query, + "target": self.target_query, + "message": message, + } + else: + operation = "Pick and place" if self.target_query else "Pick" + return { + "success": False, + "pick_point": pick_point, + "place_point": place_point, + "error": f"{operation} operation failed", + } + + except Exception as e: + logger.error(f"Error executing pick and place: {e}") + return { + "success": False, + "error": f"Execution error: {str(e)}", + "pick_point": pick_point, + "place_point": place_point, + } + finally: + # Always unregister skill when done + self.stop() + + def stop(self) -> None: + """ + Stop the pick and place operation and perform cleanup. + """ + logger.info("Stopping PickAndPlace skill") + + # Unregister skill from skill library + if self._robot: + skill_library = self._robot.get_skills() + self.unregister_as_running("PickAndPlace", skill_library) + + logger.info("PickAndPlace skill stopped successfully") diff --git a/dimos/utils/transform_utils.py b/dimos/utils/transform_utils.py index 31d3840884..eaedbcecf3 100644 --- a/dimos/utils/transform_utils.py +++ b/dimos/utils/transform_utils.py @@ -13,10 +13,11 @@ # limitations under the License. 
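+# The helpers below convert between Pose objects, 4x4 homogeneous transforms,
+# optical/robot camera frames, and Euler/quaternion representations.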
import numpy as np -from typing import Tuple, Dict, Any +from typing import Tuple import logging +from scipy.spatial.transform import Rotation as R -from dimos.types.vector import Vector +from dimos_lcm.geometry_msgs import Pose, Point, Vector3, Quaternion logger = logging.getLogger(__name__) @@ -31,25 +32,216 @@ def distance_angle_to_goal_xy(distance: float, angle: float) -> Tuple[float, flo return distance * np.cos(angle), distance * np.sin(angle) +def pose_to_matrix(pose: Pose) -> np.ndarray: + """ + Convert pose to 4x4 homogeneous transform matrix. + + Args: + pose: Pose object with position and orientation (quaternion) + + Returns: + 4x4 transformation matrix + """ + # Extract position + tx, ty, tz = pose.position.x, pose.position.y, pose.position.z + + # Create rotation matrix from quaternion using scipy + quat = [pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w] + + # Check for zero norm quaternion and use identity if invalid + quat_norm = np.linalg.norm(quat) + if quat_norm == 0.0: + # Use identity quaternion [0, 0, 0, 1] if zero norm detected + quat = [0.0, 0.0, 0.0, 1.0] + + rotation = R.from_quat(quat) + Rot = rotation.as_matrix() + + # Create 4x4 transform + T = np.eye(4) + T[:3, :3] = Rot + T[:3, 3] = [tx, ty, tz] + + return T + + +def matrix_to_pose(T: np.ndarray) -> Pose: + """ + Convert 4x4 transformation matrix to Pose object. + + Args: + T: 4x4 transformation matrix + + Returns: + Pose object with position and orientation (quaternion) + """ + # Extract position + pos = Point(T[0, 3], T[1, 3], T[2, 3]) + + # Extract rotation matrix and convert to quaternion + Rot = T[:3, :3] + rotation = R.from_matrix(Rot) + quat = rotation.as_quat() # Returns [x, y, z, w] + + orientation = Quaternion(quat[0], quat[1], quat[2], quat[3]) + + return Pose(pos, orientation) + + +def apply_transform(pose: Pose, transform_matrix: np.ndarray) -> Pose: + """ + Apply a transformation matrix to a pose. + + Args: + pose: Input pose + transform_matrix: 4x4 transformation matrix to apply + + Returns: + Transformed pose + """ + # Convert pose to matrix + T_pose = pose_to_matrix(pose) + + # Apply transform + T_result = transform_matrix @ T_pose + + # Convert back to pose + return matrix_to_pose(T_result) + + +def optical_to_robot_frame(pose: Pose) -> Pose: + """ + Convert pose from optical camera frame to robot frame convention. 
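+
+    For example, a point 1 m directly in front of the camera, optical (0, 0, 1),
+    maps to robot (1, 0, 0).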
+ + Optical Camera Frame (e.g., ZED): + - X: Right + - Y: Down + - Z: Forward (away from camera) + + Robot Frame (ROS/REP-103): + - X: Forward + - Y: Left + - Z: Up + + Args: + pose: Pose in optical camera frame + + Returns: + Pose in robot frame + """ + # Position transformation + robot_x = pose.position.z # Forward = Camera Z + robot_y = -pose.position.x # Left = -Camera X + robot_z = -pose.position.y # Up = -Camera Y + + # Rotation transformation using quaternions + # First convert quaternion to rotation matrix + quat_optical = [pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w] + R_optical = R.from_quat(quat_optical).as_matrix() + + # Coordinate frame transformation matrix from optical to robot + # X_robot = Z_optical, Y_robot = -X_optical, Z_robot = -Y_optical + T_frame = np.array( + [ + [0, 0, 1], # X_robot = Z_optical + [-1, 0, 0], # Y_robot = -X_optical + [0, -1, 0], # Z_robot = -Y_optical + ] + ) + + # Transform the rotation matrix + R_robot = T_frame @ R_optical @ T_frame.T + + # Convert back to quaternion + quat_robot = R.from_matrix(R_robot).as_quat() # [x, y, z, w] + + return Pose( + Point(robot_x, robot_y, robot_z), + Quaternion(quat_robot[0], quat_robot[1], quat_robot[2], quat_robot[3]), + ) + + +def robot_to_optical_frame(pose: Pose) -> Pose: + """ + Convert pose from robot frame to optical camera frame convention. + This is the inverse of optical_to_robot_frame. + + Args: + pose: Pose in robot frame + + Returns: + Pose in optical camera frame + """ + # Position transformation (inverse) + optical_x = -pose.position.y # Right = -Left + optical_y = -pose.position.z # Down = -Up + optical_z = pose.position.x # Forward = Forward + + # Rotation transformation using quaternions + quat_robot = [pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w] + R_robot = R.from_quat(quat_robot).as_matrix() + + # Coordinate frame transformation matrix from Robot to optical (inverse of optical to Robot) + # This is the transpose of the forward transformation + T_frame_inv = np.array( + [ + [0, -1, 0], # X_optical = -Y_robot + [0, 0, -1], # Y_optical = -Z_robot + [1, 0, 0], # Z_optical = X_robot + ] + ) + + # Transform the rotation matrix + R_optical = T_frame_inv @ R_robot @ T_frame_inv.T + + # Convert back to quaternion + quat_optical = R.from_matrix(R_optical).as_quat() # [x, y, z, w] + + return Pose( + Point(optical_x, optical_y, optical_z), + Quaternion(quat_optical[0], quat_optical[1], quat_optical[2], quat_optical[3]), + ) + + +def yaw_towards_point(position: Point, target_point: Point = None) -> float: + """ + Calculate yaw angle from target point to position (away from target). + This is commonly used for object orientation in grasping applications. + Assumes robot frame where X is forward and Y is left. + + Args: + position: Current position in robot frame + target_point: Reference point (default: origin) + + Returns: + Yaw angle in radians pointing from target_point to position + """ + if target_point is None: + target_point = Point(0.0, 0.0, 0.0) + direction_x = position.x - target_point.x + direction_y = position.y - target_point.y + return np.arctan2(direction_y, direction_x) + + def transform_robot_to_map( - robot_position: Vector, robot_rotation: Vector, position: Vector, rotation: Vector -) -> Tuple[Vector, Vector]: + robot_position: Point, robot_rotation: Vector3, position: Point, rotation: Vector3 +) -> Tuple[Point, Vector3]: """Transform position and rotation from robot frame to map frame. 
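+
+    Orientation is combined by adding roll, pitch, and yaw component-wise, with
+    the resulting yaw passed through normalize_angle.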
Args: robot_position: Current robot position in map frame robot_rotation: Current robot rotation in map frame - position: Position in robot frame as Vector (x, y, z) - rotation: Rotation in robot frame as Vector (roll, pitch, yaw) in radians + position: Position in robot frame as Point (x, y, z) + rotation: Rotation in robot frame as Vector3 (roll, pitch, yaw) in radians Returns: Tuple of (transformed_position, transformed_rotation) where: - - transformed_position: Vector (x, y, z) in map frame - - transformed_rotation: Vector (roll, pitch, yaw) in map frame + - transformed_position: Point (x, y, z) in map frame + - transformed_rotation: Vector3 (roll, pitch, yaw) in map frame Example: - obj_pos_robot = Vector(1.0, 0.5, 0.0) # 1m forward, 0.5m left of robot - obj_rot_robot = Vector(0.0, 0.0, 0.0) # No rotation relative to robot + obj_pos_robot = Point(1.0, 0.5, 0.0) # 1m forward, 0.5m left of robot + obj_rot_robot = Vector3(0.0, 0.0, 0.0) # No rotation relative to robot map_pos, map_rot = transform_robot_to_map(robot_position, robot_rotation, obj_pos_robot, obj_rot_robot) """ @@ -79,7 +271,128 @@ def transform_robot_to_map( map_pitch = robot_rot.y + rot_pitch # Add robot's pitch map_yaw_rot = normalize_angle(robot_yaw + rot_yaw) # Add robot's yaw and normalize - transformed_position = Vector(map_x, map_y, map_z) - transformed_rotation = Vector(map_roll, map_pitch, map_yaw_rot) + transformed_position = Point(map_x, map_y, map_z) + transformed_rotation = Vector3(map_roll, map_pitch, map_yaw_rot) return transformed_position, transformed_rotation + + +def create_transform_from_6dof(translation: Vector3, euler_angles: Vector3) -> np.ndarray: + """ + Create a 4x4 transformation matrix from 6DOF parameters. + + Args: + translation: Translation vector [x, y, z] in meters + euler_angles: Euler angles [rx, ry, rz] in radians (XYZ convention) + + Returns: + 4x4 transformation matrix + """ + # Create transformation matrix + T = np.eye(4) + + # Set translation + T[0:3, 3] = [translation.x, translation.y, translation.z] + + # Set rotation using scipy + if np.linalg.norm([euler_angles.x, euler_angles.y, euler_angles.z]) > 1e-6: + rotation = R.from_euler("xyz", [euler_angles.x, euler_angles.y, euler_angles.z]) + T[0:3, 0:3] = rotation.as_matrix() + + return T + + +def invert_transform(T: np.ndarray) -> np.ndarray: + """ + Invert a 4x4 transformation matrix efficiently. + + Args: + T: 4x4 transformation matrix + + Returns: + Inverted 4x4 transformation matrix + """ + # For homogeneous transform matrices, we can use the special structure: + # [R t]^-1 = [R^T -R^T*t] + # [0 1] [0 1 ] + + Rot = T[:3, :3] + t = T[:3, 3] + + T_inv = np.eye(4) + T_inv[:3, :3] = Rot.T + T_inv[:3, 3] = -Rot.T @ t + + return T_inv + + +def compose_transforms(*transforms: np.ndarray) -> np.ndarray: + """ + Compose multiple transformation matrices. + + Args: + *transforms: Variable number of 4x4 transformation matrices + + Returns: + Composed 4x4 transformation matrix (T1 @ T2 @ ... @ Tn) + """ + result = np.eye(4) + for T in transforms: + result = result @ T + return result + + +def euler_to_quaternion(euler_angles: Vector3, degrees: bool = False) -> Quaternion: + """ + Convert euler angles to quaternion. 
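+
+    Pass degrees=True when the input angles are given in degrees. For example,
+    euler_to_quaternion(Vector3(0.0, 0.0, 90.0), degrees=True) is approximately
+    Quaternion(0.0, 0.0, 0.7071, 0.7071), i.e. a 90-degree yaw.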
+ + Args: + euler_angles: Euler angles as Vector3 [roll, pitch, yaw] in radians (XYZ convention) + + Returns: + Quaternion object [x, y, z, w] + """ + rotation = R.from_euler( + "xyz", [euler_angles.x, euler_angles.y, euler_angles.z], degrees=degrees + ) + quat = rotation.as_quat() # Returns [x, y, z, w] + return Quaternion(quat[0], quat[1], quat[2], quat[3]) + + +def quaternion_to_euler(quaternion: Quaternion, degrees: bool = False) -> Vector3: + """ + Convert quaternion to euler angles. + + Args: + quaternion: Quaternion object [x, y, z, w] + + Returns: + Euler angles as Vector3 [roll, pitch, yaw] in radians (XYZ convention) + """ + quat = [quaternion.x, quaternion.y, quaternion.z, quaternion.w] + rotation = R.from_quat(quat) + euler = rotation.as_euler("xyz", degrees=degrees) # Returns [roll, pitch, yaw] + if not degrees: + return Vector3( + normalize_angle(euler[0]), normalize_angle(euler[1]), normalize_angle(euler[2]) + ) + else: + return Vector3(euler[0], euler[1], euler[2]) + + +def get_distance(pose1: Pose, pose2: Pose) -> float: + """ + Calculate Euclidean distance between two poses. + + Args: + pose1: First pose + pose2: Second pose + + Returns: + Euclidean distance between the two poses in meters + """ + dx = pose1.position.x - pose2.position.x + dy = pose1.position.y - pose2.position.y + dz = pose1.position.z - pose2.position.z + + return np.linalg.norm(np.array([dx, dy, dz])) diff --git a/docs/modules_CN.md b/docs/modules_CN.md new file mode 100644 index 0000000000..d8f088ef59 --- /dev/null +++ b/docs/modules_CN.md @@ -0,0 +1,188 @@ +# Dimensional 模块系统 + +DimOS 模块系统使用 Dask 进行计算分布和 LCM(轻量级通信和编组)进行高性能进程间通信,实现分布式、多进程的机器人应用。 + +## 核心概念 + +### 1. 模块定义 +模块是继承自 `dimos.core.Module` 的 Python 类,定义输入、输出和 RPC 方法: + +```python +from dimos.core import Module, In, Out, rpc +from dimos.msgs.geometry_msgs import Vector3 + +class MyModule(Module): # ROS Node + # 将输入/输出声明为初始化为 None 的类属性 + data_in: In[Vector3] = None # ROS Subscriber + data_out: Out[Vector3] = None # ROS Publisher + + def __init__(): + # 调用父类 Module 初始化 + super().__init__() + + @rpc + def remote_method(self, param): + """使用 @rpc 装饰的方法可以远程调用""" + return param * 2 +``` + +### 2. 模块部署 +使用 `dimos.deploy()` 方法在 Dask 工作进程中部署模块: + +```python +from dimos import core + +# 启动具有 N 个工作进程的 Dask 集群 +dimos = core.start(4) + +# 部署模块时可以传递初始化参数 +# 在这种情况下,param1 和 param2 被传递到模块初始化中 +module = dimos.deploy(Module, param1="value1", param2=123) +``` + +### 3. 流连接 +模块通过使用 LCM 传输的响应式流进行通信: + +```python +# 为输出配置 LCM 传输 +module1.data_out.transport = core.LCMTransport("/topic_name", MessageType) + +# 将模块输入连接到输出 +module2.data_in.connect(module1.data_out) + +# 访问底层的 Observable 流 +stream = module1.data_out.observable() +stream.subscribe(lambda msg: print(f"接收到: {msg}")) +``` + +### 4. 
模块生命周期 +```python +# 启动模块以开始处理 +module.start() # 如果定义了 @rpc start() 方法,则调用它 + +# 检查模块 I/O 配置 +print(module.io().result()) # 显示输入、输出和 RPC 方法 + +# 优雅关闭 +dimos.shutdown() +``` + +## 实际示例:机器人控制系统 + +```python +# 连接模块封装机器人硬件/仿真 +connection = dimos.deploy(ConnectionModule, ip=robot_ip) +connection.lidar.transport = core.LCMTransport("/lidar", LidarMessage) +connection.video.transport = core.LCMTransport("/video", Image) + +# 感知模块处理传感器数据 +perception = dimos.deploy(PersonTrackingStream, camera_intrinsics=[...]) +perception.video.connect(connection.video) +perception.tracking_data.transport = core.pLCMTransport("/person_tracking") + +# 开始处理 +connection.start() +perception.start() + +# 通过 RPC 启用跟踪 +perception.enable_tracking() + +# 获取最新的跟踪数据 +data = perception.get_tracking_data() +``` + +## LCM 传输配置 + +```python +# 用于简单类型(如激光雷达)的标准 LCM 传输 +connection.lidar.transport = core.LCMTransport("/lidar", LidarMessage) + +# 用于复杂 Python 对象/字典的基于 pickle 的传输 +connection.tracking_data.transport = core.pLCMTransport("/person_tracking") + +# 自动配置 LCM 系统缓冲区(在容器中必需) +from dimos.protocol import pubsub +pubsub.lcm.autoconf() +``` + +这种架构使得能够将复杂的机器人系统构建为可组合的分布式模块,这些模块通过流和 RPC 高效通信,从单机扩展到集群。 + +# Dimensional 安装指南 +## Python 安装(Ubuntu 22.04) + +```bash +sudo apt install python3-venv + +# 克隆仓库(dev 分支,无子模块) +git clone -b dev https://github.com/dimensionalOS/dimos.git +cd dimos + +# 创建并激活虚拟环境 +python3 -m venv venv +source venv/bin/activate + +sudo apt install portaudio19-dev python3-pyaudio + +# 如果尚未安装,请安装 torch 和 torchvision +# 示例 CUDA 11.7,Pytorch 2.0.1(如果需要不同的 pytorch 版本,请替换) +pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +``` + +### 安装依赖 +```bash +# 仅 CPU(建议首先尝试) +pip install .[cpu,dev] + +# CUDA 安装 +pip install .[cuda,dev] + +# 复制并配置环境变量 +cp default.env .env +``` + +### 测试安装 +```bash +# 运行标准测试 +pytest -s dimos/ + +# 测试模块功能 +pytest -s -m module dimos/ + +# 测试 LCM 通信 +pytest -s -m lcm dimos/ +``` + +# Unitree Go2 快速开始 + +要快速测试模块系统,您可以直接运行 Unitree Go2 多进程示例: + +```bash +# 确保设置了所需的环境变量 +export ROBOT_IP= + +# 运行多进程 Unitree Go2 示例 +python dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py +``` + +## 模块系统的高级特性 + +### 分布式计算 +DimOS 模块系统建立在 Dask 之上,提供了强大的分布式计算能力: + +- **自动负载均衡**:模块自动分布在可用的工作进程中 +- **容错性**:如果工作进程失败,模块可以在其他工作进程上重新启动 +- **可扩展性**:从单机到集群的无缝扩展 + +### 响应式编程模型 +使用 RxPY 实现的响应式流提供了: + +- **异步处理**:非阻塞的数据流处理 +- **背压处理**:自动管理快速生产者和慢速消费者 +- **操作符链**:使用 map、filter、merge 等操作符进行流转换 + +### 性能优化 +LCM 传输针对机器人应用进行了优化: + +- **零拷贝**:大型消息的高效内存使用 +- **低延迟**:微秒级的消息传递 +- **多播支持**:一对多的高效通信 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7d51aa91d4..436b9a5750 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,6 +115,9 @@ manipulation = [ "pyyaml>=6.0", "contact-graspnet-pytorch @ git+https://github.com/dimensionalOS/contact_graspnet_pytorch.git", + # piper arm + "piper-sdk", + # Visualization (Optional) "kaleido>=0.2.1", "plotly>=5.9.0", diff --git a/tests/test_manipulation_perception_pipeline.py b/tests/test_manipulation_perception_pipeline.py index 8b333ec310..227f991650 100644 --- a/tests/test_manipulation_perception_pipeline.py +++ b/tests/test_manipulation_perception_pipeline.py @@ -36,7 +36,7 @@ from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream from dimos.web.robot_web_interface import RobotWebInterface from dimos.utils.logging_config import logger -from dimos.perception.manip_aio_pipeline import ManipulationPipeline +from dimos.manipulation.manip_aio_pipeline import ManipulationPipeline def monitor_grasps(pipeline): diff 
--git a/tests/test_manipulation_perception_pipeline.py.py b/tests/test_manipulation_perception_pipeline.py.py new file mode 100644 index 0000000000..227f991650 --- /dev/null +++ b/tests/test_manipulation_perception_pipeline.py.py @@ -0,0 +1,167 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import threading +from reactivex import operators as ops + +import tests.test_header + +from pyzed import sl +from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream +from dimos.web.robot_web_interface import RobotWebInterface +from dimos.utils.logging_config import logger +from dimos.manipulation.manip_aio_pipeline import ManipulationPipeline + + +def monitor_grasps(pipeline): + """Monitor and print grasp updates in a separate thread.""" + print(" Grasp monitor started...") + + last_grasp_count = 0 + last_update_time = time.time() + + while True: + try: + # Get latest grasps using the getter function + grasps = pipeline.get_latest_grasps(timeout=0.5) + current_time = time.time() + + if grasps is not None: + current_count = len(grasps) + if current_count != last_grasp_count: + print(f" Grasps received: {current_count} (at {time.strftime('%H:%M:%S')})") + if current_count > 0: + best_score = max(grasp.get("score", 0.0) for grasp in grasps) + print(f" Best grasp score: {best_score:.3f}") + last_grasp_count = current_count + last_update_time = current_time + else: + # Show periodic "still waiting" message + if current_time - last_update_time > 10.0: + print(f" Still waiting for grasps... 
({time.strftime('%H:%M:%S')})") + last_update_time = current_time + + time.sleep(1.0) # Check every second + + except Exception as e: + print(f" Error in grasp monitor: {e}") + time.sleep(2.0) + + +def main(): + """Test point cloud filtering with grasp generation using ManipulationPipeline.""" + print(" Testing point cloud filtering + grasp generation with ManipulationPipeline...") + + # Configuration + min_confidence = 0.6 + web_port = 5555 + grasp_server_url = "ws://18.224.39.74:8000/ws/grasp" + + try: + # Initialize ZED camera stream + zed_stream = ZEDCameraStream(resolution=sl.RESOLUTION.HD1080, fps=10) + + # Get camera intrinsics + camera_intrinsics_dict = zed_stream.get_camera_info() + camera_intrinsics = [ + camera_intrinsics_dict["fx"], + camera_intrinsics_dict["fy"], + camera_intrinsics_dict["cx"], + camera_intrinsics_dict["cy"], + ] + + # Create the concurrent manipulation pipeline WITH grasp generation + pipeline = ManipulationPipeline( + camera_intrinsics=camera_intrinsics, + min_confidence=min_confidence, + max_objects=10, + grasp_server_url=grasp_server_url, + enable_grasp_generation=True, # Enable grasp generation + ) + + # Create ZED stream + zed_frame_stream = zed_stream.create_stream().pipe(ops.share()) + + # Create concurrent processing streams + streams = pipeline.create_streams(zed_frame_stream) + detection_viz_stream = streams["detection_viz"] + pointcloud_viz_stream = streams["pointcloud_viz"] + grasps_stream = streams.get("grasps") # Get grasp stream if available + grasp_overlay_stream = streams.get("grasp_overlay") # Get grasp overlay stream if available + + except ImportError: + print("Error: ZED SDK not installed. Please install pyzed package.") + sys.exit(1) + except RuntimeError as e: + print(f"Error: Failed to open ZED camera: {e}") + sys.exit(1) + + try: + # Set up web interface with concurrent visualization streams + print("Initializing web interface...") + web_interface = RobotWebInterface( + port=web_port, + object_detection=detection_viz_stream, + pointcloud_stream=pointcloud_viz_stream, + grasp_overlay_stream=grasp_overlay_stream, + ) + + # Start grasp monitoring in background thread + grasp_monitor_thread = threading.Thread( + target=monitor_grasps, args=(pipeline,), daemon=True + ) + grasp_monitor_thread.start() + + print(f"\n Point Cloud + Grasp Generation Test Running:") + print(f" Web Interface: http://localhost:{web_port}") + print(f" Object Detection View: RGB with bounding boxes") + print(f" Point Cloud View: Depth with colored point clouds and 3D bounding boxes") + print(f" Confidence threshold: {min_confidence}") + print(f" Grasp server: {grasp_server_url}") + print(f" Available streams: {list(streams.keys())}") + print("\nPress Ctrl+C to stop the test\n") + + # Start web server (blocking call) + web_interface.run() + + except KeyboardInterrupt: + print("\nTest interrupted by user") + except Exception as e: + print(f"Error during test: {e}") + finally: + print("Cleaning up resources...") + if "zed_stream" in locals(): + zed_stream.cleanup() + if "pipeline" in locals(): + pipeline.cleanup() + print("Test completed") + + +if __name__ == "__main__": + main() diff --git a/tests/test_manipulation_pipeline_single_frame.py b/tests/test_manipulation_pipeline_single_frame.py index 061eb9035e..629ba4dbee 100644 --- a/tests/test_manipulation_pipeline_single_frame.py +++ b/tests/test_manipulation_pipeline_single_frame.py @@ -34,7 +34,7 @@ import open3d as o3d from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, 
visualize_voxel_grid -from dimos.perception.manip_aio_processer import ManipulationProcessor +from dimos.manipulation.manip_aio_processer import ManipulationProcessor from dimos.perception.pointcloud.utils import ( load_camera_matrix_from_yaml, visualize_pcd, diff --git a/tests/test_manipulation_pipeline_single_frame_lcm.py b/tests/test_manipulation_pipeline_single_frame_lcm.py index 635f82c9c9..7b57887ddc 100644 --- a/tests/test_manipulation_pipeline_single_frame_lcm.py +++ b/tests/test_manipulation_pipeline_single_frame_lcm.py @@ -42,7 +42,7 @@ from lcm_msgs.sensor_msgs import CameraInfo as LCMCameraInfo from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid -from dimos.perception.manip_aio_processer import ManipulationProcessor +from dimos.manipulation.manip_aio_processer import ManipulationProcessor from dimos.perception.grasp_generation.utils import visualize_grasps_3d from dimos.perception.pointcloud.utils import visualize_pcd from dimos.utils.logging_config import setup_logger diff --git a/tests/test_pick_and_place_module.py b/tests/test_pick_and_place_module.py new file mode 100644 index 0000000000..6a8470863e --- /dev/null +++ b/tests/test_pick_and_place_module.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run script for Piper Arm robot with pick and place functionality. +Subscribes to visualization images and handles mouse/keyboard input. 
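+
+Typical invocation, assuming a connected Piper arm and the ZED SDK installed:
+
+    python tests/test_pick_and_place_module.py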
+""" + +import cv2 +import sys +import asyncio +import threading +import time +import numpy as np +from typing import Optional + +try: + import pyzed.sl as sl +except ImportError: + print("Error: ZED SDK not installed.") + sys.exit(1) + +from dimos.robot.agilex.piper_arm import PiperArmRobot +from dimos.utils.logging_config import setup_logger + +# Import LCM message types +from dimos_lcm.sensor_msgs import Image +from dimos.protocol.pubsub.lcmpubsub import LCM, Topic + +logger = setup_logger("dimos.tests.test_pick_and_place_module") + +# Global for mouse events +mouse_click = None +camera_mouse_click = None +current_window = None +pick_location = None # Store pick location +place_location = None # Store place location +place_mode = False # Track if we're in place selection mode + + +def mouse_callback(event, x, y, _flags, param): + global mouse_click, camera_mouse_click + window_name = param + if event == cv2.EVENT_LBUTTONDOWN: + if window_name == "Camera Feed": + camera_mouse_click = (x, y) + else: + mouse_click = (x, y) + + +class VisualizationNode: + """Node that subscribes to visualization images and handles user input.""" + + def __init__(self, robot: PiperArmRobot): + self.lcm = LCM() + self.latest_viz = None + self.latest_camera = None + self._running = False + self.robot = robot + + # Subscribe to visualization topic + self.viz_topic = Topic("/manipulation/viz", Image) + self.camera_topic = Topic("/zed/color_image", Image) + + def start(self): + """Start the visualization node.""" + self._running = True + self.lcm.start() + + # Subscribe to visualization topic + self.lcm.subscribe(self.viz_topic, self._on_viz_image) + # Subscribe to camera topic for point selection + self.lcm.subscribe(self.camera_topic, self._on_camera_image) + + logger.info("Visualization node started") + + def stop(self): + """Stop the visualization node.""" + self._running = False + cv2.destroyAllWindows() + + def _on_viz_image(self, msg: Image, topic: str): + """Handle visualization image messages.""" + try: + # Convert LCM message to numpy array + data = np.frombuffer(msg.data, dtype=np.uint8) + if msg.encoding == "rgb8": + image = data.reshape((msg.height, msg.width, 3)) + # Convert RGB to BGR for OpenCV + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + self.latest_viz = image + except Exception as e: + logger.error(f"Error processing viz image: {e}") + + def _on_camera_image(self, msg: Image, topic: str): + """Handle camera image messages.""" + try: + # Convert LCM message to numpy array + data = np.frombuffer(msg.data, dtype=np.uint8) + if msg.encoding == "rgb8": + image = data.reshape((msg.height, msg.width, 3)) + # Convert RGB to BGR for OpenCV + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + self.latest_camera = image + except Exception as e: + logger.error(f"Error processing camera image: {e}") + + def run_visualization(self): + """Run the visualization loop with user interaction.""" + global mouse_click, camera_mouse_click, pick_location, place_location, place_mode + + # Setup windows + cv2.namedWindow("Pick and Place") + cv2.setMouseCallback("Pick and Place", mouse_callback, "Pick and Place") + + cv2.namedWindow("Camera Feed") + cv2.setMouseCallback("Camera Feed", mouse_callback, "Camera Feed") + + print("=== Piper Arm Robot - Pick and Place ===") + print("Control mode: Module-based with LCM communication") + print("\nPICK AND PLACE WORKFLOW:") + print("1. Click on an object to select PICK location") + print("2. Click again to select PLACE location (auto pick & place)") + print("3. 
OR press 'p' after first click for pick-only task") + print("\nCONTROLS:") + print(" 'p' - Execute pick-only task (after selecting pick location)") + print(" 'r' - Reset everything") + print(" 'q' - Quit") + print(" 's' - SOFT STOP (emergency stop)") + print(" 'g' - RELEASE GRIPPER (open gripper)") + print(" 'SPACE' - EXECUTE target pose (manual override)") + print("\nNOTE: Click on objects in the Camera Feed window!") + + while self._running: + # Show camera feed with status overlay + if self.latest_camera is not None: + display_image = self.latest_camera.copy() + + # Add status text + status_text = "" + if pick_location is None: + status_text = "Click to select PICK location" + color = (0, 255, 0) + elif place_location is None: + status_text = "Click to select PLACE location (or press 'p' for pick-only)" + color = (0, 255, 255) + else: + status_text = "Executing pick and place..." + color = (255, 0, 255) + + cv2.putText( + display_image, status_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2 + ) + + # Draw pick location marker if set + if pick_location is not None: + # Simple circle marker + cv2.circle(display_image, pick_location, 10, (0, 255, 0), 2) + cv2.circle(display_image, pick_location, 2, (0, 255, 0), -1) + + # Simple label + cv2.putText( + display_image, + "PICK", + (pick_location[0] + 15, pick_location[1] + 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 0), + 2, + ) + + # Draw place location marker if set + if place_location is not None: + # Simple circle marker + cv2.circle(display_image, place_location, 10, (0, 255, 255), 2) + cv2.circle(display_image, place_location, 2, (0, 255, 255), -1) + + # Simple label + cv2.putText( + display_image, + "PLACE", + (place_location[0] + 15, place_location[1] + 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + (0, 255, 255), + 2, + ) + + # Draw simple arrow between pick and place + if pick_location is not None: + cv2.arrowedLine( + display_image, + pick_location, + place_location, + (255, 255, 0), + 2, + tipLength=0.05, + ) + + cv2.imshow("Camera Feed", display_image) + + # Show visualization if available + if self.latest_viz is not None: + cv2.imshow("Pick and Place", self.latest_viz) + + # Handle keyboard input + key = cv2.waitKey(1) & 0xFF + if key != 255: # Key was pressed + if key == ord("q"): + logger.info("Quit requested") + self._running = False + break + elif key == ord("r"): + # Reset everything + pick_location = None + place_location = None + place_mode = False + logger.info("Reset pick and place selections") + # Also send reset to robot + action = self.robot.handle_keyboard_command("r") + if action: + logger.info(f"Action: {action}") + elif key == ord("p"): + # Execute pick-only task if pick location is set + if pick_location is not None: + logger.info(f"Executing pick-only task at {pick_location}") + result = self.robot.pick_and_place( + pick_location[0], + pick_location[1], + None, # No place location + None, + ) + logger.info(f"Pick task started: {result}") + # Clear selection after sending + pick_location = None + place_location = None + else: + logger.warning("Please select a pick location first!") + else: + # Send keyboard command to robot + if key in [82, 84]: # Arrow keys + action = self.robot.handle_keyboard_command(str(key)) + else: + action = self.robot.handle_keyboard_command(chr(key)) + if action: + logger.info(f"Action: {action}") + + # Handle mouse clicks + if camera_mouse_click: + x, y = camera_mouse_click + + if pick_location is None: + # First click - set pick location + pick_location = (x, y) + 
logger.info(f"Pick location set at ({x}, {y})") + elif place_location is None: + # Second click - set place location and execute + place_location = (x, y) + logger.info(f"Place location set at ({x}, {y})") + logger.info(f"Executing pick at {pick_location} and place at ({x}, {y})") + + # Start pick and place task with both locations + result = self.robot.pick_and_place(pick_location[0], pick_location[1], x, y) + logger.info(f"Pick and place task started: {result}") + + # Clear all points after sending mission + pick_location = None + place_location = None + + camera_mouse_click = None + + # Handle mouse click from Pick and Place window (if viz is running) + elif mouse_click and self.latest_viz is not None: + # Similar logic for viz window clicks + x, y = mouse_click + + if pick_location is None: + # First click - set pick location + pick_location = (x, y) + logger.info(f"Pick location set at ({x}, {y}) from viz window") + elif place_location is None: + # Second click - set place location and execute + place_location = (x, y) + logger.info(f"Place location set at ({x}, {y}) from viz window") + logger.info(f"Executing pick at {pick_location} and place at ({x}, {y})") + + # Start pick and place task with both locations + result = self.robot.pick_and_place(pick_location[0], pick_location[1], x, y) + logger.info(f"Pick and place task started: {result}") + + # Clear all points after sending mission + pick_location = None + place_location = None + + mouse_click = None + + time.sleep(0.03) # ~30 FPS + + +async def run_piper_arm_with_viz(): + """Run the Piper Arm robot with visualization.""" + logger.info("Starting Piper Arm Robot") + + # Create robot instance + robot = PiperArmRobot() + + try: + # Start the robot + await robot.start() + + # Give modules time to fully initialize + await asyncio.sleep(2) + + # Create and start visualization node + viz_node = VisualizationNode(robot) + viz_node.start() + + # Run visualization in separate thread + viz_thread = threading.Thread(target=viz_node.run_visualization, daemon=True) + viz_thread.start() + + # Keep running until visualization stops + while viz_node._running: + await asyncio.sleep(0.1) + + # Stop visualization + viz_node.stop() + + except Exception as e: + logger.error(f"Error running robot: {e}") + import traceback + + traceback.print_exc() + + finally: + # Clean up + robot.stop() + logger.info("Robot stopped") + + +if __name__ == "__main__": + # Run the robot + asyncio.run(run_piper_arm_with_viz()) diff --git a/tests/test_pick_and_place_skill.py b/tests/test_pick_and_place_skill.py new file mode 100644 index 0000000000..40cf2c23b0 --- /dev/null +++ b/tests/test_pick_and_place_skill.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run script for Piper Arm robot with pick and place functionality. +Uses hardcoded points and the PickAndPlace skill. 
+""" + +import sys +import asyncio + +try: + import pyzed.sl as sl # Required for ZED camera +except ImportError: + print("Error: ZED SDK not installed.") + sys.exit(1) + +from dimos.robot.agilex.piper_arm import PiperArmRobot +from dimos.skills.manipulation.pick_and_place import PickAndPlace +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("dimos.robot.agilex.run_robot") + + +async def run_piper_arm(): + """Run the Piper Arm robot with pick and place skill.""" + logger.info("Starting Piper Arm Robot") + + # Create robot instance + robot = PiperArmRobot() + + try: + # Start the robot + await robot.start() + + # Give modules time to fully initialize + await asyncio.sleep(3) + + # Add the PickAndPlace skill to the robot's skill library + robot.skill_library.add(PickAndPlace) + + logger.info("Robot initialized successfully") + print("\n=== Piper Arm Robot - Pick and Place Demo ===") + print("This demo uses hardcoded pick and place points.") + print("\nCommands:") + print(" 1. Run pick and place with hardcoded points") + print(" 2. Run pick-only with hardcoded point") + print(" r. Reset robot to idle") + print(" q. Quit") + print("") + + running = True + while running: + try: + # Get user input + command = input("\nEnter command: ").strip().lower() + + if command == "q": + logger.info("Quit requested") + running = False + break + + elif command == "r" or command == "s": + logger.info("Resetting robot") + robot.handle_keyboard_command(command) + + elif command == "1": + # Hardcoded pick and place points + # These should be adjusted based on your camera view + print("\nExecuting pick and place with hardcoded points...") + + # Create and execute the skill + skill = PickAndPlace( + robot=robot, + object_query="labubu doll", # Will use visual detection + target_query="on the keyboard", # Will use visual detection + ) + + result = skill() + + if result["success"]: + print(f"✓ {result['message']}") + else: + print(f"✗ Failed: {result.get('error', 'Unknown error')}") + + elif command == "2": + # Pick-only with hardcoded point + print("\nExecuting pick-only with hardcoded point...") + + # Create and execute the skill for pick-only + skill = PickAndPlace( + robot=robot, + object_query="labubu doll", # Will use visual detection + target_query=None, # No place target - pick only + ) + + result = skill() + + if result["success"]: + print(f"✓ {result['message']}") + else: + print(f"✗ Failed: {result.get('error', 'Unknown error')}") + + else: + print("Invalid command. 
Please try again.") + + # Small delay to prevent CPU spinning + await asyncio.sleep(0.1) + + except KeyboardInterrupt: + logger.info("Keyboard interrupt received") + running = False + break + except Exception as e: + logger.error(f"Error in command loop: {e}") + print(f"Error: {e}") + + except Exception as e: + logger.error(f"Error running robot: {e}") + import traceback + + traceback.print_exc() + + finally: + # Clean up + logger.info("Shutting down robot...") + await robot.stop() + logger.info("Robot stopped") + + +def main(): + """Main entry point.""" + print("Starting Piper Arm Robot...") + print("Note: The robot will use Qwen VLM to identify objects and locations") + print("based on the queries specified in the code.") + + # Run the robot + asyncio.run(run_piper_arm()) + + +if __name__ == "__main__": + main() diff --git a/tests/test_pointcloud_filtering.py b/tests/test_pointcloud_filtering.py index 308b4fc6ac..57a1cb5b00 100644 --- a/tests/test_pointcloud_filtering.py +++ b/tests/test_pointcloud_filtering.py @@ -23,7 +23,7 @@ from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream from dimos.web.robot_web_interface import RobotWebInterface from dimos.utils.logging_config import logger -from dimos.perception.manip_aio_pipeline import ManipulationPipeline +from dimos.manipulation.manip_aio_pipeline import ManipulationPipeline def main(): diff --git a/tests/test_zed_module.py b/tests/test_zed_module.py new file mode 100644 index 0000000000..fbc99a54a4 --- /dev/null +++ b/tests/test_zed_module.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Test script for ZED Module with LCM visualization.""" + +import asyncio +import threading +import time +from typing import Optional +import numpy as np +import cv2 + +from dimos import core +from dimos.hardware.zed_camera import ZEDModule +from dimos.protocol import pubsub +from dimos.utils.logging_config import setup_logger +from dimos.perception.common.utils import colorize_depth + +# Import LCM message types +from dimos_lcm.sensor_msgs import Image as LCMImage +from dimos_lcm.sensor_msgs import CameraInfo +from dimos_lcm.geometry_msgs import PoseStamped +from dimos.protocol.pubsub.lcmpubsub import LCM, Topic + +logger = setup_logger("test_zed_module") + + +class ZEDVisualizationNode: + """Node that subscribes to ZED topics and visualizes the data.""" + + def __init__(self): + self.lcm = LCM() + self.latest_color = None + self.latest_depth = None + self.latest_pose = None + self.camera_info = None + self._running = False + + # Subscribe to topics + self.color_topic = Topic("/zed/color_image", LCMImage) + self.depth_topic = Topic("/zed/depth_image", LCMImage) + self.camera_info_topic = Topic("/zed/camera_info", CameraInfo) + self.pose_topic = Topic("/zed/pose", PoseStamped) + + def start(self): + """Start the visualization node.""" + self._running = True + self.lcm.start() + + # Subscribe to topics + self.lcm.subscribe(self.color_topic, self._on_color_image) + self.lcm.subscribe(self.depth_topic, self._on_depth_image) + self.lcm.subscribe(self.camera_info_topic, self._on_camera_info) + self.lcm.subscribe(self.pose_topic, self._on_pose) + + logger.info("Visualization node started, subscribed to ZED topics") + + def stop(self): + """Stop the visualization node.""" + self._running = False + cv2.destroyAllWindows() + + def _on_color_image(self, msg: LCMImage, topic: str): + """Handle color image messages.""" + try: + # Convert LCM message to numpy array + data = np.frombuffer(msg.data, dtype=np.uint8) + + if msg.encoding == "rgb8": + image = data.reshape((msg.height, msg.width, 3)) + # Convert RGB to BGR for OpenCV + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + elif msg.encoding == "mono8": + image = data.reshape((msg.height, msg.width)) + else: + logger.warning(f"Unsupported encoding: {msg.encoding}") + return + + self.latest_color = image + logger.debug(f"Received color image: {msg.width}x{msg.height}") + + except Exception as e: + logger.error(f"Error processing color image: {e}") + + def _on_depth_image(self, msg: LCMImage, topic: str): + """Handle depth image messages.""" + try: + # Convert LCM message to numpy array + if msg.encoding == "32FC1": + data = np.frombuffer(msg.data, dtype=np.float32) + depth = data.reshape((msg.height, msg.width)) + else: + logger.warning(f"Unsupported depth encoding: {msg.encoding}") + return + + self.latest_depth = depth + logger.debug(f"Received depth image: {msg.width}x{msg.height}") + + except Exception as e: + logger.error(f"Error processing depth image: {e}") + + def _on_camera_info(self, msg: CameraInfo, topic: str): + """Handle camera info messages.""" + self.camera_info = msg + logger.info( + f"Received camera info: {msg.width}x{msg.height}, distortion model: {msg.distortion_model}" + ) + + def _on_pose(self, msg: PoseStamped, topic: str): + """Handle pose messages.""" + self.latest_pose = msg + pos = msg.pose.position + ori = msg.pose.orientation + logger.debug( + f"Pose: pos=({pos.x:.2f}, {pos.y:.2f}, {pos.z:.2f}), " + + f"ori=({ori.x:.2f}, {ori.y:.2f}, {ori.z:.2f}, {ori.w:.2f})" + ) + + def visualize(self): + """Run visualization 
loop.""" + while self._running: + # Create visualization + vis_images = [] + + # Color image + if self.latest_color is not None: + color_vis = self.latest_color.copy() + + # Add pose text if available + if self.latest_pose is not None: + pos = self.latest_pose.pose.position + text = f"Pose: ({pos.x:.2f}, {pos.y:.2f}, {pos.z:.2f})" + cv2.putText( + color_vis, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2 + ) + + vis_images.append(("ZED Color", color_vis)) + + # Depth image + if self.latest_depth is not None: + depth_colorized = colorize_depth(self.latest_depth, max_depth=5.0) + if depth_colorized is not None: + # Convert RGB to BGR for OpenCV + depth_colorized = cv2.cvtColor(depth_colorized, cv2.COLOR_RGB2BGR) + + # Add depth stats + valid_mask = np.isfinite(self.latest_depth) & (self.latest_depth > 0) + if np.any(valid_mask): + min_depth = np.min(self.latest_depth[valid_mask]) + max_depth = np.max(self.latest_depth[valid_mask]) + mean_depth = np.mean(self.latest_depth[valid_mask]) + + text = f"Depth: min={min_depth:.2f}m, max={max_depth:.2f}m, mean={mean_depth:.2f}m" + cv2.putText( + depth_colorized, + text, + (10, 30), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) + + vis_images.append(("ZED Depth", depth_colorized)) + + # Show windows + for name, image in vis_images: + cv2.imshow(name, image) + + # Handle key press + key = cv2.waitKey(1) & 0xFF + if key == ord("q"): + logger.info("Quit requested") + self._running = False + break + elif key == ord("s"): + # Save images + if self.latest_color is not None: + cv2.imwrite("zed_color.png", self.latest_color) + logger.info("Saved color image to zed_color.png") + if self.latest_depth is not None: + np.save("zed_depth.npy", self.latest_depth) + logger.info("Saved depth data to zed_depth.npy") + + time.sleep(0.03) # ~30 FPS + + +async def test_zed_module(): + """Test the ZED Module with visualization.""" + logger.info("Starting ZED Module test") + + # Start Dask + dimos = core.start(1) + + # Enable LCM auto-configuration + pubsub.lcm.autoconf() + + try: + # Deploy ZED module + logger.info("Deploying ZED module...") + zed = dimos.deploy( + ZEDModule, + camera_id=0, + resolution="HD720", + depth_mode="NEURAL", + fps=30, + enable_tracking=True, + publish_rate=10.0, # 10 Hz for testing + frame_id="zed_camera", + ) + + # Configure LCM transports + zed.color_image.transport = core.LCMTransport("/zed/color_image", LCMImage) + zed.depth_image.transport = core.LCMTransport("/zed/depth_image", LCMImage) + zed.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo) + zed.pose.transport = core.LCMTransport("/zed/pose", PoseStamped) + + # Print module info + logger.info("ZED Module configured:") + print(zed.io().result()) + + # Start ZED module + logger.info("Starting ZED module...") + zed.start() + + # Give module time to initialize + await asyncio.sleep(2) + + # Create and start visualization node + viz_node = ZEDVisualizationNode() + viz_node.start() + + # Run visualization in separate thread + viz_thread = threading.Thread(target=viz_node.visualize, daemon=True) + viz_thread.start() + + logger.info("ZED Module running. 
Press 'q' in image window to quit, 's' to save images.") + + # Keep running until visualization stops + while viz_node._running: + await asyncio.sleep(0.1) + + # Stop ZED module + logger.info("Stopping ZED module...") + zed.stop() + + # Stop visualization + viz_node.stop() + + except Exception as e: + logger.error(f"Error in test: {e}") + import traceback + + traceback.print_exc() + + finally: + # Clean up + dimos.close() + logger.info("Test completed") + + +if __name__ == "__main__": + # Run the test + asyncio.run(test_zed_module())